In [None]:
# NOTE(review): two matplotlib backends are requested back to back; presumably
# only the later one ends up active - confirm which backend is intended.
%matplotlib inline
%matplotlib notebook

In [None]:
import scipy.stats as st
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import re
import hvplot.pandas
import csv

In [None]:
# Locations of the two CSV inputs, relative to the notebook's working directory.
# Folder and file name are joined with Path's "/" operator rather than a
# hand-written backslash, so the string literals contain no backslash escape
# sequences and the paths resolve on any operating system.
abs_file = Path("Group Project Resources") / "Average weekly ordinary time earnings, full-time adults by state, original.csv"

vehicle_file = Path("Group Project Resources") / "Australian Vehicle Prices.csv"


# Convert the SQL-like dump of Australian postcodes into a plain CSV file.
# The dump is read from the "Group Project Resources" folder relative to the
# working directory (the same folder the other inputs are read from) instead
# of an absolute path that only exists on one machine.
input_file_path = Path("Group Project Resources") / "australian-postcodes.csv"

output_file_path = 'postcodes_geo.csv'

# Regular expression pattern to extract values from INSERT INTO statements:
# three quoted fields followed by two unquoted fields inside one parenthesised tuple
pattern = r"\('(.*?)', '(.*?)', '(.*?)', (.*?), (.*?)\)"

# List to store extracted data (one tuple of five strings per matched row)
data = []

# Read the SQL-like file and extract data. Iterating over the handle streams
# the file line by line; findall returns an empty list when nothing matches,
# so extending unconditionally is safe.
with open(input_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        data.extend(re.findall(pattern, line))

# Write extracted data to a CSV file
with open(output_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['postcode', 'suburb', 'state', 'latitude', 'longitude'])  # Write header
    writer.writerows(data)  # Write the extracted data

print(f"CSV file '{output_file_path}' has been created.")

In [None]:
# Load the three inputs. The first line of the earnings file is skipped before
# the header is read, and the resulting index is turned into a regular column.
abs_df = pd.read_csv(abs_file, skiprows=1).reset_index()

vehicle_df = pd.read_csv(vehicle_file)

# Postcode / coordinate table stored as "postcodes_geo.csv" in the working directory
postcode_file = Path("postcodes_geo.csv")
postcode_df = pd.read_csv(postcode_file)

In [None]:
# Discard every vehicle listing that has a missing value in any column
vehicle_df = vehicle_df.dropna(axis="index", how="any")

Get the average weekly income of each state in Australia

In [None]:
# Tidy the earnings table: name the label column, remove the source footnote
# row, and convert the dollar columns from comma-formatted text to floats.
abs_df = abs_df.rename(columns={"Unnamed: 0": "State"})

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below does not raise pandas' SettingWithCopyWarning.
abs_df = abs_df[abs_df['State'] != "Source: Australian Bureau of Statistics, Average Weekly Earnings, Australia May 2023"].copy()

columns_to_convert = ['Persons ($)', 'Males ($)', 'Females ($)']
# Strip the thousands separators before casting to float
abs_df[columns_to_convert] = abs_df[columns_to_convert].apply(lambda x: x.str.replace(',', '').astype(float))



In [None]:
# Show the dtype of every column of the earnings table
print(abs_df.dtypes)

In [None]:
# Keep everything except the male / female breakdown columns
average_abs = abs_df.drop(["Males ($)", "Females ($)"], axis="columns")

In [None]:
# Use the state labels as the index
average_abs = average_abs.set_index("State")

In [None]:
# Normalise state labels to upper-case abbreviations without a trailing period.
# "Qld" is mapped alongside the original two entries: rename() ignores keys that
# are absent from the index, so the extra entry is harmless if the label never
# occurs.
# NOTE(review): confirm the labels used in the earnings file and in the vehicle
# data's State column so that every state survives the later merge on "State".
new_states = {"Tas.": "TAS", "Vic.": "VIC", "Qld": "QLD"}
average_abs = average_abs.rename(index=new_states)
average_abs

In [None]:
# Move the state labels back out of the index into a regular "State" column
average_abs = average_abs.reset_index()

In [None]:
# Bar chart of the "Persons ($)" column for each state label
fig, ax = plt.subplots()
ax.bar(average_abs['State'], average_abs['Persons ($)'])
ax.set_title("Average Weekly Income by State")
ax.set_ylabel("Weekly income ($)")
ax.set_xlabel("State")
plt.show()

The first graph above (figure 1.0) shows the average weekly income by state from the Australian Bureau of Statistics (ABS). The data displays each state of Australia and its average weekly income. Western Australia (WA) and the Australian Capital Territory (ACT) have the highest average weekly incomes of all the Australian states. Although the ACT has one of the higher average weekly incomes, this could be a result of the lower population density and high-value occupations in the ACT. Western Australia has the highest average weekly income at $2039.3; this high figure may be driven by the high incomes of Western Australian farmers. However, this is only speculation about the data, and further research and data analysis would be needed to support it.

The rest of the states are relatively similar to one another, with Tasmania (TAS) and South Australia (SA) having the lowest average weekly incomes compared to the other states. With an average weekly income of $1619.3, Tasmania has the lowest average income and a smaller population than Australia as a whole (Aust.). Therefore, the result that Tasmania has the lowest average income makes sense.

The "average weekly income by state" provides an indicator of whether weekly income has any correlation with the price of vehicles by state. Later in this question, a bar graph showing the vehicle price by state is displayed to determine whether there is any correlation or pattern in the dataset.



Finding average vehicle price by State

In [None]:
# Split "Location" at the first ", " into two parts
new = vehicle_df['Location'].str.split(", ", n=1, expand=True)

# The part before the separator becomes "Suburb" and the remainder becomes
# "State"; the original "Location" column is dropped afterwards
vehicle_df = (
    vehicle_df
    .assign(Suburb=new[0], State=new[1])
    .drop(columns=["Location"])
)


In [None]:
# Drop the listed attribute columns from the vehicle table
vehicle_df = vehicle_df.drop(
    columns=[
        "UsedOrNew",
        "Transmission",
        "Engine",
        "DriveType",
        "FuelType",
        "ColourExtInt",
        "CylindersinEngine",
        "BodyType",
        "FuelConsumption",
        "Doors",
        "Seats",
        "Kilometres",
    ]
)

In [None]:
# Print the column / dtype / non-null summary. info is a method: without the
# call parentheses the cell only shows the bound-method repr, not the summary.
vehicle_df.info()

In [None]:
# Parse prices as numbers; any value that cannot be parsed becomes NaN
vehicle_df = vehicle_df.assign(Price=pd.to_numeric(vehicle_df['Price'], errors='coerce'))

In [None]:
# Drop listings whose price is missing (NaN), then store prices as integers.
# Replacing missing prices with 0 would count those listings as $0 vehicles and
# pull every mean / median computed from "Price" downwards.
vehicle_df = vehicle_df.dropna(subset=['Price']).astype({'Price': int})

In [None]:
# Per-state summary of Price: mean, median, variance, standard deviation and
# standard error of the mean
aus_vehicle_agg = vehicle_df.groupby("State")['Price'].agg(["mean", "median", "var","std","sem"])

In [None]:
# Turn the "State" group labels from the index into a regular column
aus_vehicle_agg = aus_vehicle_agg.reset_index()

In [None]:
# Combine the per-state price statistics with the earnings table on "State".
# No "how" is given, so the merge default applies.
merged_df = aus_vehicle_agg.merge(average_abs, on="State")
merged_df

In [None]:
# Rename the mean-price column to "Average"
merged_df= merged_df.rename(columns={"mean": "Average"})

In [None]:
# Bar chart of the "Average" vehicle price for each state; labels are set on
# the axes object returned by the plot call
average_income_vehicle_price_state = merged_df.plot.bar(x="State", y=["Average"], color='lightgreen')

average_income_vehicle_price_state.set_ylabel("Vehicle Price ($)")
average_income_vehicle_price_state.set_xlabel("State")
average_income_vehicle_price_state.set_title("Average Vehicle Price by State")

After finding the average weekly income by state, the next graph shows the average vehicle price by state (figure 1.1). The purpose of this graph is to find whether there is a pattern between the average weekly income and the average vehicle price. In this graph, Western Australia has one of the highest average vehicle prices, at $37,372.94. This result suggests that WA has both the highest average weekly income and one of the highest average vehicle prices. However, Tasmania also has a high average vehicle price, even though its average weekly income is lower than the Australian average (figure 1.0). The average weekly income for Tasmania is $1,619.3, while its average vehicle price is $37,521.62, which makes affording a vehicle in TAS more difficult than in the rest of the states.

South Australia has the lowest average vehicle price at $32,059.04, which is consistent with the state's low average weekly income of only $1677.8. The Australian Capital Territory, however, has a low average vehicle price of $32844.75 and a high average weekly income of $2,022.9. This could be due to the population density and the lack of expensive cars imported into the territory.

The average vehicle price by state is a coarser measure than identifying which Australian suburbs have the highest vehicle prices. The next graph shows the suburbs in which the highest vehicle prices in Australia are located.

Finding the top Vehicle Prices by Suburb

In [None]:
# Capitalise the suburb column name so it reads "Suburb", the key name used by
# the merges on suburb
postcode_df = postcode_df.rename(columns={"suburb": "Suburb"})
postcode_df

In [None]:
# One row per suburb name: the first occurrence is kept, later repeats are dropped.
# NOTE(review): rows are compared on the name alone, so suburbs that share a
# name but differ in state / postcode would collapse into a single row - verify
# this is acceptable for the coordinate lookup.
postcode_duplicates = postcode_df.drop_duplicates(subset=["Suburb"], keep="first")
postcode_duplicates

In [None]:
# Regroup the listings by suburb, renumbering each group's rows from 0.
# NOTE(review): grouped_suburb is reassigned further down before it is read, so
# this result looks unused - confirm whether the step is still needed.
grouped_suburb = vehicle_df.groupby(by='Suburb').apply(lambda suburb_rows: suburb_rows.reset_index(drop=True))

def calculate_numeric_price(group):
    """Return the mean of the numeric entries in ``group['Price']``.

    Values that cannot be parsed as numbers are ignored. The result is a
    one-element Series labelled ``'Numeric_Price_Mean'``; its value is NaN
    when no entry of the group is numeric.
    """
    prices = pd.to_numeric(group['Price'], errors='coerce').dropna()
    mean_price = np.nan if prices.empty else prices.mean()
    return pd.Series({'Numeric_Price_Mean': mean_price})

# Mean numeric price for every suburb, with "Suburb" as a regular column
numeric_price_stats = (
    vehicle_df
    .groupby('Suburb')
    .apply(calculate_numeric_price)
    .reset_index()
)

# Largest mean price first
numeric_price_stats_sorted = numeric_price_stats.sort_values(by='Numeric_Price_Mean', ascending=False)
print(numeric_price_stats_sorted)

In [None]:
# The "Suburb" column of the vehicle table, taken as a Series
grouped_suburb = vehicle_df["Suburb"]

grouped_suburb.describe()

In [None]:
# Attach the postcode / coordinate columns to each listing's suburb, then
# collapse identical rows
lon_lat_merged = pd.merge(grouped_suburb, postcode_duplicates, how="inner", on="Suburb").drop_duplicates()

# Number of distinct suburb names that found a match
unique_suburbs = lon_lat_merged['Suburb'].nunique()
unique_suburbs

In [None]:
# Add the postcode / coordinate columns to every listing whose suburb was matched
merged_coordinates_aus_vehicle = vehicle_df.merge(lon_lat_merged, on="Suburb", how="inner")

# Make sure Price is numeric (values that cannot be parsed become NaN)
merged_coordinates_aus_vehicle = merged_coordinates_aus_vehicle.assign(
    Price=pd.to_numeric(merged_coordinates_aus_vehicle['Price'], errors='coerce')
)

In [None]:
# Listings of the full vehicle table grouped by suburb
suburb_grouped = vehicle_df.groupby(["Suburb"])

# Per-suburb summary of Price for the listings that have coordinates attached:
# mean, median, variance, standard deviation and standard error of the mean
grouped_suburb = merged_coordinates_aus_vehicle.groupby(["Suburb"])
agg_price_loc = grouped_suburb['Price'].agg(["mean", "median", "var", "std", "sem"])

agg_price_loc

In [None]:
# The ten largest per-suburb mean prices, as a one-column frame
top_highest_price = agg_price_loc['mean'].nlargest(10).to_frame()
top_highest_price

In [None]:
# Listings (with coordinates) that belong to the top-priced suburbs
top_price_location = top_highest_price.merge(merged_coordinates_aus_vehicle, on="Suburb", how="inner")
top_price_location['Suburb'].unique()

In [None]:
# Plot every row of top_price_location as a point at its longitude / latitude,
# drawn over "OSM" map tiles in an 800 x 800 frame
top_price_location.hvplot.points("longitude", "latitude", geo=True, tiles = "OSM",frame_width = 800,frame_height = 800)

The map above shows the locations of the ten suburbs with the highest average vehicle prices. The data shows that many of the high vehicle prices are located in Sydney, Melbourne and Perth. The highest average vehicle price is in Rushcutters Bay in Sydney, NSW, with an average of $250,849.09. Geelong West has the second highest average vehicle price, with an average of $152,356.0, and Medindine has the third highest, with $127688.0. Surprisingly, the Perth suburb with the highest average vehicle price is Nedlands, with an average vehicle price of $123,200.00.

In [None]:
# Bar chart of the top suburbs' mean prices, with slanted tick labels
suburb_axes = top_highest_price.plot.bar(color="lightgreen", figsize=(8, 8))
plt.xticks(rotation=45)

suburb_axes.set_title("Top Suburb with the highest Average Vehicle Price")
suburb_axes.set_ylabel("Price")
suburb_axes.set_xlabel("Top Suburb")

plt.show()

The final graph shows the top 10 highest average vehicle prices by suburb in a bar chart, to display the difference in average price between the suburbs in the list.

In [None]:
# Move the index labels into a regular column so they can be selected by name
top_highest_price=top_highest_price.reset_index()
top_highest_price

In [None]:
# Each suburb's mean price expressed as a percentage of the sum of the listed means
top_highest_price['percentage'] = (
    top_highest_price['mean']
    .div(top_highest_price['mean'].sum())
    .mul(100)
)

# Create a pie chart on its own 8 x 8 figure
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(top_highest_price['percentage'], labels=top_highest_price['Suburb'], autopct='%1.1f%%', startangle=140)
ax.set_title('Pie Chart of mean percentage by Suburb')
plt.show()