In [206]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import numpy as np

In [207]:
# process the personal income by postcode data
def get_abs_income_data(filepath):
    """Read the ABS personal-income-by-postcode CSV export into a DataFrame.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Location of the CSV. Anything ``pandas.read_csv`` accepts works; the
        value is passed straight through.

    Returns
    -------
    pandas.DataFrame
        The table with the row at (0-based) position 6 used as column
        headers. ``read_csv`` ignores blank lines when counting header rows.
        No further cleaning is applied here.
    """
    # The previous version also sliced a year out of the file name and copied
    # the frame, but neither value was used; both were dropped.
    return pd.read_csv(filepath, header=6)

# Load the personal-income table and reduce it to one 0-1 income index per row.
pi_df = get_abs_income_data("../data/raw/ABS/2021-personal-income-by-postcode.csv")
# rename first column to "Postcode"
pi_df = pi_df.rename(columns={pi_df.columns[0]: "Postcode"})
# keep only rows whose label starts with 4 digits; na=False treats missing
# labels as "no match" so they cannot break the boolean mask, and .copy()
# makes the filtered frame independent before a column is assigned below
pi_df = pi_df[pi_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
pi_df["Postcode"] = pi_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# rename second column to $0-
pi_df = pi_df.rename(columns={pi_df.columns[1]: "$0-"})
ncols = len(pi_df.columns)
cols = pi_df.columns
# bracket columns: everything between the Postcode column and the last two columns
data = pi_df[cols[1:ncols-2]]
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
# the row total is recomputed from the bracket columns themselves
total = data.sum(axis=1)
# for each column in data, divide by total
frac = data.div(total, axis=0)
# income from header: the amount between "$" and the first following "-"
weekly_income = data.columns.str.extract(r"\$(.*?)-")[0]
# the last bracket is not taken from the regex; its value is forced to 3,500
weekly_income.iloc[-1] = '3,500'
# for all entries in income, replace commas with nothing
weekly_income = weekly_income.str.replace(',', '')
# convert income to number
weekly_income = pd.to_numeric(weekly_income)
# share-weighted mean per row as a single matrix-vector product
mean_weekly_income = frac.dot(weekly_income.to_numpy())
# normalize mean_weekly_income to the range 0-1
mean_weekly_income = (mean_weekly_income - mean_weekly_income.min()) / (mean_weekly_income.max() - mean_weekly_income.min())

# this cell creates the shared histogram figure that later cells add traces to
fig = go.Figure(data=[go.Histogram(x=mean_weekly_income, histnorm='probability', name='Personal Income')])
fig.show()

In [208]:
# process the education by postcode data
def get_education_data(filepath):
    """Load the education-by-postcode CSV found at ``filepath``.

    The row at 0-based position 6 supplies the column names; the resulting
    DataFrame is returned without further processing.
    """
    education_table = pd.read_csv(filepath, header=6)
    return education_table

# Load the education table and reduce it to one 0-1 education index per row.
ed_df = get_education_data("../data/raw/ABS/2021-education-by-postcode.csv")
# rename first column to "Postcode"
ed_df = ed_df.rename(columns={ed_df.columns[0]: "Postcode"})
# keep only rows whose label starts with 4 digits; na=False treats missing
# labels as "no match" so they cannot break the boolean mask, and .copy()
# makes the filtered frame independent before a column is assigned below
ed_df = ed_df[ed_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
ed_df["Postcode"] = ed_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# create a numeric value to represent each column
level = [8,7,6,5,4,3,2,1] # this weighting is probably a bit too heavy to the higher levels
cols = ed_df.columns
ncols = len(cols)
# category columns: everything between the Postcode column and the last two columns
data = ed_df[cols[1:ncols-2]]
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
total = data.sum(axis=1)
# for each column in data, divide by total
frac = data.div(total, axis=0)
# share-weighted mean level per row; raises if `level` and the number of
# category columns disagree
mean_education_level = frac.dot(np.asarray(level))
# normalize mean_education_level to the range 0-1
mean_education_level = (mean_education_level - mean_education_level.min()) / (mean_education_level.max() - mean_education_level.min())
# drop any earlier copy of this trace so re-running the cell does not stack duplicates
fig.data = [trace for trace in fig.data if trace.name != 'Education Level']
fig.add_trace(go.Histogram(x=mean_education_level, histnorm='probability', name='Education Level'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()


In [209]:
# process the age by postcode data
def get_age_data(filepath):
    """Load the age-by-postcode CSV found at ``filepath``.

    The row at 0-based position 6 supplies the column names; the resulting
    DataFrame is returned without further processing.
    """
    age_table = pd.read_csv(filepath, header=6)
    return age_table
# Load the age table and reduce it to one 0-1 age index per row.
age_df = get_age_data("../data/raw/ABS/2021-age-by-postcode.csv")
# rename first column to "Postcode"
age_df = age_df.rename(columns={age_df.columns[0]: "Postcode"})
# keep only rows whose label starts with 4 digits; na=False treats missing
# labels as "no match" so they cannot break the boolean mask, and .copy()
# makes the filtered frame independent before a column is assigned below
age_df = age_df[age_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
age_df["Postcode"] = age_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# pull out just the age groups into a new dataframe named data
cols = age_df.columns
ncols = len(cols)
data = age_df[cols[1:ncols-2]]
# create a numeric value to represent each column based on the midpoint of the age range
age_range = age_df.columns[1:ncols-2]
age_range = age_range.str.extract(r"(\d+)-(\d+)")
age_range = age_range.astype(float)
age_range = age_range.mean(axis=1)
# the last group is not taken from the regex; its value is forced to 100
age_range.iloc[-1] = 100
# NOTE(review): the integer cast truncates any fractional midpoint - confirm this is intended
age_range = age_range.astype(int)
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
total = data.sum(axis=1)
# for each column in data, divide by total
frac = data.div(total, axis=0)
# share-weighted mean age per row as a single matrix-vector product
mean_age = frac.dot(age_range.to_numpy())
# normalize mean_age to the range 0-1
mean_age = (mean_age - mean_age.min()) / (mean_age.max() - mean_age.min())
# drop any earlier copy of this trace so re-running the cell does not stack duplicates
fig.data = [trace for trace in fig.data if trace.name != 'Age']
fig.add_trace(go.Histogram(x=mean_age, histnorm='probability', name='Age'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

In [210]:
# process the distance to work by postcode data
def get_distance_to_work_data(filepath):
    """Load the distance-to-work-by-postcode CSV found at ``filepath``.

    The row at 0-based position 6 supplies the column names; the resulting
    DataFrame is returned without further processing.
    """
    distance_table = pd.read_csv(filepath, header=6)
    return distance_table
# Load the distance-to-work table and reduce it to one 0-1 distance index per row.
dtw_df = get_distance_to_work_data("../data/raw/ABS/2021-distance-to-work-by-postcode.csv")
# rename first column to "Postcode"
dtw_df = dtw_df.rename(columns={dtw_df.columns[0]: "Postcode"})
# keep only rows whose label starts with 4 digits; na=False treats missing
# labels as "no match" so they cannot break the boolean mask, and .copy()
# makes the filtered frame independent before a column is assigned below
dtw_df = dtw_df[dtw_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
dtw_df["Postcode"] = dtw_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# pull out just the distance to work groups into a new dataframe named data
cols = dtw_df.columns
ncols = len(cols)
data = dtw_df[cols[1:ncols-2]]
# create a numeric value to represent each column based on the midpoint of the distance range
distance = [0, 1.25, 6.25, 20.0, 40.0, 150.0, 300.0]
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
total = data.sum(axis=1)
# for each column in data, divide by total
frac = data.div(total, axis=0)
# share-weighted mean distance per row; raises if `distance` and the number of
# group columns disagree
mean_distance = frac.dot(np.asarray(distance))
# normalize mean_distance to the range 0-1
mean_distance = (mean_distance - mean_distance.min()) / (mean_distance.max() - mean_distance.min())
# drop any earlier copy of this trace so re-running the cell does not stack duplicates
fig.data = [trace for trace in fig.data if trace.name != 'Distance to Work']
fig.add_trace(go.Histogram(x=mean_distance, histnorm='probability', name='Distance to Work'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

In [235]:
# create a new dataframe which has postcode, mean_weekly_income, mean_education_level, mean_age, mean_distance
# Each index series still carries the row labels of the CSV it was computed
# from, so combining them directly would pair rows by file position. Key every
# series by its own table's postcode first so the columns line up on postcode.
# NOTE(review): assumes each postcode appears at most once per table - confirm.
def _keyed_by_postcode(values, source):
    """Return ``values`` re-labelled with the Postcode of the matching rows in ``source``."""
    return values.set_axis(source.loc[values.index, "Postcode"].to_numpy())

df = (
    pd.concat(
        {
            "Mean Weekly Income": _keyed_by_postcode(mean_weekly_income, pi_df),
            "Mean Education Level": _keyed_by_postcode(mean_education_level, ed_df),
            "Mean Age": _keyed_by_postcode(mean_age, age_df),
            "Mean Distance to Work": _keyed_by_postcode(mean_distance, dtw_df),
        },
        axis=1,
    )
    # turn the postcode labels back into an ordinary "Postcode" first column
    .rename_axis("Postcode")
    .reset_index()
)

def get_score(row):
    """Return the score for one row of the combined index table.

    ``row`` must provide the four normalised index values under the keys
    read below. The score currently in use is ``1 - distance to work``.
    """
    # all four values are read so alternative formulas can be swapped in
    inc, edu, age, dtw = (
        row[key]
        for key in (
            "Mean Weekly Income",
            "Mean Education Level",
            "Mean Age",
            "Mean Distance to Work",
        )
    )

    # alternatives that were tried: inc + edu + age + (1 - dtw), inc, edu, age
    score = 1 - dtw
    return score

# score every row with get_score and keep the result next to its inputs
df["Score"] = df.apply(get_score, axis="columns")

# histogram of the resulting scores
fig = go.Figure()
fig.add_trace(go.Histogram(x=df["Score"], histnorm='probability', name='Score'))
fig.show()

In [236]:
# Estimate the EV share of new passenger-vehicle sales per postcode from the
# stock table and plot it against the score computed above.
bitre_df = pd.read_csv('../data/processed/bitre_df.csv')
# Drop the label columns before aggregating so .sum() never has to add up
# string columns that are discarded straight afterwards.
S_all_pc = (
    bitre_df.drop(columns=["State", "Fuel Type"])
    .groupby(["Vehicle Type", "Postcode"])
    .sum()
)
S_all_nat = S_all_pc.groupby(["Vehicle Type"]).sum()
S_ev_pc = (
    bitre_df[bitre_df["Fuel Type"] == "Electric"]
    .drop(columns=["State", "Fuel Type"])
    .groupby(["Vehicle Type", "Postcode"])
    .sum()
)
S_ev_nat = S_ev_pc.groupby(["Vehicle Type"]).sum()
# Assume a fixed scrapping rate of 6% per year, such that if the stock goes down, then new sales were less than scrapping, or if stock goes up, then new sales were more than scrapping, so
# new sales = scrapping + change in stock
# new sales = 0.06 * stock + change in stock
# A scalar applies the same rate to every column, however many there are
# (the former 3-element list only worked with exactly three columns).
scrapping_rate = 0.06
new_sales_nat = S_all_nat.diff(axis=1) + scrapping_rate * S_all_nat.shift(periods=1, axis='columns')  # the shift is to reference the previous year stock
PC_ev_sales_nat = S_ev_nat.diff(axis=1) / new_sales_nat * 100
PC_passenger_ev_sales_nat = PC_ev_sales_nat.loc["Passenger"]
new_sales_pc = S_all_pc.diff(axis=1) + scrapping_rate * S_all_pc.shift(periods=1, axis='columns')
PC_ev_sales_pc = S_ev_pc.diff(axis=1) / new_sales_pc * 100

PC_passenger_ev_sales_pc = PC_ev_sales_pc.loc["Passenger"]
PC_passenger_ev_sales_pc = PC_passenger_ev_sales_pc.fillna(0)  # nans are caused by not reporting postcodes where ev stock is zero
# ignore when value is unreasonable high: anything above 100 % becomes NaN
PC_passenger_ev_sales_pc = PC_passenger_ev_sales_pc.mask(PC_passenger_ev_sales_pc > 100)

sales = PC_passenger_ev_sales_pc["2023-01-31"]
# `sales` is labelled by postcode while df carries postcodes as 4-digit
# strings in its "Postcode" column, so bring both onto the same string key.
# NOTE(review): assumes bitre_df stores postcodes as numbers or digit strings - confirm.
sales = sales.set_axis(
    sales.index.astype(str).str.replace(r"\.0$", "", regex=True).str.zfill(4)
)
score = df.set_index("Postcode")["Score"]

# find postcodes where both sales and score have an entry, in one shared order
common_postcodes = sales.index.intersection(score.index)
sales = sales.loc[common_postcodes]
score = score.loc[common_postcodes]
# plot sales vs score
fig = go.Figure(data=[go.Scatter(x=score, y=sales, mode='markers')])
fig.update_layout(xaxis_title='Score', yaxis_title='EV share of new passenger sales (%)')
fig.update_yaxes(range=[0, 20])
fig.show()
