In [169]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import numpy as np

In [170]:
# process the personal income by postcode data
def get_abs_income_data(filepath):
    """Load a personal-income-by-postcode CSV export into a DataFrame.

    The file is read with ``header=6``: row 6 (zero-based, not counting
    blank lines, which pandas skips by default) supplies the column names
    and everything above it is discarded.  No other cleaning is done here;
    the caller is responsible for renaming columns and filtering rows.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed table, exactly as returned by ``pandas.read_csv``.
    """
    # read_csv already returns a fresh frame, so no defensive copy is needed.
    return pd.read_csv(filepath, header=6)

pi_df = get_abs_income_data("../data/raw/ABS/2021-personal-income-by-postcode.csv")
# The first column carries the postcode label; give it a stable name.
pi_df = pi_df.rename(columns={pi_df.columns[0]: "Postcode"})
# Keep only rows whose label starts with 4 digits.  na=False treats missing
# labels as "no match" instead of producing a NaN mask (which boolean indexing
# rejects), and .copy() makes the result an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
is_postcode_row = pi_df["Postcode"].str.match(r"^\d{4}", na=False)
pi_df = pi_df.loc[is_postcode_row].copy()
# Reduce the label to its 4-digit postcode.
pi_df["Postcode"] = pi_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# Relabel the second column so the header regex below reads it as a $0 bracket.
# NOTE(review): presumably this is a nil/negative-income column - confirm
# against the raw file.
pi_df = pi_df.rename(columns={pi_df.columns[1]: "$0-"})

cols = pi_df.columns
ncols = len(cols)
# Bracket counts: every column between "Postcode" and the two trailing columns
# (assumed to be the total and an empty filler column - TODO confirm).
data = pi_df[cols[1:ncols-2]].apply(pd.to_numeric)
# Recompute the row total from the brackets actually used, so the fractions sum to 1.
total = data.sum(axis=1)
frac = data.div(total, axis=0)

# Weekly income assigned to each bracket = the first dollar amount in its
# header (the lower bound), e.g. "$1,000-$1,249" -> "1,000".  Capturing digits
# and commas directly also handles an open-ended header such as
# "$3,500 or more", which the previous "\$(.*?)-" pattern could not match.
weekly_income = data.columns.str.extract(r"\$([\d,]+)")[0]
# Fallback kept from the original code: if the last header still has no
# parseable amount, treat it as the $3,500 top bracket.
if pd.isna(weekly_income.iloc[-1]):
    weekly_income.iloc[-1] = "3,500"
# Strip thousands separators and convert to numbers.
weekly_income = pd.to_numeric(weekly_income.str.replace(",", "", regex=False))

# Expected weekly income per postcode: sum over brackets of (fraction * bracket value).
# .to_numpy() avoids label alignment (frac columns are header strings,
# weekly_income is indexed 0..n-1); NaNs propagate exactly as in a plain sum.
mean_weekly_income = frac.dot(weekly_income.to_numpy())
# Min-max normalise to the range 0-1.
mean_weekly_income = (mean_weekly_income - mean_weekly_income.min()) / (mean_weekly_income.max() - mean_weekly_income.min())

fig = go.Figure(data=[go.Histogram(x=mean_weekly_income, histnorm='probability', name='Personal Income')])
fig.show()

In [171]:
# process the education by postcode data
def get_education_data(filepath):
    """Read an education-by-postcode CSV export and return it unmodified.

    Column names are taken from row 6 of the file (``header=6``); rows above
    that are dropped by the parser.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight through to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(filepath, header=6)

ed_df = get_education_data("../data/raw/ABS/2021-education-by-postcode.csv")
# The first column carries the postcode label; give it a stable name.
ed_df = ed_df.rename(columns={ed_df.columns[0]: "Postcode"})
# Keep only rows whose label starts with 4 digits.  na=False treats missing
# labels as "no match" instead of producing a NaN mask (which boolean indexing
# rejects), and .copy() makes the result an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
is_postcode_row = ed_df["Postcode"].str.match(r"^\d{4}", na=False)
ed_df = ed_df.loc[is_postcode_row].copy()
# Reduce the label to its 4-digit postcode.
ed_df["Postcode"] = ed_df["Postcode"].str.extract(r"(\d{4})", expand=False)
display(ed_df)

# One numeric score per education column, in column order: the first data
# column (Postgraduate Degree Level) scores 8, the last (Secondary Education -
# Years 9 and below) scores 1.
level = [8,7,6,5,4,3,2,1] # this weighting is probably a bit too heavy to the higher levels
cols = ed_df.columns
ncols = len(cols)
# Education counts: every column between "Postcode" and the two trailing
# columns ("Total" and an empty unnamed column in the displayed table).
data = ed_df[cols[1:ncols-2]]
display(data)
# Convert values in data to numeric type.
data = data.apply(pd.to_numeric)
# The scores are positional, so a column-count mismatch would silently
# mis-weight (or crash deep inside numpy); fail early with a clear message.
assert data.shape[1] == len(level), (
    f"expected {len(level)} education columns, found {data.shape[1]}"
)
total = data.sum(axis=1)
# Share of each education level within the postcode.
frac = data.div(total, axis=0)
# Mean education score per postcode: sum over levels of (fraction * score).
# NaNs propagate exactly as in a plain sum.
mean_education_level = frac.dot(np.asarray(level))
# Min-max normalise to the range 0-1.
mean_education_level = (mean_education_level - mean_education_level.min()) / (mean_education_level.max() - mean_education_level.min())

# `fig` comes from the income cell.  Drop any 'Education Level' trace left by
# an earlier run of this cell so re-running does not stack duplicate traces.
fig.data = [trace for trace in fig.data if trace.name != 'Education Level']
fig.add_trace(go.Histogram(x=mean_education_level, histnorm='probability', name='Education Level'))
# Overlay the histograms semi-transparently so both distributions stay visible.
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()


Unnamed: 0,Postcode,Postgraduate Degree Level,Graduate Diploma and Graduate Certificate Level,Bachelor Degree Level,Advanced Diploma and Diploma Level,Certificate III & IV Level,Secondary Education - Years 10 and above,Certificate I & II Level,Secondary Education - Years 9 and below,Total,Unnamed: 10
1,2000,3813,489.0,8530.0,3814.0,1137.0,4761.0,16.0,397.0,22949.0,
2,2007,1134,134.0,2165.0,722.0,347.0,1426.0,0.0,163.0,6088.0,
3,2008,1731,228.0,3450.0,752.0,404.0,2265.0,0.0,97.0,8926.0,
4,2009,1872,346.0,3673.0,1162.0,712.0,2007.0,12.0,223.0,9993.0,
5,2010,4003,796.0,9068.0,2185.0,1402.0,4145.0,6.0,454.0,22052.0,
...,...,...,...,...,...,...,...,...,...,...,...
2637,4377,9,3.0,31.0,35.0,120.0,184.0,0.0,52.0,432.0,
2638,4380,155,90.0,680.0,509.0,1292.0,2293.0,10.0,754.0,5776.0,
2639,4383,3,3.0,15.0,30.0,112.0,179.0,0.0,88.0,435.0,
2640,4385,6,21.0,61.0,67.0,142.0,350.0,0.0,168.0,815.0,


Unnamed: 0,Postgraduate Degree Level,Graduate Diploma and Graduate Certificate Level,Bachelor Degree Level,Advanced Diploma and Diploma Level,Certificate III & IV Level,Secondary Education - Years 10 and above,Certificate I & II Level,Secondary Education - Years 9 and below
1,3813,489.0,8530.0,3814.0,1137.0,4761.0,16.0,397.0
2,1134,134.0,2165.0,722.0,347.0,1426.0,0.0,163.0
3,1731,228.0,3450.0,752.0,404.0,2265.0,0.0,97.0
4,1872,346.0,3673.0,1162.0,712.0,2007.0,12.0,223.0
5,4003,796.0,9068.0,2185.0,1402.0,4145.0,6.0,454.0
...,...,...,...,...,...,...,...,...
2637,9,3.0,31.0,35.0,120.0,184.0,0.0,52.0
2638,155,90.0,680.0,509.0,1292.0,2293.0,10.0,754.0
2639,3,3.0,15.0,30.0,112.0,179.0,0.0,88.0
2640,6,21.0,61.0,67.0,142.0,350.0,0.0,168.0
