In [188]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go
import numpy as np

In [189]:
# process the personal income by postcode data
def get_abs_income_data(filepath):
    """Load the ABS personal-income-by-postcode CSV export into a DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table exactly as parsed by ``pd.read_csv``; no renaming,
        filtering or type conversion is done here.
    """
    # header=6: the column names are taken from row index 6 (0-based);
    # pandas discards every row before it.
    return pd.read_csv(filepath, header=6)

# --- Personal income: mean weekly income per postcode, scaled to 0-1 ---
pi_df = get_abs_income_data("../data/raw/ABS/2021-personal-income-by-postcode.csv")
# rename first column to "Postcode"
pi_df = pi_df.rename(columns={pi_df.columns[0]: "Postcode"})
# Keep only rows whose first field starts with 4 digits.
#   na=False -> a missing first field counts as "no match" instead of
#               producing a NaN mask that cannot be used for indexing.
#   .copy()  -> make the filtered frame independent, so the column assignment
#               below does not raise SettingWithCopyWarning.
pi_df = pi_df[pi_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
pi_df["Postcode"] = pi_df["Postcode"].str.extract(r"(\d{4})", expand=False)
# rename second column to "$0-" so the header regex below reads it as 0
pi_df = pi_df.rename(columns={pi_df.columns[1]: "$0-"})

# The income brackets are every column except the first (Postcode) and the
# last two.
cols = pi_df.columns
ncols = len(cols)
data = pi_df[cols[1:ncols-2]]
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
# The row total is recomputed from the bracket columns themselves.
total = data.sum(axis=1)
# for each column in data, divide by total -> share of people per bracket
frac = data.div(total, axis=0)

# Representative income per bracket, parsed from the header: the text between
# the first "$" and the first "-".
weekly_income = data.columns.str.extract(r"\$(.*?)-")[0]
# The last header is not handled by the regex above, so its value is set by
# hand (presumably an open-ended top bracket with no "-" -- TODO confirm).
weekly_income.iloc[-1] = '3,500'
# drop thousands separators, then convert to numbers
weekly_income = weekly_income.str.replace(',', '', regex=False)
weekly_income = pd.to_numeric(weekly_income)

# Weighted mean per postcode: sum over brackets of share * representative
# income. .to_numpy() makes the product positional; passing the Series itself
# would try to align its 0..n-1 index with the column labels of frac.
mean_weekly_income = frac.dot(weekly_income.to_numpy())
# normalize mean_weekly_income to the range 0-1
mean_weekly_income = (mean_weekly_income - mean_weekly_income.min()) / (mean_weekly_income.max() - mean_weekly_income.min())

# This figure is created here and extended by the cells that follow.
fig = go.Figure(data=[go.Histogram(x=mean_weekly_income, histnorm='probability', name='Personal Income')])
fig.show()

In [190]:
# process the education by postcode data
def get_education_data(filepath):
    """Read the education-by-postcode CSV at ``filepath`` into a DataFrame.

    The column names are taken from row index 6 (0-based) of the file; the
    parsed table is returned without any further processing.
    """
    education_table = pd.read_csv(filepath, header=6)
    return education_table

# --- Education: mean education level per postcode, scaled to 0-1 ---
ed_df = get_education_data("../data/raw/ABS/2021-education-by-postcode.csv")
# rename first column to "Postcode"
ed_df = ed_df.rename(columns={ed_df.columns[0]: "Postcode"})
# Keep only rows whose first field starts with 4 digits.
#   na=False -> a missing first field counts as "no match" instead of
#               producing a NaN mask that cannot be used for indexing.
#   .copy()  -> make the filtered frame independent, so the column assignment
#               below does not raise SettingWithCopyWarning.
ed_df = ed_df[ed_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
ed_df["Postcode"] = ed_df["Postcode"].str.extract(r"(\d{4})", expand=False)

# create a numeric value to represent each column (first column -> 8, ...)
level = [8,7,6,5,4,3,2,1] # this weighting is probably a bit too heavy to the higher levels
# The education columns are every column except the first (Postcode) and the
# last two.
cols = ed_df.columns
ncols = len(cols)
data = ed_df[cols[1:ncols-2]]
# The weights are applied by position, so there must be exactly one per column.
if len(level) != data.shape[1]:
    raise ValueError(
        f"expected {len(level)} education columns, found {data.shape[1]}"
    )
# convert values in data to numeric type
data = data.apply(pd.to_numeric)
total = data.sum(axis=1)
# for each column in data, divide by total -> share of people per level
frac = data.div(total, axis=0)
# Weighted mean per postcode: sum over columns of share * level weight.
mean_education_level = frac.dot(np.asarray(level, dtype=float))
# normalize mean_education_level to the range 0-1
mean_education_level = (mean_education_level - mean_education_level.min()) / (mean_education_level.max() - mean_education_level.min())

# Overlay on the existing `fig`, which must already be defined by an earlier
# cell. Re-running this cell adds another copy of the trace.
fig.add_trace(go.Histogram(x=mean_education_level, histnorm='probability', name='Education Level'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()


In [191]:
# process the age by postcode data
def get_age_data(filepath):
    """Read the age-by-postcode CSV at ``filepath`` into a DataFrame.

    The column names are taken from row index 6 (0-based) of the file; the
    parsed table is returned without any further processing.
    """
    age_table = pd.read_csv(filepath, header=6)
    return age_table
# --- Age: mean age per postcode, scaled to 0-1 ---
age_df = get_age_data("../data/raw/ABS/2021-age-by-postcode.csv")
# rename first column to "Postcode"
age_df = age_df.rename(columns={age_df.columns[0]: "Postcode"})
# Keep only rows whose first field starts with 4 digits.
#   na=False -> a missing first field counts as "no match" instead of
#               producing a NaN mask that cannot be used for indexing.
#   .copy()  -> make the filtered frame independent, so the column assignment
#               below does not raise SettingWithCopyWarning.
age_df = age_df[age_df["Postcode"].str.match(r"^\d{4}.*", na=False)].copy()
# strip anything after the first 4 digits from the Postcode
age_df["Postcode"] = age_df["Postcode"].str.extract(r"(\d{4})", expand=False)

# pull out just the age groups into a new dataframe named data:
# every column except the first (Postcode) and the last two (Total + a
# trailing unnamed column)
cols = age_df.columns
ncols = len(cols)
data = age_df[cols[1:ncols-2]]

# create a numeric value to represent each column based on the midpoint of the
# age range, e.g. "0-4 years" -> (0 + 4) / 2 = 2
age_range = age_df.columns[1:ncols-2]
age_range = age_range.str.extract(r"(\d+)-(\d+)")
age_range = age_range.astype(float)
age_range = age_range.mean(axis=1)
# "100 years and over" has no "lo-hi" pattern, so the regex leaves it as NaN;
# it is represented by 100.
age_range.iloc[-1] = 100
age_range = age_range.astype(int)

# convert values in data to numeric type
data = data.apply(pd.to_numeric)
total = data.sum(axis=1)
# for each column in data, divide by total -> share of people per age group
frac = data.div(total, axis=0)
# Weighted mean per postcode: sum over age groups of share * midpoint.
# .to_numpy() makes the product positional; passing the Series itself would
# try to align its 0..n-1 index with the column labels of frac.
mean_age = frac.dot(age_range.to_numpy())
# normalize mean_age to the range 0-1
mean_age = (mean_age - mean_age.min()) / (mean_age.max() - mean_age.min())

# Overlay on the existing `fig`, which must already be defined by an earlier
# cell. Re-running this cell adds another copy of the trace.
fig.add_trace(go.Histogram(x=mean_age, histnorm='probability', name='Age'))
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()
# Inspection output: the per-column ages used as weights, the numeric age
# table, and the cleaned source frame.
print(age_range)
display(data)
display(age_df)

0       2
1       7
2      12
3      17
4      22
5      27
6      32
7      37
8      42
9      47
10     52
11     57
12     62
13     67
14     72
15     77
16     82
17     87
18     92
19     97
20    100
dtype: int64


Unnamed: 0,0-4 years,5-9 years,10-14 years,15-19 years,20-24 years,25-29 years,30-34 years,35-39 years,40-44 years,45-49 years,...,55-59 years,60-64 years,65-69 years,70-74 years,75-79 years,80-84 years,85-89 years,90-94 years,95-99 years,100 years and over
1,694,423.0,302.0,610.0,3429.0,5634.0,5050.0,3374.0,2060.0,1327.0,...,1044.0,833.0,725.0,585.0,390.0,233.0,113.0,48.0,10.0,0.0
2,219,172.0,99.0,265.0,1261.0,1468.0,1183.0,778.0,502.0,327.0,...,218.0,169.0,177.0,113.0,61.0,56.0,36.0,17.0,4.0,0.0
3,213,120.0,83.0,480.0,2662.0,2293.0,1566.0,840.0,540.0,364.0,...,260.0,245.0,168.0,127.0,72.0,29.0,12.0,3.0,3.0,0.0
4,612,379.0,308.0,296.0,708.0,1448.0,1750.0,1599.0,1113.0,794.0,...,679.0,643.0,577.0,511.0,301.0,154.0,71.0,22.0,6.0,0.0
5,609,435.0,385.0,369.0,1987.0,4133.0,4296.0,3027.0,2214.0,1745.0,...,1483.0,1227.0,912.0,740.0,516.0,352.0,212.0,94.0,28.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2637,29,47.0,41.0,22.0,25.0,20.0,23.0,24.0,27.0,31.0,...,70.0,49.0,47.0,57.0,38.0,20.0,8.0,0.0,0.0,0.0
2638,302,437.0,555.0,446.0,289.0,320.0,384.0,387.0,422.0,496.0,...,565.0,658.0,695.0,649.0,446.0,292.0,156.0,84.0,11.0,5.0
2639,17,27.0,33.0,23.0,18.0,20.0,23.0,29.0,24.0,37.0,...,64.0,68.0,73.0,60.0,27.0,21.0,12.0,7.0,0.0,0.0
2640,69,67.0,71.0,51.0,39.0,41.0,55.0,67.0,44.0,66.0,...,75.0,99.0,89.0,84.0,72.0,43.0,33.0,14.0,6.0,0.0


Unnamed: 0,Postcode,0-4 years,5-9 years,10-14 years,15-19 years,20-24 years,25-29 years,30-34 years,35-39 years,40-44 years,...,65-69 years,70-74 years,75-79 years,80-84 years,85-89 years,90-94 years,95-99 years,100 years and over,Total,Unnamed: 23
1,2000,694,423.0,302.0,610.0,3429.0,5634.0,5050.0,3374.0,2060.0,...,725.0,585.0,390.0,233.0,113.0,48.0,10.0,0.0,27936.0,
2,2007,219,172.0,99.0,265.0,1261.0,1468.0,1183.0,778.0,502.0,...,177.0,113.0,61.0,56.0,36.0,17.0,4.0,0.0,7410.0,
3,2008,213,120.0,83.0,480.0,2662.0,2293.0,1566.0,840.0,540.0,...,168.0,127.0,72.0,29.0,12.0,3.0,3.0,0.0,10400.0,
4,2009,612,379.0,308.0,296.0,708.0,1448.0,1750.0,1599.0,1113.0,...,577.0,511.0,301.0,154.0,71.0,22.0,6.0,0.0,12658.0,
5,2010,609,435.0,385.0,369.0,1987.0,4133.0,4296.0,3027.0,2214.0,...,912.0,740.0,516.0,352.0,212.0,94.0,28.0,0.0,26443.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2637,4377,29,47.0,41.0,22.0,25.0,20.0,23.0,24.0,27.0,...,47.0,57.0,38.0,20.0,8.0,0.0,0.0,0.0,625.0,
2638,4380,302,437.0,555.0,446.0,289.0,320.0,384.0,387.0,422.0,...,695.0,649.0,446.0,292.0,156.0,84.0,11.0,5.0,8093.0,
2639,4383,17,27.0,33.0,23.0,18.0,20.0,23.0,29.0,24.0,...,73.0,60.0,27.0,21.0,12.0,7.0,0.0,0.0,607.0,
2640,4385,69,67.0,71.0,51.0,39.0,41.0,55.0,67.0,44.0,...,89.0,84.0,72.0,43.0,33.0,14.0,6.0,0.0,1166.0,
