In [1]:
import pandas as pd
import numpy as np
import json
from typing import List, Optional

  from pandas.core import (


In [19]:
class Table:
    """A census table assembled from one or more CSV part-files.

    The part-files are joined (pandas default inner join) on the
    ``SED_CODE_2021`` column into a single DataFrame held in ``self.df``.

    Attributes:
        code: Short identifier of the table (e.g. ``'g04'``).
        name: Human-readable table title.
        paths: CSV files that together make up the table.
        level1_name: Display name for the group of level-1 categories.
        level1: Column names in ``df`` that hold the level-1 categories.
        aggregates: Column names holding totals; only the first entry is
            used (as the denominator in ``summary_stats``).
        df: The merged data.
    """

    def __init__(self, code: str, name: str, paths: List[str], level1_name: str, level1: List[str], aggregates: List[str]):
        """Store the metadata and load/merge every CSV listed in ``paths``.

        Raises:
            IndexError: if ``paths`` is empty.
        """
        self.code = code
        self.name = name
        self.paths = paths
        self.level1_name = level1_name
        self.level1 = level1
        self.aggregates = aggregates
        self.df = pd.read_csv(paths[0])
        # Slicing an exhausted list yields nothing, so no length check is needed.
        for p in paths[1:]:
            self.df = pd.merge(self.df, pd.read_csv(p), on='SED_CODE_2021')

    def summary_stats(self) -> pd.DataFrame:
        """Add a percentage column for every level-1 category.

        For each column ``c`` in ``level1`` a column named
        ``f"{c}_by_{aggregates[0]}"`` is written to ``self.df`` containing
        ``c / aggregates[0] * 100``.

        Returns:
            ``self.df`` itself (mutated in place, not a copy).
        """
        total_name = self.aggregates[0]
        total = self.df[total_name]
        for col in self.level1:
            self.df[f"{col}_by_{total_name}"] = self.df[col] / total * 100
        return self.df

    def correlation(self, table: 'Table') -> float:
        """Mean absolute pairwise correlation of both tables' level-1 columns.

        The level-1 columns of ``self`` and ``table`` are joined on
        ``SED_CODE_2021``, their correlation matrix is computed, and the mean
        of the absolute off-diagonal entries is returned.

        NOTE(review): the raw level-1 columns are correlated, not the
        percentage columns produced by ``summary_stats``, and pairs of columns
        from the *same* table contribute to the mean as well. Confirm this is
        the intended metric.

        Args:
            table: The other table to compare against.

        Returns:
            The mean absolute off-diagonal correlation.
        """
        key = 'SED_CODE_2021'
        # summary_stats() is still invoked so both tables keep gaining their
        # percentage columns as a side effect, exactly as before.
        # Projecting to key + level-1 columns *before* merging prevents
        # unrelated columns that share a name across tables (or with a
        # level-1 column) from being suffixed and breaking the selection.
        left = self.summary_stats()[[key] + self.level1]
        right = table.summary_stats()[[key] + table.level1]
        merged_df = pd.merge(left, right, on=key).drop(columns=key)
        corr = merged_df.corr()
        # Mask out the diagonal (self-correlation is always 1).
        off_diagonal = corr.values[~np.eye(corr.shape[0], dtype=bool)]
        table_corr = np.mean(np.abs(off_diagonal))
        return table_corr

    def melt(self) -> pd.DataFrame:
        """Return the level-1 columns in long format.

        Returns:
            A DataFrame with columns ``SED_CODE_2021``, ``l1_category`` (the
            original column name) and ``population`` (the cell value).
        """
        melted_df = pd.melt(
            self.df,
            id_vars=['SED_CODE_2021'],
            value_vars=self.level1,
            var_name='l1_category',
            value_name='population',
        )
        return melted_df

    def to_json(self) -> str:
        """Serialise the table metadata and its melted level-1 data to JSON.

        Returns:
            A JSON document (indented by 4) with keys ``code``, ``name``,
            ``level1_name``, ``level1`` and ``data_level1``; the latter is an
            array of records as produced by ``melt``.
        """
        # DataFrame.to_json already yields a JSON *string*. Parse it back to
        # Python objects so that json.dumps below emits a real array rather
        # than a doubly-encoded, escaped string. The round-trip through
        # pandas also turns numpy scalars / NaN into JSON-safe values.
        records = json.loads(self.melt().to_json(orient='records'))

        final_json = {
            "code": self.code,
            "name": self.name,
            "level1_name": self.level1_name,
            "level1": self.level1,
            "data_level1": records
        }

        return json.dumps(final_json, indent=4)

    def __str__(self) -> str:
        """Return ``"Table <code>: <name>"``."""
        s = f"Table {self.code}: {self.name}"
        return s


In [3]:
# Table g04 ("Age by sex"): the level-1 categories are the person-count
# columns listed below; 'Tot_P' is used as the total.
g04_aggregates = ['Tot_P']
g04_level1_name = 'Age (45+ yr old)'
g04_level1 = [
    'Age_yr_45_49_P',
    'Age_yr_50_54_P',
    'Age_yr_55_59_P',
    'Age_yr_60_64_P',
    'Age_yr_65_69_P',
    'Age_yr_70_74_P',
    'Age_yr_75_79_P',
    'Age_yr_80_84_P',
    'Age_yr_85_89_P',
    'Age_yr_90_94_P',
    'Age_yr_95_99_P',
    'Age_yr_100_yr_over_P',
]

g04 = Table(
    code='g04',
    name='Age by sex',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G04A_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G04B_NSW_SED.csv',
    ],
    level1_name=g04_level1_name,
    level1=g04_level1,
    aggregates=g04_aggregates,
)

In [4]:
# Table g17 ("Total personal income (weekly) by age by sex"): the level-1
# categories are the income-band total columns; 'P_Tot_Tot' is the total.
g17_aggregates = ['P_Tot_Tot']
g17_level1_name = 'Weekly personal income'
g17_level1 = [
    'P_Neg_Nil_income_Tot',
    'P_1_149_Tot',
    'P_150_299_Tot',
    'P_300_399_Tot',
    'P_400_499_Tot',
    'P_500_649_Tot',
    'P_650_799_Tot',
    'P_800_999_Tot',
    'P_1000_1249_Tot',
    'P_1250_1499_Tot',
    'P_1500_1749_Tot',
    'P_1750_1999_Tot',
    'P_2000_2999_Tot',
    'P_3000_3499_Tot',
    'P_3500_more_Tot',
]

g17 = Table(
    code='g17',
    name='Total personal income (weekly) by age by sex',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G17A_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G17B_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G17C_NSW_SED.csv',
    ],
    level1_name=g17_level1_name,
    level1=g17_level1,
    aggregates=g17_aggregates,
)

In [11]:
# Table g18 ("Core activity need for assistance by age by sex"): two level-1
# categories (need / no need); 'P_Tot_Tot' is the total.
g18_aggregates = ['P_Tot_Tot']
g18_level1_name = 'Need for assistance'
g18_level1 = [
    'P_Tot_Need_for_assistance',
    'P_Tot_No_need_for_assistance',
]

g18 = Table(
    code='g18',
    name='Core activity need for assistance by age by sex',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G18_NSW_SED.csv',
    ],
    level1_name=g18_level1_name,
    level1=g18_level1,
    aggregates=g18_aggregates,
)

In [12]:
# Table g19 ("Type of long-term health condition by age by sex"): the level-1
# categories are the per-condition total columns; 'P_Tot_Tot' is the total.
g19_aggregates = ['P_Tot_Tot']
g19_level1_name = 'Long-term health condition'
g19_level1 = [
    'P_Asthma_Tot',
    'P_Cancer_Tot',
    'P_Dementia_Tot',
    'P_Diabetes_Tot',
    'P_Heart_disease_Tot',
    'P_Kidney_disease_Tot',
    'P_Lung_cond_Tot',
    'P_Mental_health_cond_Tot',
    'P_Stroke_Tot',
    'P_Other_Tot',
    'P_None_Tot',
]

g19 = Table(
    code='g19',
    name='Type of long-term health condition by age by sex',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G19A_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G19B_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G19C_NSW_SED.csv',
    ],
    level1_name=g19_level1_name,
    level1=g19_level1,
    aggregates=g19_aggregates,
)

In [13]:
# Table g33 ("Total household income (weekly) by household composition"): the
# level-1 categories are the income-band total columns; 'Tot_Tot' is the total.
g33_aggregates = ['Tot_Tot']
g33_level1_name = 'Weekly household income'
g33_level1 = [
    'Negative_Nil_income_Tot',
    'HI_1_149_Tot',
    'HI_150_299_Tot',
    'HI_300_399_Tot',
    'HI_400_499_Tot',
    'HI_500_649_Tot',
    'HI_650_799_Tot',
    'HI_800_999_Tot',
    'HI_1000_1249_Tot',
    'HI_1250_1499_Tot',
    'HI_1500_1749_Tot',
    'HI_1750_1999_Tot',
    'HI_2000_2499_Tot',
    'HI_2500_2999_Tot',
    'HI_3000_3499_Tot',
    'HI_3500_3999_Tot',
    'HI_4000_more_Tot',
]

g33 = Table(
    code='g33',
    name='Total household income (weekly) by household composition',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G33_NSW_SED.csv',
    ],
    level1_name=g33_level1_name,
    level1=g33_level1,
    aggregates=g33_aggregates,
)

In [14]:
# Table g54 ("Industry of employment by age by sex"): the level-1 categories
# are the per-industry total columns; 'P_Tot_Tot' is the total.
g54_aggregates = ['P_Tot_Tot']
g54_level1_name = 'Industry of employment'
g54_level1 = [
    'P_Ag_For_Fshg_Tot',
    'P_Mining_Tot',
    'P_Manufact_Tot',
    'P_El_Gas_Wt_Waste_Tot',
    'P_Constru_Tot',
    'P_WhlesaleTde_Tot',
    'P_RetTde_Tot',
    'P_Accom_food_Tot',
    'P_Trans_post_wrehsg_Tot',
    'P_Info_media_teleco_Tot',
    'P_Fin_Insur_Tot',
    'P_RtnHir_REst_Tot',
    'P_Pro_scien_tec_Tot',
    'P_Admin_supp_Tot',
    'P_Public_admin_sfty_Tot',
    'P_Educ_trng_Tot',
    'P_HlthCare_SocAs_Tot',
    'P_Art_recn_Tot',
    'P_Oth_scs_Tot',
]

g54 = Table(
    code='g54',
    name='Industry of employment by age by sex',
    paths=[
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G54A_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G54B_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G54C_NSW_SED.csv',
        '../data/2021 Census GCP State Electroral Division for NSW/2021Census_G54D_NSW_SED.csv',
    ],
    level1_name=g54_level1_name,
    level1=g54_level1,
    aggregates=g54_aggregates,
)

In [15]:
# Every Table built above, in table-code order.
all_tables = [
    g04,
    g17,
    g18,
    g19,
    g33,
    g54,
]

In [16]:
import pickle

# Persist the table objects. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through; the previous
# bare open() left the handle dangling.
with open('../data/table_objects.p', 'wb') as fh:
    pickle.dump(all_tables, fh)

In [17]:
def query_first_match(tables, attribute, value):
    """Return the first item of ``tables`` whose ``attribute`` equals ``value``.

    Items lacking the attribute are treated as having it set to ``None``.
    Returns ``None`` when nothing matches.
    """
    matches = (
        candidate
        for candidate in tables
        if getattr(candidate, attribute, None) == value
    )
    return next(matches, None)

# Example: look a table up by its code and show its string form.
print(query_first_match(tables=all_tables, attribute="code", value='g04'))

Table g04: Age by sex


In [18]:
# Score one reference table against every other table in `all_tables`.
table1 = g33
correlations = {}

for table2 in all_tables:
    if table2.code == table1.code:
        continue  # do not compare the reference table with itself
    correlations[table2.code] = table1.correlation(table2)

print(f"{table1.name} (table code: {table1.code}) has the following correlations\n")
for code, score in correlations.items():
    print(f"{code}: {score:.2f}")

Total household income (weekly) by household composition (table code: g33) has the following correlations

g04: 0.43
g17: 0.49
g18: 0.51
g19: 0.49
g54: 0.38
