# Curated Preprocessing for Income Data

In [1]:
import pandas as pd

In [2]:
# Data directory locations, given as relative paths (resolved against the
# kernel's current working directory).
# LANDING_DATA_DIR is not referenced by any cell visible in this notebook.
LANDING_DATA_DIR = "../../../data/landing"
# Directory the raw income.csv is read from.
RAW_DATA_DIR = "../../../data/raw"
# Directory the cleaned income.csv is written to.
CURATED_DATA_DIR = "../../../data/curated"

In [4]:
# Read the raw income table; it still contains every state plus the
# national/state aggregate rows.
raw_income_path = f"{RAW_DATA_DIR}/income.csv"
income = pd.read_csv(raw_income_path)

## Remove Rows not in Victoria
The data is structured so that each national/state aggregate row (identifiable by an NA value in `SA2 NAME`) is followed by the rows of median income for each SA2 belonging to it. Hence, by locating the Victorian aggregate row and the next state aggregate row, we can isolate the rows in between, which are the Victorian data, and remove all other rows.

In [5]:
# Boolean mask: True for every row that has at least one missing value
na = income.isna().any(axis=1)

# Index labels of those rows (kept for reference when choosing row bounds)
rows_with_na = income.index[na].tolist()

# Show rows with NA values. Filtering with the mask directly avoids feeding
# index *labels* to the positional `iloc`, which is only correct when the
# frame has a default 0..n-1 index.
income.loc[na]

Unnamed: 0,SA2,SA2 NAME,2015,2016,2017,2018,2019
0,Australia,,47692,48360,49805,51389,52338
1,New South Wales,,48085,48700,50153,51818,52849
578,Victoria,,46984,47709,49266,51027,51996
1041,Queensland,,46869,47425,48826,50298,51197
1570,South Australia,,46110,46937,48354,49888,50440
1743,Western Australia,,52504,51450,52671,54220,55208
1996,Tasmania,,43833,44437,45546,47352,47909
2096,Northern Territory,,59466,58669,60636,61517,62010
2165,Australian Capital Territory,,63061,63038,64332,66594,68325


In [6]:
# Remove rows unrelated to Victoria.
# Aggregate (national/state) rows are the only rows without an `SA2 NAME`, so
# the Victorian SA2 rows are exactly those lying strictly between the
# "Victoria" aggregate row and the next aggregate row (or the end of the
# table). The bounds are derived from the data rather than hard-coded so the
# cell stays correct if the raw file gains or loses rows.
target_state = "Victoria"

# Positions (0-based) of every aggregate row
is_aggregate = income["SA2 NAME"].isna().to_numpy()
aggregate_rows = [row_num for row_num, flag in enumerate(is_aggregate) if flag]

# Position of the aggregate row for the target state
state_rows = [
    row_num
    for row_num in aggregate_rows
    if str(income["SA2"].iloc[row_num]).strip() == target_state
]
if len(state_rows) != 1:
    raise ValueError(
        f"Expected exactly one aggregate row for {target_state!r}, "
        f"found {len(state_rows)}"
    )
state_row = state_rows[0]

# Position of the following aggregate row; falls back to the end of the table
# when the target state is the last one listed
next_aggregate_row = min(
    (row_num for row_num in aggregate_rows if row_num > state_row),
    default=len(income),
)

# Positional range of the rows to keep (excludes both aggregate rows)
keep_rows = range(state_row + 1, next_aggregate_row)

# Positional slice keeps the original index labels; copy so later cells never
# operate on a view of `income`
income_vic = income.iloc[keep_rows.start:keep_rows.stop].copy()

In [7]:
# Preview the first five Victorian rows
income_vic.head(n=5)

Unnamed: 0,SA2,SA2 NAME,2015,2016,2017,2018,2019
579,201011001,Alfredton,49385,50845,52448,53932,55204
580,201011002,Ballarat,49564,50413,51736,53688,53784
581,201011003,Ballarat - North,45816,46561,49211,50593,52068
582,201011004,Ballarat - South,41544,42531,44293,45828,47010
583,201011005,Buninyong,47511,49179,51034,52377,54308


In [8]:
# Preview the last five Victorian rows. `.tail()` also behaves correctly when
# the frame has fewer than five rows, where a manual `len - 5` slice start
# would go negative and return the wrong rows.
income_vic.tail()

Unnamed: 0,SA2,SA2 NAME,2015,2016,2017,2018,2019
1036,217031476,Otway,33020,32563,33929,36219,36510
1037,217041477,Moyne - East,40053,41244,44000,46071,47120
1038,217041478,Moyne - West,41751,42159,43136,45256,46843
1039,217041479,Warrnambool - North,43536,44772,46945,48855,50654
1040,217041480,Warrnambool - South,42330,43439,45726,47242,49080


## Reshape Year Columns into Rows (Wide to Long)

In [9]:
# Identifier columns stay as they are; every other column is treated as a
# year column and is unpivoted into rows
col_keep = ["SA2", "SA2 NAME"]
col_convert = [col for col in income_vic.columns if col not in col_keep]

# Wide -> long: one row per (SA2, Year) pair
income_transposed = income_vic.melt(
    id_vars=col_keep,
    value_vars=col_convert,
    var_name="Year",
    value_name="Median Income",
)

In [10]:
# Preview the first five rows of the long-format table
income_transposed.head(n=5)

Unnamed: 0,SA2,SA2 NAME,Year,Median Income
0,201011001,Alfredton,2015,49385
1,201011002,Ballarat,2015,49564
2,201011003,Ballarat - North,2015,45816
3,201011004,Ballarat - South,2015,41544
4,201011005,Buninyong,2015,47511


## Remove Entries with "np"

In [11]:
# List the rows whose `Median Income` holds the placeholder string "np"
# instead of a number
has_np_marker = income_transposed["Median Income"].eq("np")
income_transposed.loc[has_np_marker]

Unnamed: 0,SA2,SA2 NAME,Year,Median Income
79,205021080,Alps - East,2015,np
82,205021083,Lake King,2015,np
91,205031092,Wilsons Promontory,2015,np
126,206041127,West Melbourne,2015,np
193,208031192,Moorabbin Airport,2015,np
235,210011227,Essendon Airport,2015,np
544,205021083,Lake King,2016,np
1006,205021083,Lake King,2017,np
1468,205021083,Lake King,2018,np
1927,205021080,Alps - East,2019,np


In [12]:
# Keep only the rows whose `Median Income` is not the "np" placeholder
income_clean = income_transposed.loc[income_transposed["Median Income"].ne("np")]

## Data Type Conversion

In [13]:
# Map column name to new data type.
# Only `Median Income` is converted: with the "np" rows removed, its remaining
# values can be cast to integers. The column is named explicitly instead of
# being selected by position, so the cast cannot silently target a different
# column if the column order changes.
convert_dict = {"Median Income": "int"}

# Convert datatypes (returns a new frame; `income_clean` is left untouched)
income_final = income_clean.astype(convert_dict)

In [14]:
# Preview the first five rows of the final, typed table
income_final.head(n=5)

Unnamed: 0,SA2,SA2 NAME,Year,Median Income
0,201011001,Alfredton,2015,49385
1,201011002,Ballarat,2015,49564
2,201011003,Ballarat - North,2015,45816
3,201011004,Ballarat - South,2015,41544
4,201011005,Buninyong,2015,47511


## Output to CSV in Curated data directory

In [15]:
# Write the cleaned table to the curated directory, omitting the index column
curated_income_path = f"{CURATED_DATA_DIR}/income.csv"
income_final.to_csv(curated_income_path, index=False)