# Raw Preprocessing for Income Data

In [1]:
import pandas as pd

In [2]:
# Data directories, given as paths relative to the directory the notebook runs from.
# Landing: where the source income workbook is read from.
LANDING_DATA_DIR = "../../../data/landing"
# Raw: where this notebook writes its cleaned income.csv.
RAW_DATA_DIR = "../../../data/raw"
# Curated: not referenced by any cell visible in this notebook.
CURATED_DATA_DIR = "../../../data/curated"


In [3]:
# Load the "Table 1.4" sheet of the landing workbook, skipping the first
# six rows of the sheet before parsing
income_workbook = f"{LANDING_DATA_DIR}/historical_income_data.xlsx"
income = pd.read_excel(
    io=income_workbook,
    sheet_name="Table 1.4",
    skiprows=6,
)

## Remove Unnecessary Columns
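**Columns to keep:** the `SA2` and `SA2 NAME` identifier columns, plus all columns associated with Median Income (the `.3` suffix is how pandas labels the fourth repeat of a duplicated year header):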
- `2015-16.3`
- `2016-17.3`
- `2017-18.3`
- `2018-19.3`
- `2019-20.3`


In [4]:
# Keep only the two SA2 identifier columns and the five median-income columns
keep_cols = [
    "SA2", "SA2 NAME",
    "2015-16.3", "2016-17.3", "2017-18.3", "2018-19.3", "2019-20.3",
]

# Every column that is not explicitly kept is marked for removal
remove_cols = [name for name in income.columns if name not in keep_cols]

# drop() returns a new frame, so the loaded `income` table stays untouched
median_income = income.drop(columns=remove_cols)
print(median_income.shape)

(2300, 7)


In [5]:
# Ad-hoc lookup, left disabled: uncomment to list the rows whose SA2 NAME contains "Point Cook".
# NOTE(review): SA2 NAME has missing values, so the mask would contain NaN; pass na=False to
# .str.contains(...) before using it as a row filter.
# median_income[median_income['SA2 NAME'].str.contains("Point Cook")]

In [6]:
# Preview the first five rows of the trimmed table
median_income.head()

Unnamed: 0,SA2,SA2 NAME,2015-16.3,2016-17.3,2017-18.3,2018-19.3,2019-20.3
0,Australia,,47692,48360,49805,51389,52338
1,New South Wales,,48085,48700,50153,51818,52849
2,101021007,Braidwood,39716,41288,42003,41593,44246
3,101021008,Karabar,55870,57880,59295,61777,62946
4,101021009,Queanbeyan,54999,55376,57848,60119,61724


## Rename Columns for Year

In [9]:
# Give each income column a short label: the calendar year in which the
# financial year starts (e.g. 2015-16 -> 2015)
rename_dict = {
    "2015-16.3": "2015",
    "2016-17.3": "2016",
    "2017-18.3": "2017",
    "2018-19.3": "2018",
    "2019-20.3": "2019",
}

# Rebind the name to the renamed frame instead of mutating in place
median_income = median_income.rename(columns=rename_dict)

In [10]:
# Preview the first five rows again to confirm the year columns carry their new labels
median_income.head()

Unnamed: 0,SA2,SA2 NAME,2015,2016,2017,2018,2019
0,Australia,,47692,48360,49805,51389,52338
1,New South Wales,,48085,48700,50153,51818,52849
2,101021007,Braidwood,39716,41288,42003,41593,44246
3,101021008,Karabar,55870,57880,59295,61777,62946
4,101021009,Queanbeyan,54999,55376,57848,60119,61724


## Remove Rows with NAs
We need to remove rows that do not contain useful information, i.e. rows that have no income values.

In [13]:
# Count the missing (NA) values in each column to see where incomplete rows occur
median_income.isna().sum()


SA2          1
SA2 NAME    12
2015         3
2016         3
2017         3
2018         3
2019         3
dtype: int64

In [14]:
# Boolean mask: True for every row that has at least one missing value
na = median_income.isna().any(axis=1)

# Index *labels* of the rows flagged by the mask
rows_with_na = median_income[na].index.tolist()

# Show rows with NA values.
# rows_with_na holds index labels, so select with the label-based .loc.
# The positional .iloc only gives the same rows while the index is an
# untouched 0..n-1 range; once any row has been dropped it would show the
# wrong rows or raise IndexError.
median_income.loc[rows_with_na, :]

Unnamed: 0,SA2,SA2 NAME,2015,2016,2017,2018,2019
0,Australia,,47692.0,48360.0,49805.0,51389.0,52338.0
1,New South Wales,,48085.0,48700.0,50153.0,51818.0,52849.0
578,Victoria,,46984.0,47709.0,49266.0,51027.0,51996.0
1041,Queensland,,46869.0,47425.0,48826.0,50298.0,51197.0
1570,South Australia,,46110.0,46937.0,48354.0,49888.0,50440.0
1743,Western Australia,,52504.0,51450.0,52671.0,54220.0,55208.0
1996,Tasmania,,43833.0,44437.0,45546.0,47352.0,47909.0
2096,Northern Territory,,59466.0,58669.0,60636.0,61517.0,62010.0
2165,Australian Capital Territory,,63061.0,63038.0,64332.0,66594.0,68325.0
2297,,,,,,,


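At this stage, only rows 2297-2299 can be removed: they contain no income values at all. The other rows with a missing `SA2 NAME` are the national and state/territory totals, which still hold valid income figures and are therefore kept.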

In [15]:
# Remove the rows that carry no income figures at all (rows 2297-2299 in the
# current extract).
# They are selected by content rather than by hard-coded index label, so the
# cell can be re-run safely (a second label-based drop raises KeyError) and
# still removes the right rows if the source sheet gains or loses rows.
id_cols = ["SA2", "SA2 NAME"]
income_cols = [name for name in median_income.columns if name not in id_cols]

# how="all": a row is dropped only when every income column is missing, so
# rows that merely lack an SA2 NAME (national/state totals) are kept.
median_income = median_income.dropna(subset=income_cols, how="all")

In [16]:
# Re-count NA values per column to check the removal worked: the SA2 and
# income columns should now report 0, leaving only missing SA2 NAME values
median_income.isna().sum()

SA2         0
SA2 NAME    9
2015        0
2016        0
2017        0
2018        0
2019        0
dtype: int64

In [17]:
# (rows, columns) of the cleaned table; expect three rows fewer than before the removal
median_income.shape

(2297, 7)

## Output to CSV in Raw data directory

In [18]:
# Write the cleaned table to income.csv in the raw-data directory;
# index=False keeps the DataFrame index out of the file
income_csv_path = f"{RAW_DATA_DIR}/income.csv"
median_income.to_csv(income_csv_path, index=False)