# Raw Preprocessing for Population Data

In [1]:
import pandas as pd

In [3]:
LANDING_DATA_DIR = "../../../data/landing"
RAW_DATA_DIR = "../../../data/raw"
CURATED_DATA_DIR = "../../../data/curated"

In [5]:
# Load in data
pop = pd.read_excel(io = f"{LANDING_DATA_DIR}/population.xlsx", 
                    sheet_name = "Table 1", skiprows = 7)

## Remove Unnecessary Columns
**Columns to remove:**
- `S/T code`
- `GCCSA code`
- `GCCSA name`
- `SA4 code`
- `SA4 name`
- `SA3 code`
- `SA3 name`

In [6]:
# Remove specified columns
pop.drop(columns = ["S/T code", "GCCSA code", "GCCSA name", "SA4 code", 
                    "SA4 name", "SA3 code", "SA3 name"], inplace = True)

## Rename Columns for Year

In [7]:
# Rename columns for each year
rename_dict = {"no.": "2001", "no..1": "2002", "no..2": "2003", "no..3": "2004",
               "no..4": "2005", "no..5": "2006", "no..6": "2007", 
               "no..7": "2008", "no..8": "2009", "no..9": "2010", 
               "no..10": "2011", "no..11": "2012", "no..12": "2013",
               "no..13": "2014", "no..14": "2015", "no..15": "2016", 
               "no..16": "2017", "no..17": "2018", "no..18": "2019",
               "no..19": "2020", "no..20": "2021", "no..21": "2022",}
pop.rename(columns = rename_dict, inplace = True)

## Remove Rows wth NAs

In [8]:
# Number of NA values per column
pop.isna().sum()


S/T name    8
SA2 code    8
SA2 name    7
2001        7
2002        7
2003        7
2004        7
2005        7
2006        7
2007        7
2008        7
2009        7
2010        7
2011        7
2012        7
2013        7
2014        7
2015        7
2016        7
2017        7
2018        7
2019        7
2020        7
2021        7
2022        7
dtype: int64

In [9]:
# Create a boolean mask for NA values
na = pop.isna().any(axis=1)

# Filter the dataframe using the boolean mask
rows_with_na = pop[na].index.tolist()

# show rows with NA values
pop.iloc[rows_with_na, :]

Unnamed: 0,S/T name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,,,,,,,,,,,...,,,,,,,,,,
2455,,,,,,,,,,,...,,,,,,,,,,
2456,,,TOTAL AUSTRALIA,19274701.0,19495210.0,19720737.0,19932722.0,20176844.0,20450966.0,20827622.0,...,23128129.0,23475686.0,23815995.0,24190907.0,24594202.0,24966643.0,25340217.0,25655289.0,25688079.0,25996144.0
2457,,,,,,,,,,,...,,,,,,,,,,
2458,,,,,,,,,,,...,,,,,,,,,,
2459,,,,,,,,,,,...,,,,,,,,,,
2460,,,,,,,,,,,...,,,,,,,,,,
2461,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Remove all rows with NAs
pop.drop(index = rows_with_na, inplace = True)

In [11]:
# Check that NA removal was successful
pop.isna().sum()

S/T name    0
SA2 code    0
SA2 name    0
2001        0
2002        0
2003        0
2004        0
2005        0
2006        0
2007        0
2008        0
2009        0
2010        0
2011        0
2012        0
2013        0
2014        0
2015        0
2016        0
2017        0
2018        0
2019        0
2020        0
2021        0
2022        0
dtype: int64

## Data Type Conversion

In [12]:
# Map column name to new data type in dictionary
convert_dict = {}

for col_num in range(3, len(pop.columns)):
    convert_dict[pop.columns[col_num]] = "int"

# Convert datatypes
pop = pop.astype(convert_dict)

In [13]:
pop.head()

Unnamed: 0,S/T name,SA2 code,SA2 name,2001,2002,2003,2004,2005,2006,2007,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1,New South Wales,101021007.0,Braidwood,2760,2811,2835,2844,2847,2965,3102,...,3685,3762,3849,3950,4039,4140,4211,4273,4330,4364
2,New South Wales,101021008.0,Karabar,9129,9199,9263,9277,9209,9212,9033,...,8848,8731,8603,8531,8526,8507,8488,8519,8546,8535
3,New South Wales,101021009.0,Queanbeyan,9717,9513,9522,9400,9595,9682,9793,...,11050,11199,11213,11230,11355,11447,11450,11437,11370,11376
4,New South Wales,101021010.0,Queanbeyan - East,3925,4073,4219,4218,4187,4319,4459,...,4983,4967,4961,4970,5013,5072,5117,5077,5093,5093
5,New South Wales,101021012.0,Queanbeyan West - Jerrabomberra,9425,10257,11085,11549,12046,12358,12622,...,13210,13193,13164,13150,13083,13008,12935,12796,12743,12775


## Output to CSV in Raw data directory

In [14]:
pop.to_csv(f"{RAW_DATA_DIR}/population.csv", index = False)