# Machine Learning for Predicting Income Growth
Here, the entirety of the historical income data will be used for training (i.e. there will not be a train/test split).

In [22]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [23]:
# Relative locations of the three data layers used by this notebook:
# landing, raw and curated.
LANDING_DATA_DIR, RAW_DATA_DIR, CURATED_DATA_DIR = (
    "../../../data/landing",
    "../../../data/raw",
    "../../../data/curated",
)

## Prepare Data for ML

### Load in Training Data

In [24]:
# Read the curated historical income table and discard the "SA2" code
# column, which is treated as irrelevant for modelling.
income = pd.read_csv(f"{CURATED_DATA_DIR}/income.csv").drop(columns=["SA2"])

In [35]:
# Preview the first rows of the training data
income.head()

Unnamed: 0,SA2,SA2 NAME,Year,Median Income,SA2_bin
0,201011001,Alfredton,2015,49385,4
1,201011002,Ballarat,2015,49564,23
2,201011003,Ballarat - North,2015,45816,24
3,201011004,Ballarat - South,2015,41544,25
4,201011005,Buninyong,2015,47511,67


In [26]:
# List the distinct years covered by the historical data
income["Year"].unique()

array([2015, 2016, 2017, 2018, 2019])

### Create New Dataset for Predictions

In [37]:
# Years to project, immediately following the historical data
projection_years = list(range(2020, 2029))

# Distinct SA2 names, in order of first appearance in the training data
sa2_names = list(income["SA2 NAME"].unique())

# Build one row per (year, SA2) pair in year-major order: every SA2 for the
# first year, then every SA2 for the next year, and so on. Both lists are
# sized from `projection_years` and `sa2_names`, so changing the projection
# horizon cannot leave them with mismatched lengths.
years = [year for year in projection_years for _ in sa2_names]
sa2 = sa2_names * len(projection_years)

# Create new dataframe for projected incomes
income_new_dict = {"SA2 NAME": sa2, "Year": years}
income_new = pd.DataFrame(income_new_dict)

In [38]:
# Preview the first rows of the (not yet encoded) prediction dataset
income_new.head()

Unnamed: 0,SA2 NAME,Year
0,Alfredton,2020
1,Ballarat,2020
2,Ballarat - North,2020
3,Ballarat - South,2020
4,Buninyong,2020


### One-Hot Encoding
This needs to be completed for the `SA2 NAME` column 

#### For Training Data

In [29]:
# Treat the SA2 name as a categorical variable and keep its integer code
sa2_categorical = income["SA2 NAME"].astype("category")
income["SA2 NAME"] = sa2_categorical
income["SA2_bin"] = sa2_categorical.cat.codes

# Fit the encoder on the integer codes and expand them into one indicator
# column per SA2
enc = OneHotEncoder()
sa2_indicators = enc.fit_transform(income[["SA2_bin"]]).toarray()
enc_data = pd.DataFrame(sa2_indicators)

# Attach the indicator columns to the original rows (joined on the index)
income_enc = income.join(enc_data)

income_enc.head()

# # Alternative: use the pandas library to complete the one-hot encoding
# income_encoded = pd.get_dummies(income, columns = ["SA2 NAME"])
# print(income_encoded)

Unnamed: 0,SA2,SA2 NAME,Year,Median Income,SA2_bin,0,1,2,3,4,...,451,452,453,454,455,456,457,458,459,460
0,201011001,Alfredton,2015,49385,4,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,201011002,Ballarat,2015,49564,23,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,201011003,Ballarat - North,2015,45816,24,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,201011004,Ballarat - South,2015,41544,25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,201011005,Buninyong,2015,47511,67,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### For Predictions

In [30]:
# Convert to a categorical using the *training* category list, so every SA2
# name maps to the same integer code the encoder was fitted on. Deriving the
# codes from this frame's own categories only lines up when both frames
# contain exactly the same set of names.
training_categories = income["SA2 NAME"].astype("category").cat.categories
income_new["SA2 NAME"] = pd.Categorical(
    income_new["SA2 NAME"], categories=training_categories
)

# Assign numerical values; a name absent from the training data gets code -1
income_new["SA2_bin"] = income_new["SA2 NAME"].cat.codes

# Encode columns with the encoder fitted on the training data
enc_new_data = pd.DataFrame(enc.transform(income_new[["SA2_bin"]]).toarray())
income_new_enc = income_new.join(enc_new_data)

In [31]:
# Preview the first rows of the encoded prediction dataset
income_new_enc.head()

Unnamed: 0,SA2 NAME,Year,SA2_bin,0,1,2,3,4,5,6,...,451,452,453,454,455,456,457,458,459,460
0,Alfredton,2020,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ballarat,2020,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Ballarat - North,2020,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Ballarat - South,2020,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Buninyong,2020,67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Linear Regression Model

In [32]:
# Inspect the column labels of the encoded training data
income_enc.columns

Index([          'SA2',      'SA2 NAME',          'Year', 'Median Income',
             'SA2_bin',               0,               1,               2,
                     3,               4,
       ...
                   451,             452,             453,             454,
                   455,             456,             457,             458,
                   459,             460],
      dtype='object', length=466)

### Fit Model

In [33]:
# Target: the median income recorded on each row
income_y = income_enc["Median Income"]

# Features: everything except the target, the SA2 name and its integer code
non_feature_columns = ["SA2 NAME", "Median Income", "SA2_bin"]
income_X = income_enc.drop(columns=non_feature_columns)

# The one-hot columns carry integer labels while the rest are strings;
# make every column label a string
income_X.columns = income_X.columns.astype(str)

# Fit linear regression model to the full training data
regr = LinearRegression()
regr.fit(income_X, income_y)

### Generate Income Predictions for Future Years

In [None]:
# Keep only the model inputs by dropping the SA2 name and its integer code
columns_to_exclude = ["SA2 NAME", "SA2_bin"]
income_pred_X = income_new_enc.drop(columns=columns_to_exclude)

# Make every column label a string, as was done for the training features
income_pred_X.columns = income_pred_X.columns.astype(str)

# Predict with the fitted regression model
income_new_pred = regr.predict(income_pred_X)

### TODO: Decide whether predictions need to be rounded to whole numbers

## Analyse and Visualise Trend of Projections and Historical Data

## Save Predictions to CSV

In [None]:
# Add predictions to dataframe for projected income. The column is named
# "Median Income" to match the target column of the training data (these
# values are income predictions, not population figures).
income_new["Median Income"] = income_new_pred

# Remove "Bin" column, which was only needed for the one-hot encoding
income_new.drop(columns = "SA2_bin", inplace = True)

# Output to csv
income_new.to_csv(f"{CURATED_DATA_DIR}/income_predictions.csv", index = False)