# Machine Learning for Predicting Population Growth
Here, the entirety of the historical population data is used for training (i.e. there is no train/test split).

In [19]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [20]:
# Data directories, given relative to the working directory
_DATA_ROOT = "../../../data"
LANDING_DATA_DIR = f"{_DATA_ROOT}/landing"
RAW_DATA_DIR = f"{_DATA_ROOT}/raw"
CURATED_DATA_DIR = f"{_DATA_ROOT}/curated"

## Prepare Data for ML

### Load in Training Data

In [21]:
# Read the curated historical population table, discarding the columns
# that are not used as model inputs
pop = pd.read_csv(f"{CURATED_DATA_DIR}/population.csv").drop(
    columns=["S/T name", "SA2 code"]
)

In [22]:
# Preview the first five rows of the training data
pop.head(n=5)

Unnamed: 0,SA2 name,Year,Population
0,Alfredton,2001,5756
1,Ballarat,2001,11497
2,Buninyong,2001,5320
3,Delacombe,2001,4154
4,Smythes Creek,2001,3317


### Create New Dataset for Predictions

In [23]:
# Years to project, listed in ascending order
years_to_project = [2023, 2024, 2025, 2026, 2027, 2028]
sa2_names = list(pop["SA2 name"].unique())

# Build the rows year-major: every SA2 region is listed for the first year,
# then again for the next year, and so on. Both list lengths are derived from
# the inputs rather than from a hard-coded repeat count, so editing the year
# list cannot leave the two columns with mismatched lengths.
sa2 = sa2_names * len(years_to_project)
years = [year for year in years_to_project for _ in sa2_names]

# Create new dataframe for projected populations
pop_new_dict = {"SA2 name": sa2, "Year": years}
pop_new = pd.DataFrame(pop_new_dict)

In [24]:
# Preview the first five rows of the prediction dataset
pop_new.head(n=5)

Unnamed: 0,SA2 name,Year
0,Alfredton,2023
1,Ballarat,2023
2,Buninyong,2023
3,Delacombe,2023
4,Smythes Creek,2023


### One-Hot Encoding
The `SA2 name` column is categorical, so it must be one-hot encoded before it can be passed to the model.

#### For Training Data

In [25]:
# Convert data type to category so each SA2 name maps to an integer code
pop["SA2 name"] = pop["SA2 name"].astype("category")

# Assign numerical values (the category code of each SA2 name)
pop["SA2_bin"] = pop["SA2 name"].cat.codes

# One-hot encode the integer codes. `enc` is fitted here and reused later to
# transform the prediction dataset.
enc = OneHotEncoder()
enc_data = pd.DataFrame(
    enc.fit_transform(pop[["SA2_bin"]]).toarray(),
    index=pop.index,  # keep rows aligned with `pop` for the index-based join
)
pop_enc = pop.join(enc_data)

pop_enc.head()

Unnamed: 0,SA2 name,Year,Population,SA2_bin,0,1,2,3,4,5,...,512,513,514,515,516,517,518,519,520,521
0,Alfredton,2001,5756,4,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ballarat,2001,11497,24,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Buninyong,2001,5320,73,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Delacombe,2001,4154,142,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Smythes Creek,2001,3317,422,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### For Predictions

In [26]:
# Convert to category using the categories of the training data, so that a
# given SA2 name receives the same integer code here as it had when `enc` was
# fitted. Codes derived from this frame alone would only agree with the
# training codes if both frames contained exactly the same set of names.
train_categories = pop["SA2 name"].astype("category").cat.categories
pop_new["SA2 name"] = pd.Categorical(pop_new["SA2 name"], categories=train_categories)

# Assign numerical values; a name absent from the training data gets code -1,
# which `enc.transform` rejects under its default `handle_unknown="error"`
pop_new["SA2_bin"] = pop_new["SA2 name"].cat.codes

# Encode columns with the encoder fitted on the training data
enc_new_data = pd.DataFrame(
    enc.transform(pop_new[["SA2_bin"]]).toarray(),
    index=pop_new.index,  # keep rows aligned with `pop_new` for the index-based join
)
pop_new_enc = pop_new.join(enc_new_data)

In [27]:
# Preview the first five rows of the encoded prediction dataset
pop_new_enc.head(n=5)

Unnamed: 0,SA2 name,Year,SA2_bin,0,1,2,3,4,5,6,...,512,513,514,515,516,517,518,519,520,521
0,Alfredton,2023,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ballarat,2023,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Buninyong,2023,73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Delacombe,2023,142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Smythes Creek,2023,422,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Linear Regression Model

In [28]:
# Inspect the column labels of the encoded training data before choosing
# which columns to use as model inputs
pop_enc.columns

Index([  'SA2 name',       'Year', 'Population',    'SA2_bin',            0,
                  1,            2,            3,            4,            5,
       ...
                512,          513,          514,          515,          516,
                517,          518,          519,          520,          521],
      dtype='object', length=526)

### Fit Model

In [29]:
# Target is the population count; the inputs are every remaining column once
# the name, target and integer-code columns are removed
non_feature_cols = ["SA2 name", "Population", "SA2_bin"]
pop_y = pop_enc["Population"]
pop_X = pop_enc.drop(columns=non_feature_cols)

# The one-hot columns carry integer labels; make every column label a string
pop_X.columns = pop_X.columns.map(str)

# Fit linear regression model to training data
regr = LinearRegression()
regr.fit(pop_X, pop_y)

### Generate Population Predictions for Future Years

In [30]:
# Keep only the model inputs and give every column a string label,
# mirroring how the training features were prepared
pop_pred_X = (
    pop_new_enc
    .drop(["SA2 name", "SA2_bin"], axis=1)
    .rename(columns=str)
)

# Generate new predictions
pop_new_pred = regr.predict(pop_pred_X)

### TODO: Decide whether the predictions should be rounded to whole numbers

## Analyse and Visualise Trend of Projections and Historical Data

## Save Predictions to CSV

In [31]:
# Destination file for the projected populations
predictions_path = f"{CURATED_DATA_DIR}/population_predictions.csv"

# Attach the predictions, then discard the helper integer-code column
pop_new["Population"] = pop_new_pred
pop_new = pop_new.drop(columns="SA2_bin")

# Write out without the row index
pop_new.to_csv(predictions_path, index=False)