In [2]:
# This script will run an analysis of population growth for all of Canada
# using our preprocessed Canadian census data
# Typical data processing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV

In [5]:
# Read the preprocessed 2016 and 2021 census tables for the selected data set.
# dataSetName is interpolated into both file names, so changing it here
# switches both inputs together.
dataSetName = 'Canada'
census2016, census2021 = map(
    pd.read_csv,
    (
        f'../processedData/processed_{dataSetName}_2016.csv',
        f'../processedData/processed_{dataSetName}_2021.csv',
    ),
)

In [7]:
# Build two row-aligned tables: the full 2016 census features (trimmed2016) and the
# 2021 population counts (trimmed2021), restricted to communities present in both years.
#
# A community is identified by the pair (GEO_NAME, Province): the name alone is not
# enough because the same community name can appear in more than one province.
matchKeys = ['GEO_NAME', 'Province']

# From 2021 we only need the match keys plus the population count used for training
trimmed2021 = census2021[matchKeys + ['Population, 2021']]
# Drop any rows with a missing value in any of these three columns
trimmed2021 = trimmed2021.dropna()
# Sort the data by community name
trimmed2021 = trimmed2021.sort_values(by='GEO_NAME')
# Keep only the 2021 communities that also exist in the 2016 data.
# We need a merge rather than a simple isin() because the match is across multiple columns.
# NOTE(review): if a (GEO_NAME, Province) pair occurs more than once in either input,
# an inner merge repeats rows for it -- confirm the keys are unique in the processed files.
trimmed2021 = trimmed2021.merge(census2016[matchKeys], on=matchKeys, how='inner')

# Now trim the 2016 data to only include what has a match in our trimmed 2021 data
trimmed2016 = census2016.merge(trimmed2021[matchKeys], on=matchKeys, how='inner')

# Finally, put both dataframes in the same row order. Sorting on GEO_NAME alone leaves
# rows that share a community name (in different provinces) tied, and tied rows may end up
# in a different relative order in each dataframe. Sorting on the full key pair
# removes those ties so that row i in both dataframes refers to the same community.
trimmed2016 = trimmed2016.sort_values(by=matchKeys)
trimmed2021 = trimmed2021.sort_values(by=matchKeys)

print(f'Trimmed 2016 data Shape: {trimmed2016.shape}')
print(f'Trimmed 2021 data Shape: {trimmed2021.shape}')


Trimmed 2016 data Shape: (3405, 216)
Trimmed 2021 data Shape: (3405, 3)
