# Linear regression of vaccination rates and census data

Let's begin by importing everything we need.

In [8]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from IPython.display import display, HTML

Here, we read in the data and create an empty list that will collect the result rows; at the end, we'll concatenate them into a single pandas DataFrame.

In [9]:
# Load the combined vaccination / census table from the CSV next to this notebook.
data = pd.read_csv("./data-combined.csv")

# Accumulator for the rows that will make up the final export.
parts = list()

# Show the loaded table.
display(data)

Unnamed: 0,TOR_Code,TOR_Name,% population ACTUALLY vaccinated (55+ yo),% reporting knowledge of neither English nor French (2016 census),% reporting mother tongue is neither English nor French (2016 census),% households earning LOWEST income (2016 census),% households earning HIGHEST income (2016 census),Nb. households earning top 10% LOWEST income (bottom decile),Nb. households earning 10% HIGHEST income (top decile),% Home Internet Download Speeds Over 50 Mbps (based on FSA),...,% Labour Force Population in Manufacturing (2016 census),% Labour Force Population in Wholesale trade (2016 census),% Labour Force Population in Retail trade (2016 census),% Labour Force Population in Transportation and warehousing (2016 census),% Labour Force Population in Health care and social assistance (2016 census),% Labour Force Population in Accommodation and food services (2016 census),Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,2,Mount Olive-Silverstone-Jamestown,12.7,7.66,57.95,77.41,1.22,25410.0,400.0,0.49,...,18.30,4.13,10.71,8.40,9.22,7.83,0.0,0.0,2.0,2.0
1,115,Mount Dennis,16.2,4.63,44.35,70.64,1.51,9370.0,200.0,0.78,...,10.15,2.97,10.08,5.37,11.31,7.61,0.0,1.0,1.0,2.0
2,82,Niagara,16.3,1.47,28.26,36.83,19.88,11210.0,6050.0,0.67,...,4.07,4.21,6.81,2.44,7.20,6.91,0.0,1.0,2.0,3.0
3,22,Humbermede,16.4,6.92,56.56,71.29,1.83,11075.0,285.0,0.40,...,17.80,4.19,9.62,7.79,8.84,7.53,0.0,0.0,0.0,0.0
4,72,Regent Park,16.9,5.99,51.67,75.46,5.43,8055.0,580.0,0.68,...,3.53,2.74,9.45,4.24,9.54,13.78,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,96,Casa Loma,34.2,0.64,20.50,31.42,39.90,3390.0,4305.0,0.52,...,2.63,2.32,6.64,1.31,10.04,4.63,0.0,0.0,0.0,0.0
136,12,Markland Wood,34.2,1.91,41.01,33.38,20.52,3490.0,2145.0,0.40,...,6.69,4.67,10.45,5.77,7.97,4.49,0.0,0.0,3.0,3.0
137,101,Forest Hill South,34.6,0.79,21.97,28.20,46.23,2995.0,4910.0,0.53,...,2.46,2.87,7.45,1.47,13.84,5.16,0.0,0.0,2.0,2.0
138,71,Cabbagetown-South St. James Town,35.5,2.35,24.57,47.70,20.10,5245.0,2210.0,0.64,...,2.82,2.20,7.43,2.96,9.97,9.08,0.0,0.0,2.0,2.0


In [11]:
# Regress each census measure on the vaccination rate and record the R^2 score.
#
# The column at position PREDICTOR_COL of `data` is the single predictor X; each
# of the N_MEASURES columns starting at FIRST_MEASURE_COL is used in turn as the
# response Y. The result is one (measure name, R^2) row per measure.

# Column positions in `data`, and how many measure columns are analysed.
PREDICTOR_COL = 2
FIRST_MEASURE_COL = 3
N_MEASURES = 20

# Start from an empty list every time this cell runs, so that re-running it
# does not append a second copy of every row to what a previous run left behind.
parts = []

# The predictor is the same for every measure, so build it once.
# reshape(-1, 1) gives the 2-D (n_rows, 1) shape that scikit-learn expects, and
# the single float cast is used for both fitting and scoring.
X = data.iloc[:, PREDICTOR_COL].values.reshape(-1, 1).astype(float)

# For each measure...
for col in range(N_MEASURES):

    # ...prepare the data...
    measure_idx = FIRST_MEASURE_COL + col
    measure = data.columns[measure_idx]
    Y = data.iloc[:, measure_idx].values.reshape(-1, 1)

    # (this handles any blank or N/A values in the data; SimpleImputer's
    # default strategy replaces them with the column mean.)
    Y = SimpleImputer().fit_transform(Y).astype(float)

    # ...then do a linear regression and score it (R^2) on the same data...
    score = LinearRegression().fit(X, Y).score(X, Y)

    # ...and finally record one [measure, score] row for the export.
    parts.append([measure, score])

# Build the export table in a single step. The columns stay labelled 0 and 1
# with a default integer index, so the saved CSV keeps its existing layout.
results = pd.DataFrame(parts)

# Save and print it.
results.to_csv("./results.csv")
display(results)


Unnamed: 0,0,1
0,% reporting knowledge of neither English nor F...,0.13206
1,% reporting mother tongue is neither English n...,0.162728
2,% households earning LOWEST income (2016 census),0.480597
3,% households earning HIGHEST income (2016 census),0.507689
4,Nb. households earning top 10% LOWEST income (...,0.226858
5,Nb. households earning 10% HIGHEST income (top...,0.197261
6,% Home Internet Download Speeds Over 50 Mbps (...,0.000935
7,% Fast or Very Fast Home Internet Service Rela...,0.002199
8,% Worried About Ability to Pay Home Internet B...,0.017055
9,% Driving to work (2016 census),0.086095


The end.