### ML Final Project - Devinee Amin

### Model Analysis

ML models used: linear regression (with L1 and L2 regularization), polynomial regression, and decision tree regression.


### Evaluation

Evaluation metrics used: root mean squared error (RMSE) and the coefficient of determination (R2).

In [1]:
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
# Restore `counties_df` from IPython's %store persistence; it must have been
# saved beforehand (e.g. by another notebook running `%store counties_df`).
%store -r counties_df
# Preview the first rows to confirm the frame was restored.
counties_df.head()

Unnamed: 0,county,state,latitude,longitude,total_votes_2020,TotalPop,VotingAgeCitizen,poverty_percent,MeanCommute,covid_cases,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,covid_case_percent,voter_turnout
701,Dale,AL,31.430371,-85.610957,19699.0,49393.0,37173.0,19.6,20.7,1926.0,6.1,69.5,19.2,0.5,0.9,0.0,23194.0,3.899338,52.992764
733,DeKalb,AL,34.459469,-85.807829,29322.0,71194.0,49579.0,21.5,23.2,3691.0,14.3,80.8,1.4,1.2,0.3,0.1,20020.0,5.184426,59.141975
1579,Lauderdale,AL,34.901719,-87.656247,44149.0,92590.0,72887.0,16.3,24.1,2743.0,2.5,84.9,9.8,0.4,0.7,0.0,25803.0,2.962523,60.571844
495,Choctaw,AL,32.022273,-88.265644,7461.0,13188.0,10454.0,22.3,32.7,395.0,0.5,56.3,42.1,0.0,0.1,0.0,20994.0,2.995147,71.369811
1792,Marshall,AL,34.36976,-86.304867,39492.0,94738.0,65777.0,21.0,23.9,4570.0,13.3,81.8,2.2,0.5,0.6,0.1,22710.0,4.82383,60.039223


In [3]:
# Six columns of counties_df are used as model features.
X = counties_df[['latitude','longitude','White', 'Black','IncomePerCap','covid_case_percent']]
# Target: voter turnout, reshaped into a column vector of shape (n_samples, 1).
y = counties_df['voter_turnout'].values.reshape(-1, 1)
list_cols = X.columns

# train (60%), validation (20%), test (20%) sets split
# 60% is carved off for training first; the remaining 40% is then halved
# into validation and test. Both splits are seeded for reproducibility.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.6, random_state=5)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=5)

# One print per statement: writing `print(a), print(b)` as the last line built
# a tuple of the prints' return values, so the cell also echoed `(None, None)`.
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)
print(y_test.shape)

(1796, 6)
(1796, 1)
(599, 6)
(599, 1)
(599, 6)
(599, 1)


(None, None)

In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for each feature column.
# X is already a DataFrame, so describe() can be called on it directly.
X.describe()

Unnamed: 0,latitude,longitude,White,Black,IncomePerCap,covid_case_percent
count,2994.0,2994.0,2994.0,2994.0,2994.0,2994.0
mean,38.170671,-92.27331,77.070875,8.887709,25770.099198,2.888295
std,4.898537,11.42866,19.835191,14.380569,6015.542188,1.753104
min,19.601212,-159.596679,0.6,0.0,9334.0,0.0
25%,34.524318,-98.206834,65.5,0.6,21730.75,1.678357
50%,38.249008,-90.625976,84.4,2.1,25143.0,2.648279
75%,41.574269,-83.890935,93.0,9.975,28686.0,3.791173
max,48.82228,-70.087747,100.0,86.9,69529.0,20.460504


# Linear Regression

In [5]:
# Baseline: ordinary least-squares linear regression fitted on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
lin_model = LinearRegression().fit(X_train, y_train)

# Leave the fitted estimator as the cell's last expression so it is displayed.
lin_model

LinearRegression()

In [6]:
# linear regression model evaluation
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas np.sqrt(mean_squared_error(...)) works on every version.

# evaluate on training set
y_pred_train = lin_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)
print("Training RMSE = " + str(rmse_train))
print("Training R2 = " + str(r2_train))

# evaluate on validation set
y_pred_valid = lin_model.predict(X_valid)
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
r2_valid = r2_score(y_valid, y_pred_valid)
print("Validation RMSE = " + str(rmse_valid))
print("Validation R2 = " + str(r2_valid))

Training RMSE = 8.803375859465397
Training R2 = 0.3846235070183428
Validation RMSE = 9.102751623403979
Validation R2 = 0.3449683022235168


In [7]:
# lasso regression (L1 regularization)
# NOTE(review): only a single regularization strength (alpha=1) is evaluated
# here; sweep several alpha values if real hyperparameter tuning is intended.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.

L1_model = Lasso(alpha=1)
L1_model.fit(X_train, y_train)

# evaluate on training set
y_pred_train = L1_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)
print("Training RMSE = " + str(rmse_train))
print("Training R2 = " + str(r2_train))

# evaluate on validation set
y_pred_valid = L1_model.predict(X_valid)
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
r2_valid = r2_score(y_valid, y_pred_valid)
print("Validation RMSE = " + str(rmse_valid))
print("Validation R2 = " + str(r2_valid))

Training RMSE = 8.82668777556837
Training R2 = 0.3813600772939548
Validation RMSE = 9.153729439526272
Validation R2 = 0.33761105705567385


In [8]:
# ridge regression (L2 regularization)
# NOTE(review): only a single regularization strength (alpha=1) is evaluated
# here; sweep several alpha values if real hyperparameter tuning is intended.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.

L2_model = Ridge(alpha=1)
L2_model.fit(X_train, y_train)

# evaluate on training set
y_pred_train = L2_model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
r2_train = r2_score(y_train, y_pred_train)
print("Training RMSE = " + str(rmse_train))
print("Training R2 = " + str(r2_train))

# evaluate on validation set
y_pred_valid = L2_model.predict(X_valid)
rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
r2_valid = r2_score(y_valid, y_pred_valid)
print("Validation RMSE = " + str(rmse_valid))
print("Validation R2 = " + str(r2_valid))

Training RMSE = 8.803375860977617
Training R2 = 0.3846235068069275
Validation RMSE = 9.10275574053424
Validation R2 = 0.3449677096880842


In [9]:
# linear regression without regularization and with L2 regularization perform about equally well
# the L2 regularized model is used to evaluate on test set
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.

y_pred_test = L2_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
print("Test RMSE = " + str(rmse_test))
print("Test R2 = " + str(r2_test))

Test RMSE = 9.231303499787485
Test R2 = 0.299441568898959


# Polynomial Regression

In [10]:
# Polynomial regression: rebuild the train (60%) / validation (20%) / test (20%)
# partitions. The arguments and seed match the split made at the top of the
# notebook, so the same rows land in each partition.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=5
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=5
)

In [11]:
# evaluate polynomial regression
# tune hyperparameters by varying polynomial degree from degrees 2 to 10
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
# NOTE(review): the recorded validation scores fluctuate non-monotonically
# across degrees; the raw, unscaled features may make high-degree fits
# ill-conditioned -- consider standardizing the features first.

for p in range(2, 11):
    # Fit the feature expansion on the training split only, then reuse the
    # fitted transformer for validation (transform, not fit_transform).
    # The test split is deliberately left untouched during tuning.
    polyTune = PolynomialFeatures(degree=p)
    X_trainpoly = polyTune.fit_transform(X_train)
    X_valipoly = polyTune.transform(X_valid)

    # train model
    lin_model = LinearRegression()
    lin_model.fit(X_trainpoly, y_train)

    print("**DEGREE " + str(p) + "**")
    # evaluate on training set
    y_pred_train = lin_model.predict(X_trainpoly)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    r2_train = r2_score(y_train, y_pred_train)
    print("Training RMSE = " + str(rmse_train))
    print("Training R2 = " + str(r2_train))

    # evaluate on validation set
    y_pred_valid = lin_model.predict(X_valipoly)
    rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    r2_valid = r2_score(y_valid, y_pred_valid)
    print("Validation RMSE = " + str(rmse_valid))
    print("Validation R2 = " + str(r2_valid))
    print("")

**DEGREE 2**
Training RMSE = 7.715323200689969
Training R2 = 0.5273380297049466
Validation RMSE = 8.120974276925686
Validation R2 = 0.47864541085401313

**DEGREE 3**
Training RMSE = 7.578936265975879
Training R2 = 0.5439012064435701
Validation RMSE = 8.1452241180588
Validation R2 = 0.4755271539417353

**DEGREE 4**
Training RMSE = 8.213473456988458
Training R2 = 0.464331473906046
Validation RMSE = 11.563499021523803
Validation R2 = -0.05704984512229894

**DEGREE 5**
Training RMSE = 6.795378147916239
Training R2 = 0.6333348178223194
Validation RMSE = 8.56206780264321
Validation R2 = 0.42047222353503755

**DEGREE 6**
Training RMSE = 7.379871279046706
Training R2 = 0.5675459314824264
Validation RMSE = 10.149023675397723
Validation R2 = 0.18573557272907326

**DEGREE 7**
Training RMSE = 6.9418920196853255
Training R2 = 0.6173531684514035
Validation RMSE = 8.849381554569051
Validation R2 = 0.380925703917261

**DEGREE 8**
Training RMSE = 7.697017015348553
Training R2 = 0.5295783435140515
Valid

In [12]:
# polynomial regression for degree=2 performs the best
# the polynomial regression model with degree=2 is used to evaluate on test set

poly = PolynomialFeatures(degree=2)
X_trainpoly = poly.fit_transform(X_train)
# Reuse the transformer fitted on the training data; held-out data should be
# passed through transform() rather than refitted with fit_transform().
X_testpoly = poly.transform(X_test)

# train model
lin_model = LinearRegression()
lin_model.fit(X_trainpoly, y_train)

# evaluate on test set
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_test = lin_model.predict(X_testpoly)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
print("Test RMSE = " + str(rmse_test))
print("Test R2 = " + str(r2_test))

Test RMSE = 8.376469667227838
Test R2 = 0.42317995348129933


# Decision Tree Regression

In [13]:
# Decision tree regression: rebuild the train (60%) / validation (20%) /
# test (20%) partitions. The arguments and seed match the split made at the
# top of the notebook, so the same rows land in each partition.
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=5
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=5
)

In [14]:
# tune hyperparameters by varying max depth of tree
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.

for p in range(1, 11):

    # train model
    # random_state is fixed because scikit-learn permutes the features at each
    # split, so equally good candidate splits can otherwise be resolved
    # differently from run to run and the reported scores would not reproduce.
    tree_model = DecisionTreeRegressor(max_depth=p, random_state=5)
    tree_model.fit(X_train, y_train)

    print("**MAX DEPTH " + str(p) + "**")
    # evaluate on training set
    y_pred_train = tree_model.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    r2_train = r2_score(y_train, y_pred_train)
    print("Training RMSE = " + str(rmse_train))
    print("Training R2 = " + str(r2_train))

    # evaluate on validation set
    y_pred_valid = tree_model.predict(X_valid)
    rmse_valid = np.sqrt(mean_squared_error(y_valid, y_pred_valid))
    r2_valid = r2_score(y_valid, y_pred_valid)
    print("Validation RMSE = " + str(rmse_valid))
    print("Validation R2 = " + str(r2_valid))
    print("")

**MAX DEPTH 1**
Training RMSE = 9.662693208333458
Training R2 = 0.25862353487579715
Validation RMSE = 10.058640726340212
Validation R2 = 0.2001739890193921

**MAX DEPTH 2**
Training RMSE = 9.119188804662569
Training R2 = 0.33967942676180296
Validation RMSE = 9.791491525520664
Validation R2 = 0.2420952381517223

**MAX DEPTH 3**
Training RMSE = 8.41395371170132
Training R2 = 0.4378623800905793
Validation RMSE = 9.068221208322441
Validation R2 = 0.3499284761936813

**MAX DEPTH 4**
Training RMSE = 7.825413646898414
Training R2 = 0.5137529042886996
Validation RMSE = 8.866092195402024
Validation R2 = 0.37858545093880824

**MAX DEPTH 5**
Training RMSE = 7.318634421887234
Training R2 = 0.5746930089948583
Validation RMSE = 8.648813276822997
Validation R2 = 0.4086699178156562

**MAX DEPTH 6**
Training RMSE = 6.811802263943955
Training R2 = 0.6315602501689598
Validation RMSE = 8.685526869326218
Validation R2 = 0.40363895475562117

**MAX DEPTH 7**
Training RMSE = 6.1796274861655425
Training R2 = 0

In [15]:
# evaluate on test set
# max_depth=5 has the lowest validation RMSE among the depths whose results
# are shown in the sweep output.
# random_state is fixed so the fitted tree (and therefore the reported test
# scores) is reproducible across runs.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=5)
tree_model.fit(X_train, y_train)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_test = tree_model.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
print("Test RMSE = " + str(rmse_test))
print("Test R2 = " + str(r2_test))

Test RMSE = 9.323898391250458
Test R2 = 0.2853171361249116


# Results
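The polynomial regression model with degree=2 performs best: it achieves the lowest test RMSE (about 8.38) and the highest test R2 (about 0.42) of the three models evaluated.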

In [16]:
# Refit the selected degree-2 polynomial model and predict on the test split.
poly = PolynomialFeatures(degree=2)
X_trainpoly = poly.fit_transform(X_train)
# Reuse the transformer fitted on the training data; held-out data should be
# passed through transform() rather than refitted with fit_transform().
X_testpoly = poly.transform(X_test)

lin_model = LinearRegression()
lin_model.fit(X_trainpoly, y_train)
y_pred_test = lin_model.predict(X_testpoly)

# Preview the test-set feature rows that the predictions were made for.
dfX = pd.DataFrame(X_test)
dfX.head()

Unnamed: 0,latitude,longitude,White,Black,IncomePerCap,covid_case_percent
1817,37.220784,-88.710366,88.7,6.1,23434.0,1.178567
1242,38.384828,-93.792937,94.0,1.5,24371.0,2.033526
1913,39.415616,-78.941049,94.2,3.3,21888.0,0.889829
915,47.263316,-109.224521,94.4,0.1,27491.0,2.2954
2554,37.157721,-91.401904,94.2,0.1,17903.0,3.263629


In [17]:
# Wrap the test-set predictions in a single-column frame for display.
# NOTE(review): the frame gets a fresh positional index (0, 1, 2, ...), not the
# row labels of X_test -- align the indexes before joining it with the features.
dfY = pd.DataFrame(y_pred_test, columns=['voter_turnout'])
dfY.head()

Unnamed: 0,voter_turnout
0,62.541862
1,65.80122
2,57.697352
3,77.902241
4,55.472581
