In [1]:
#Load packages
from subprocess import call
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from IPython.display import Image
from matplotlib import pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz

In [2]:
#Load the state-level dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='StateData.csv')
#Preview the first five rows
data.head(n=5)

Unnamed: 0,Population,Income,Illiteracy,LifeExp,Murder,HighSchoolGrad,Frost,Area,Longitude,Latitude,Region
0,3615,3624,2.1,69.05,15.1,41.3,20,50708,-86.7509,32.5901,South
1,365,6315,1.5,69.31,11.3,66.7,152,566432,-127.25,49.25,West
2,2212,4530,1.8,70.55,7.8,58.1,15,113417,-111.625,34.2192,West
3,2110,3378,1.9,70.66,10.1,39.9,65,51945,-92.2992,34.7336,South
4,21198,5114,1.1,71.71,10.3,62.6,20,156361,-119.773,36.5341,West


In [3]:
#Response variable: life expectancy
target = data['LifeExp']
#Predictors: every other column except the response, the two coordinates and the region label
features = data.drop(columns=['LifeExp','Longitude','Latitude','Region'])

In [4]:
#Ordinary least squares of LifeExp on all features.
#add_constant appends the intercept column as the last column (prepend=False).
mod = sm.OLS(endog=target, exog=sm.add_constant(features, prepend=False))
res = mod.fit()
print(res.summary())
#Murder looks to be the most important predictor: its confidence interval is far from zero,
#its coefficient is large relative to the scale of the variable, and its p-value is approximately 0.

                            OLS Regression Results                            
Dep. Variable:                LifeExp   R-squared:                       0.736
Model:                            OLS   Adj. R-squared:                  0.692
Method:                 Least Squares   F-statistic:                     16.74
Date:                Tue, 22 Nov 2022   Prob (F-statistic):           2.53e-10
Time:                        18:50:40   Log-Likelihood:                -51.855
No. Observations:                  50   AIC:                             119.7
Df Residuals:                      42   BIC:                             135.0
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Population       5.18e-05   2.92e-05      1.

In [21]:

#Split the data (80% train / 20% test; random_state fixes which rows land in each part)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=1)
#CART
#random_state pins the tree's random tie-breaking between equally good splits,
#so the score printed below no longer changes from run to run.
dtree = DecisionTreeRegressor(random_state=0)
dtree.fit(X_train,y_train)
print('CART R-squared:', dtree.score(X_test, y_test))
#Lasso
lass = Lasso()
lass.fit(X_train,y_train)
print('Lasso R-squared:', lass.score(X_test, y_test))
#In the recorded output Lasso (-0.63) is less bad than CART (-1.10), but both R-squared values
#are negative, i.e. both models do worse than always predicting the mean of the test targets.
#NOTE: the recorded CART score was produced with an unseeded tree; re-run to refresh it.

CART R-squared: -1.0960086916742848
Lasso R-squared: -0.6290476013205848


In [22]:

#Split the data again with a different seed to see how much the result depends on the split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=2)
#CART
#random_state pins the tree's random tie-breaking between equally good splits,
#so the score printed below no longer changes from run to run.
dtree = DecisionTreeRegressor(random_state=0)
dtree.fit(X_train,y_train)
print('CART R-squared:', dtree.score(X_test, y_test))
#Lasso
lass = Lasso()
lass.fit(X_train,y_train)
print('Lasso R-squared:', lass.score(X_test, y_test))
#In the recorded output CART (0.66) is better than Lasso (0.53), and both are OK on this split.
#NOTE: the recorded CART score was produced with an unseeded tree; re-run to refresh it.


CART R-squared: 0.6612896098333689
Lasso R-squared: 0.526920309135839


In [23]:

#The dataset is small, so the scores are highly sensitive to the train-test split.
#We need cross-validation to get a more stable estimate.

In [24]:

#KFold: 10 shuffled folds with a fixed seed so the fold assignment is reproducible.
#NOTE: with the 50 observations reported in the OLS summary, each test fold holds only 5 rows,
#so the per-fold R-squared values are very noisy.
kf = KFold(10,random_state=1,shuffle=True)

#Lists to store R-squared
tree_score = []
lasso_score = []

#Loop over each fold
for train_index, test_index in kf.split(features):

    #Get the training and testing sets.
    #kf.split yields row positions, so select with .iloc (position-based);
    #label-based selection is only equivalent while the index is the default 0..n-1.
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    #Train/test CART (seeded so tie-breaking between equally good splits is reproducible)
    dtree = DecisionTreeRegressor(random_state=0)
    dtree.fit(X_train,y_train)
    tree_score.append(dtree.score(X_test, y_test))

    #Train/test Lasso
    lass = Lasso()
    lass.fit(X_train,y_train)
    lasso_score.append(lass.score(X_test,y_test))

#Print average score
print('CART average R-squared:', np.mean(tree_score))
print('Lasso average R-squared:', np.mean(lasso_score))
#In the recorded run both averages are negative, i.e. poor performance for both models.
#May need to tune the hyperparameters to reduce overfitting and model sensitivity.

CART average R-squared: -0.6758852524422074
Lasso average R-squared: -0.22284465988121113


In [25]:
#Q2e
#Nested cross-validation: the outer 10-fold loop estimates test performance, and inside each
#outer training set GridSearchCV picks the hyperparameter by its own 10-fold cross-validation.

#Silence DeprecationWarnings so they do not clutter the cell output
warnings.filterwarnings("ignore", category=DeprecationWarning)

#Choose models (the tree is seeded so tie-breaking between equally good splits is reproducible)
dtree = DecisionTreeRegressor(random_state=0)
lass = Lasso()

#Choose set of hyperparams
params1 = {'min_samples_leaf':list(range(1,11))}
params2 = {'alpha':[0.001,0.01,0.1,1,10]}

#Lists to store R-squared
tree_score = []
lasso_score = []

#Kfold
kf = KFold(10,random_state=1,shuffle=True)

#Loop over each fold
for train_index, test_index in kf.split(features):

    #Get the training and testing sets.
    #kf.split yields row positions, so select with .iloc (position-based);
    #label-based selection is only equivalent while the index is the default 0..n-1.
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    #Train/test CART: tune min_samples_leaf on the training part, score the refit best model on the held-out fold
    clf = GridSearchCV(dtree, params1, cv=10)
    clf.fit(X_train,y_train)
    tree_score.append(clf.best_estimator_.score(X_test,y_test))

    #Train/test Lasso: tune alpha on the training part, score the refit best model on the held-out fold
    clf_lass = GridSearchCV(lass, params2, cv=10)
    clf_lass.fit(X_train,y_train)
    lasso_score.append(clf_lass.best_estimator_.score(X_test,y_test))

#Print average R-squared
print('CART average R-squared:', np.mean(tree_score))
print('Lasso average R-squared:', np.mean(lasso_score))
#In the recorded run Lasso scores higher than CART, but both averages are still negative,
#so neither tuned model beats always predicting the mean.

CART average R-squared: -0.4683729828561938
Lasso average R-squared: -0.023855163153866855


In [26]:

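#We see a slight improvement from d to e due to tuning our hyperparameters: in the recorded outputs the average R-squared rises by about 0.2 for both Lasso and CART, although both averages remain negative.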

In [27]:
#Q2e
#Same nested cross-validation as before, but with 3 folds instead of 10 in both the outer loop
#and the inner grid search, so every test fold contains more rows.

#Silence DeprecationWarnings so they do not clutter the cell output
warnings.filterwarnings("ignore", category=DeprecationWarning)

#Choose models (the tree is seeded so tie-breaking between equally good splits is reproducible)
dtree = DecisionTreeRegressor(random_state=0)
lass = Lasso()

#Choose set of hyperparams
params1 = {'min_samples_leaf':list(range(1,11))}
params2 = {'alpha':[0.001,0.01,0.1,1,10]}

#Lists to store R-squared
tree_score = []
lasso_score = []

#Kfold
kf = KFold(3,random_state=1,shuffle=True)

#Loop over each fold
for train_index, test_index in kf.split(features):

    #Get the training and testing sets.
    #kf.split yields row positions, so select with .iloc (position-based);
    #label-based selection is only equivalent while the index is the default 0..n-1.
    X_train, X_test = features.iloc[train_index], features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]

    #Train/test CART: tune min_samples_leaf on the training part, score the refit best model on the held-out fold
    clf = GridSearchCV(dtree, params1, cv=3)
    clf.fit(X_train,y_train)
    tree_score.append(clf.best_estimator_.score(X_test,y_test))

    #Train/test Lasso: tune alpha on the training part, score the refit best model on the held-out fold
    clf_lass = GridSearchCV(lass, params2, cv=3)
    clf_lass.fit(X_train,y_train)
    lasso_score.append(clf_lass.best_estimator_.score(X_test,y_test))

#Print average R-squared
print('CART average R-squared:', np.mean(tree_score))
print('Lasso average R-squared:', np.mean(lasso_score))
#Lasso is the best in the recorded run (0.46 vs 0.29 for CART), and both averages are now positive.
#NOTE: the recorded CART score was produced with an unseeded tree; re-run to refresh it.

CART average R-squared: 0.29152297875894045
Lasso average R-squared: 0.45891213966657435
