In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.metrics import mean_absolute_error,mean_squared_error

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

# Assignment 6-7

## Reading Data

In [13]:
df = pd.read_excel("feed_the_model.xlsx")

X = df[df.columns[df.columns != "total_cases"]]
Y = df["total_cases"]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

print("Data on train-set  : {}".format(X_train.shape[0]))
print("Data on test-set   : {}".format(X_test.shape[0]))

Data on train-set  : 3301
Data on test-set   : 826


## Training Models

### OLS

In [24]:
ols_model = LinearRegression()
ols_model.fit(X_train,y_train)

ols_model_train_predict = ols_model.predict(X_train)
ols_model_test_predict = ols_model.predict(X_test)

In [25]:
print("R-squared on train set :",ols_model.score(X_train,y_train),"\n")

print("--- OLS model's statistics on test set ---\n")
print("R-squared :",ols_model.score(X_test,y_test))
print("Mean Absolute Error :",mean_absolute_error(y_test,ols_model_test_predict))
print("Mean Squared Error :",mse(y_test,ols_model_test_predict))
print("Root of Mean Squared Error :",rmse(y_test,ols_model_test_predict))

R-squared on train set : 0.900456717826558 

--- OLS model's statistics on test set ---

R-squared : 0.899975302711149
Mean Absolute Error : 0.3968308026652247
Mean Squared Error : 0.330781214609967
Root of Mean Squared Error : 0.5751358227496937


### Ridge Regularization

In [46]:
ridge_model = Ridge(0.3)
ridge_model.fit(X_train,y_train)

ridge_model_train_predict = ridge_model.predict(X_train)
ridge_model_test_predict = ridge_model.predict(X_test)

In [47]:
print("R-squared on train set :",ridge_model.score(X_train,y_train),"\n")

print("--- Ridge model's statistics on test set ---\n")
print("R-squared :",ridge_model.score(X_test,y_test))
print("Mean Absolute Error :",mean_absolute_error(y_test,ridge_model_test_predict))
print("Mean Squared Error :",mean_squared_error(y_test,ridge_model_test_predict))
print("Root of Mean Squared Error :",mean_squared_error(y_test,ridge_model_test_predict)**(1/2))

R-squared on train set : 0.900437065776202 

--- Ridge model's statistics on test set ---

R-squared : 0.900017266883841
Mean Absolute Error : 0.397327837818271
Mean Squared Error : 0.33064243928357834
Root of Mean Squared Error : 0.5750151643944518


### Lasso Regularization

In [82]:
lasso_model = Lasso(0.00001)
lasso_model.fit(X_train,y_train)

lasso_model_train_predict = lasso_model.predict(X_train)
lasso_model_test_predict = lasso_model.predict(X_test)

In [83]:
print("R-squared on train set :",lasso_model.score(X_train,y_train),"\n")

print("--- Lasso model's statistics on test set ---\n")
print("R-squared :",lasso_model.score(X_test,y_test))
print("Mean Absolute Error :",mean_absolute_error(y_test,lasso_model_test_predict))
print("Mean Squared Error :",mean_squared_error(y_test,lasso_model_test_predict))
print("Root of Mean Squared Error :",mean_squared_error(y_test,lasso_model_test_predict)**(1/2))

R-squared on train set : 0.9004559257812224 

--- Lasso model's statistics on test set ---

R-squared : 0.8999838280717473
Mean Absolute Error : 0.39690164608912276
Mean Squared Error : 0.330753021281618
Root of Mean Squared Error : 0.5751113120793383


### ElasticNet

In [158]:
elasticnet_model = ElasticNet(alpha=0.0001, l1_ratio=0.1)
elasticnet_model.fit(X_train,y_train)

elasticnet_model_train_predict = elasticnet_model.predict(X_train)
elasticnet_model_test_predict = elasticnet_model.predict(X_test)

In [163]:
print("R-squared on train set     :",elasticnet_model.score(X_train,y_train),"\n")

print("--- ElasticNet model's statistics on test set ---\n")
print("R-squared                  :",elasticnet_model.score(X_test,y_test))
print("Mean Absolute Error        :",10**mean_absolute_error(y_test,elasticnet_model_test_predict))
print("Mean Squared Error         :",10**mean_squared_error(y_test,elasticnet_model_test_predict))
print("Root of Mean Squared Error :",10**mean_squared_error(y_test,elasticnet_model_test_predict)**(1/2))

R-squared on train set     : 0.9004305690391123 

--- ElasticNet model's statistics on test set ---

R-squared                  : 0.9000103571544086
Mean Absolute Error        : 2.4969460831355734
Mean Squared Error         : 2.1412397139734813
Root of Mean Squared Error : 3.7586772339927834


## Conclusion

In [160]:
conc = pd.DataFrame()

conc["OLS"] = [ols_model.score(X_train,y_train),ols_model.score(X_test,y_test)]
conc["Ridge"] = [ridge_model.score(X_train,y_train),ridge_model.score(X_test,y_test)]
conc["Lasso"] = [lasso_model.score(X_train,y_train),lasso_model.score(X_test,y_test)]
conc["ElasticNet"] = [elasticnet_model.score(X_train,y_train),elasticnet_model.score(X_test,y_test)]

conc.index = ["Train set R-squared","Test set R-squared"]

conc

Unnamed: 0,OLS,Ridge,Lasso,ElasticNet
Train set R-squared,0.9,0.9,0.9,0.9
Test set R-squared,0.9,0.9,0.9,0.9


Even though OLS has the best performance on training set, the ridge model has the best performance on test set. This means our OLS model may have over-fitting problem and we overcome that problem by algorithms that have regularization.