In [126]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.metrics import mean_absolute_error,mean_squared_error

%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings(action="ignore")

# Assignment 6-7

## Reading Data

In [2]:
# Load the train/test feature matrices.
# File names suggest degree-2 polynomial features — TODO confirm how they were built.
X_train, X_test = (
    pd.read_csv(features_csv)
    for features_csv in (
        "house_polynomial(2)_train_features.csv",
        "house_polynomial(2)_test_features.csv",
    )
)

# Matching targets; kept as the DataFrames returned by read_csv (not flattened to Series).
y_train, y_test = (
    pd.read_csv(target_csv)
    for target_csv in ("house_train_y.csv", "house_test_y.csv")
)

## Training Models

### OLS

In [11]:
# Ordinary least squares baseline (no regularization), fitted on the training split.
ols_model = LinearRegression().fit(X_train, y_train)

# Keep predictions for both splits so later cells can compute error metrics.
ols_model_train_predict, ols_model_test_predict = (
    ols_model.predict(split_features) for split_features in (X_train, X_test)
)

In [64]:
# Evaluate the OLS baseline: train R-squared for reference, then test-set metrics.
#
# MSE/RMSE use sklearn's mean_squared_error, the same function the Ridge, Lasso
# and ElasticNet cells use, so all four reports print plain scalars. The
# statsmodels mse/rmse helpers used here previously printed one-element arrays
# (e.g. "[1.04929714e+09]") for this single-column target.
ols_test_mse = mean_squared_error(y_test, ols_model_test_predict)

print("R-squared on train set :",ols_model.score(X_train,y_train),"\n")

print("--- OLS model's statistics on test set ---\n")
print("R-squared :",ols_model.score(X_test,y_test))
print("Mean Absolute Error :",mean_absolute_error(y_test,ols_model_test_predict))
print("Mean Squared Error :",ols_test_mse)
# RMSE is the square root of the MSE computed above (same form as the other model cells).
print("Root of Mean Squared Error :",ols_test_mse**(1/2))

R-squared on train set : 0.8944050465483597 

--- OLS model's statistics on test set ---

R-squared : 0.8433659810138976
Mean Absolute Error : 21959.114854494597
Mean Squared Error : [1.04929714e+09]
Root of Mean Squared Error : [32392.8563199]


### Ridge Regularization

In [108]:
# Ridge regression; 7 is the regularization strength (Ridge's first parameter, alpha).
ridge_model = Ridge(alpha=7).fit(X_train, y_train)

# Predictions on both splits, kept for the metric report.
ridge_model_train_predict, ridge_model_test_predict = (
    ridge_model.predict(split_features) for split_features in (X_train, X_test)
)

In [109]:
# Ridge evaluation. The test MSE is computed once and reused for the RMSE row.
ridge_test_mse = mean_squared_error(y_test, ridge_model_test_predict)
ridge_test_report = (
    ("R-squared :", ridge_model.score(X_test, y_test)),
    ("Mean Absolute Error :", mean_absolute_error(y_test, ridge_model_test_predict)),
    ("Mean Squared Error :", ridge_test_mse),
    ("Root of Mean Squared Error :", ridge_test_mse ** (1 / 2)),
)

print("R-squared on train set :", ridge_model.score(X_train, y_train), "\n")

print("--- Ridge model's statistics on test set ---\n")
for metric_label, metric_value in ridge_test_report:
    print(metric_label, metric_value)

R-squared on train set : 0.8776381210845643 

--- Ridge model's statistics on test set ---

R-squared : 0.849009795372836
Mean Absolute Error : 21782.980721401927
Mean Squared Error : 1011489017.4796073
Root of Mean Squared Error : 31803.915128166333


### Lasso Regularization

In [124]:
# Lasso regression; 10**2 is the regularization strength (Lasso's first parameter, alpha).
lasso_model = Lasso(alpha=10**2).fit(X_train, y_train)

# Predictions on both splits, kept for the metric report.
lasso_model_train_predict, lasso_model_test_predict = (
    lasso_model.predict(split_features) for split_features in (X_train, X_test)
)

In [125]:
# Lasso evaluation. The test MSE is computed once and reused for the RMSE row.
lasso_test_mse = mean_squared_error(y_test, lasso_model_test_predict)
lasso_test_report = (
    ("R-squared :", lasso_model.score(X_test, y_test)),
    ("Mean Absolute Error :", mean_absolute_error(y_test, lasso_model_test_predict)),
    ("Mean Squared Error :", lasso_test_mse),
    ("Root of Mean Squared Error :", lasso_test_mse ** (1 / 2)),
)

print("R-squared on train set :", lasso_model.score(X_train, y_train), "\n")

print("--- Lasso model's statistics on test set ---\n")
for metric_label, metric_value in lasso_test_report:
    print(metric_label, metric_value)

R-squared on train set : 0.8717486370855018 

--- Lasso model's statistics on test set ---

R-squared : 0.8473475520968392
Mean Absolute Error : 22346.713415064543
Mean Squared Error : 1022624447.2394495
Root of Mean Squared Error : 31978.49976530246


### ElasticNet

In [286]:
# ElasticNet with alpha=0.07 and l1_ratio=0.9, fitted on the training split.
elasticnet_model = ElasticNet(alpha=0.07, l1_ratio=0.9).fit(X_train, y_train)

# Predictions on both splits, kept for the metric report.
elasticnet_model_train_predict, elasticnet_model_test_predict = (
    elasticnet_model.predict(split_features) for split_features in (X_train, X_test)
)

In [287]:
# ElasticNet evaluation. The test MSE is computed once and reused for the RMSE row.
elasticnet_test_mse = mean_squared_error(y_test, elasticnet_model_test_predict)
elasticnet_test_report = (
    ("R-squared :", elasticnet_model.score(X_test, y_test)),
    ("Mean Absolute Error :", mean_absolute_error(y_test, elasticnet_model_test_predict)),
    ("Mean Squared Error :", elasticnet_test_mse),
    ("Root of Mean Squared Error :", elasticnet_test_mse ** (1 / 2)),
)

print("R-squared on train set :", elasticnet_model.score(X_train, y_train), "\n")

print("--- ElasticNet model's statistics on test set ---\n")
for metric_label, metric_value in elasticnet_test_report:
    print(metric_label, metric_value)

R-squared on train set : 0.8721670481912681 

--- ElasticNet model's statistics on test set ---

R-squared : 0.8477811690395081
Mean Absolute Error : 22259.381658243226
Mean Squared Error : 1019719631.1529647
Root of Mean Squared Error : 31933.049199112895


## Conclusion

In [288]:
# Side-by-side R-squared summary: one column per fitted model, one row per split.
conc = pd.DataFrame(
    {
        model_label: [
            fitted_model.score(X_train, y_train),
            fitted_model.score(X_test, y_test),
        ]
        for model_label, fitted_model in (
            ("OLS", ols_model),
            ("Ridge", ridge_model),
            ("Lasso", lasso_model),
            ("ElasticNet", elasticnet_model),
        )
    },
    index=["Train set R-squared", "Test set R-squared"],
)

conc

Unnamed: 0,OLS,Ridge,Lasso,ElasticNet
Train set R-squared,0.894,0.878,0.872,0.872
Test set R-squared,0.843,0.849,0.847,0.848


Even though OLS has the best performance on the training set, the ridge model has the best performance on the test set. This suggests that our OLS model may have an over-fitting problem, which we overcome by using algorithms that apply regularization.