In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from statsmodels.tools.eval_measures import mse, rmse
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

import os

# Connection settings for the PostgreSQL database that holds the house-price
# table. Each value can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the defaults reproduce the
# values this notebook was originally written with.
postgres_user = os.environ.get('HOUSEPRICES_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_PG_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_PG_DB', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Load the whole table into a DataFrame. The engine is disposed in `finally`
# so pooled connections are released even when the query fails.
try:
    hprice = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [2]:
# One-hot encode `street` a single time, dropping the first level so the
# dummies are not collinear with the intercept. `dtype=int` keeps the design
# matrix numeric: pandas versions that default to boolean dummies would
# otherwise hand statsmodels an object-dtype array.
street_dummies = pd.get_dummies(hprice['street'], drop_first=True, dtype=int)
dummy_columns = list(street_dummies.columns)

# Append only the dummy columns that hprice does not already contain, so that
# re-running this cell does not create duplicate columns.
missing_dummies = street_dummies.loc[:, ~street_dummies.columns.isin(hprice.columns)]
hprice = pd.concat([hprice, missing_dummies], axis=1)

# Engineered features: total square footage (basement + first + second floor)
# and its interaction with overall quality.
hprice['totalsf'] = hprice['totalbsmtsf'] + hprice['firstflrsf'] + hprice['secondflrsf']
hprice['int_over_sf'] = hprice['totalsf'] * hprice['overallqual']

# Feature matrix and target used by the models below.
X = hprice[['overallqual', 'grlivarea', 'garagecars', 'garagearea', 'totalsf', 'int_over_sf'] + dummy_columns]
Y = hprice['saleprice']

# statsmodels does not add an intercept on its own, so add a constant column.
X = sm.add_constant(X)

results = sm.OLS(Y, X).fit()

print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              saleprice   R-squared:                       0.773
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     704.9
Date:                Wed, 20 Nov 2019   Prob (F-statistic):               0.00
Time:                        17:31:35   Log-Likelihood:                -17463.
No. Observations:                1460   AIC:                         3.494e+04
Df Residuals:                    1452   BIC:                         3.498e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const       -2.113e+04   1.91e+04     -1.104      

The R-squared and the adjusted R-squared of the model are 0.773 and 0.772 respectively. Hence, according to the R-squared, around 22.7% of the variance in the target variable is unexplained by the model. The AIC and BIC scores are 34940 and 34980 respectively.

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=465,
)

# Report how many rows landed in each split.
n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print("The number of observations in training set is {}".format(n_train_rows))
print("The number of observations in test set is {}".format(n_test_rows))

The number of observations in training set is 1168
The number of observations in test set is 292


In [7]:
# Baseline: ordinary least squares fitted on the training split.
lrm = LinearRegression().fit(X_train, y_train)

# Predictions for both splits (later model cells reuse these variable names).
y_preds_train = lrm.predict(X_train)
y_preds_test = lrm.predict(X_test)

# Test-set metrics, paired with the message each one is reported under.
ols_test_report = (
    ("R-squared of the model in the test set is: {}",
     lrm.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}",
     mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}",
     mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}",
     rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
)

print("R-squared of the model in the training set is: {}".format(lrm.score(X_train, y_train)))
print("-----Test set statistics-----")
for message, value in ols_test_report:
    print(message.format(value))

R-squared of the model in the training set is: 0.7640233922522901
-----Test set statistics-----
R-squared of the model in the test set is: 0.7931114118523199
Mean absolute error of the prediction is: 24698.238598022544
Mean squared error of the prediction is: 1388988758.16082
Root mean squared error of the prediction is: 37269.13948779633
Mean absolute percentage error of the prediction is: 15.094458323173965


In [19]:
# Candidate regularisation strengths: powers of ten from 1e-10 up to 1e39.
# The ridge and elastic-net cells reuse this grid.
alphas = [np.power(10.0, exponent) for exponent in range(-10, 40)]

# Lasso with the penalty strength chosen by cross-validation over `alphas`.
lassoregr = LassoCV(alphas=alphas)
lassoregr.fit(X_train, y_train)

# Predictions for both splits.
y_preds_train = lassoregr.predict(X_train)
y_preds_test = lassoregr.predict(X_test)

# Test-set metrics, paired with the message each one is reported under.
lasso_test_report = (
    ("R-squared of the model on the test set is: {}",
     lassoregr.score(X_test, y_test)),
    ("Mean absolute error of the prediction is: {}",
     mean_absolute_error(y_test, y_preds_test)),
    ("Mean squared error of the prediction is: {}",
     mse(y_test, y_preds_test)),
    ("Root mean squared error of the prediction is: {}",
     rmse(y_test, y_preds_test)),
    ("Mean absolute percentage error of the prediction is: {}",
     np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100),
)

print("Best alpha value is: {}".format(lassoregr.alpha_))
print("R-squared of the model on the training set is: {}".format(lassoregr.score(X_train, y_train)))
print("-----Test set statistics-----")
for message, value in lasso_test_report:
    print(message.format(value))


Best alpha value is: 10.0
R-squared of the model on the training set is: 0.7640193164122014
-----Test set statistics-----
R-squared of the model on the test set is: 0.7933648570959095
Mean absolute error of the prediction is: 24681.002964869473
Mean squared error of the prediction is: 1387287201.795111
Root mean squared error of the prediction is: 37246.304538774195
Mean absolute percentage error of the prediction is: 15.07294353064981


In [20]:
# Ridge regression with the penalty strength chosen by cross-validation over
# the `alphas` grid defined in the Lasso cell.
ridgeregr = RidgeCV(alphas=alphas)
ridgeregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridgeregr.predict(X_train)
y_preds_test = ridgeregr.predict(X_test)

# Report the alpha selected by *this* model (the original cell printed the
# Lasso model's alpha by mistake).
print("Best alpha value is: {}".format(ridgeregr.alpha_))
print("R-squared of the model on the training set is: {}".format(ridgeregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(ridgeregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


Best alpha value is: 10.0
R-squared of the model on the training set is: 0.7500467061909435
-----Test set statistics-----
R-squared of the model on the test set is: 0.8175580007939539
Mean absolute error of the prediction is: 22654.346699015332
Mean squared error of the prediction is: 1224861594.2639413
Root mean squared error of the prediction is: 34998.02271934718
Mean absolute percentage error of the prediction is: 13.517278152484982


In [22]:
# Elastic net with the penalty strength chosen by cross-validation over the
# `alphas` grid defined in the Lasso cell.
elasticregr = ElasticNetCV(alphas=alphas)
elasticregr.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticregr.predict(X_train)
y_preds_test = elasticregr.predict(X_test)

# Report the alpha selected by *this* model (the original cell printed the
# Lasso model's alpha by mistake).
print("Best alpha value is: {}".format(elasticregr.alpha_))
print("R-squared of the model on the training set is: {}".format(elasticregr.score(X_train, y_train)))
print("-----Test set statistics-----")
print("R-squared of the model on the test set is: {}".format(elasticregr.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))


Best alpha value is: 10.0
R-squared of the model on the training set is: 0.7538899894760976
-----Test set statistics-----
R-squared of the model on the test set is: 0.8144634315146178
Mean absolute error of the prediction is: 22882.737890213673
Mean squared error of the prediction is: 1245637616.6575954
Root mean squared error of the prediction is: 35293.59172226022
Mean absolute percentage error of the prediction is: 13.674967132791188


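According to the results, Ridge regression and ElasticNet regression perform better on the test set than the OLS and Lasso models. Ridge regression has the highest test-set R-squared and the smallest test-set error metrics (MAE, MSE, RMSE and MAPE), which means it is the best of the four models.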