In [38]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from statsmodels.tools.eval_measures import mse, rmse

In [39]:
# Connection settings for the PostgreSQL server that hosts the data.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (or shared with) the notebook;
# the literals below are only fallbacks used when the variable is unset.
postgres_user = os.environ.get('HOUSEPRICES_PG_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_PG_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_PG_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_PG_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_PG_DB', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so the engine is disposed of right away.
# The try/finally guarantees the connection pool is released even when the
# query itself fails (e.g. the server is unreachable).
try:
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [40]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [41]:
# Combine the individual area measurements into one total-square-footage column.
# NOTE(review): grlivarea may already include the first- and second-floor
# areas — confirm against the data dictionary, since that would double-count
# them in this sum.
df['total_sf'] = (
    df['totalbsmtsf']
    + df['firstflrsf']
    + df['secondflrsf']
    + df['grlivarea']
)

# Log-transform (base 10) the sale price and the total area; the intent is to
# bring both closer to a normal distribution.
for raw_column in ('saleprice', 'total_sf'):
    df[raw_column + '_log'] = np.log10(df[raw_column])

continuous_columns = ['overallqual', 'total_sf_log', 'garagecars', 'fullbath', 'yearbuilt', 'yearremodadd']
categorical_columns = ['neighborhood', 'exterqual', 'kitchenqual', 'garagefinish', 'bsmtqual']

continuous_features = df[continuous_columns]
# One-hot encode the categorical columns; drop_first=True drops the first
# level of each column so the dummies are not perfectly collinear.
categorical_features = pd.get_dummies(df[categorical_columns], drop_first=True)

# Final design matrix: continuous columns followed by the dummy columns.
model_features = pd.concat([continuous_features, categorical_features], axis=1)

# OLS model

In [42]:
# Feature matrix and log10-transformed sale price used as the target.
X = model_features
Y = df['saleprice_log']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=465)

# Plain ordinary-least-squares fit on the training split.
lrm = LinearRegression()

lrm.fit(X_train, Y_train)

y_preds = lrm.predict(X_test)

# Error metrics are called as (actual, predicted), the documented argument
# order, and the percentage error takes the absolute value of the whole
# ratio. Both match the form already used in the Lasso cell. All metrics are
# on the log10 scale of the target.
print('R-squared for training set: {}'.format(lrm.score(X_train, Y_train)))
print('--------------')
print('R-squared for test set: {}'.format(lrm.score(X_test, Y_test)))
print('Mean absolute error for predictions: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for predictions: {}'.format(mse(Y_test, y_preds)))
print('Root mean square error for predictions: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error: {}'.format(np.mean(np.abs((Y_test - y_preds) / Y_test)) * 100))

R-squared for training set: 0.8708445530051916
--------------
R-squared for test set: 0.8693739485809054
Mean absolute error for predictions: 0.047173862243272816
Mean squared error for predictions: 0.004108267817578581
Root mean square error for predictions: 0.06409577066841915
Mean absolute percentage error: 0.9082009752392314


# Lasso Regression

In [43]:
from sklearn.linear_model import LassoCV

# Candidate regularisation strengths: powers of ten from 10**-10 to 10**39.
# The same grid is reused by the Ridge and ElasticNet cells.
alphas = [10**exponent for exponent in range(-10, 40)]

# Choose alpha by 5-fold cross-validation on the training split.
lasso_cv = LassoCV(alphas=alphas, cv=5)

lasso_cv.fit(X_train, Y_train)

# Predictions for both splits (only the test predictions feed the metrics below).
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

# Build the report first, then print it line by line.
report_lines = [
    "Best alpha value is: {}".format(lasso_cv.alpha_),
    "R-squared of the model in training set is: {}".format(lasso_cv.score(X_train, Y_train)),
    "-----Test set statistics-----",
    "R-squared of the model in test set is: {}".format(lasso_cv.score(X_test, Y_test)),
    "Mean absolute error of the prediction is: {}".format(mean_absolute_error(Y_test, y_preds_test)),
    "Mean squared error of the prediction is: {}".format(mse(Y_test, y_preds_test)),
    "Root mean squared error of the prediction is: {}".format(rmse(Y_test, y_preds_test)),
    "Mean absolute percentage error of the prediction is: {}".format(
        np.mean(np.abs((Y_test - y_preds_test) / Y_test)) * 100),
]
for report_line in report_lines:
    print(report_line)

Best alpha value is: 1e-05
R-squared of the model in training set is: 0.870803383081017
-----Test set statistics-----
R-squared of the model in test set is: 0.8691363512383572
Mean absolute error of the prediction is: 0.04719848826701093
Mean squared error of the prediction is: 0.004115740396787161
Root mean squared error of the prediction is: 0.06415403648085724
Mean absolute percentage error of the prediction is: 0.9086772439397826


# Ridge Regression

In [44]:
from sklearn.linear_model import RidgeCV

# Choose the ridge penalty by 5-fold cross-validation over the shared
# `alphas` grid, using the training split only.
ridge_cv = RidgeCV(alphas=alphas, cv=5)

ridge_cv.fit(X_train, Y_train)

y_preds = ridge_cv.predict(X_test)

# Error metrics are called as (actual, predicted), the documented argument
# order, and the percentage error takes the absolute value of the whole
# ratio. Both match the form already used in the Lasso cell. All metrics are
# on the log10 scale of the target.
print('Best alpha value: {}'.format(ridge_cv.alpha_))
print('R-squared for training set: {}'.format(ridge_cv.score(X_train, Y_train)))
print('--------------')
print('R-squared for test set: {}'.format(ridge_cv.score(X_test, Y_test)))
print('Mean absolute error for predictions: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for predictions: {}'.format(mse(Y_test, y_preds)))
print('Root mean square error for predictions: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error: {}'.format(np.mean(np.abs((Y_test - y_preds) / Y_test)) * 100))

Best alpha value: 0.1
R-squared for training set: 0.8708234195982012
--------------
R-squared for test set: 0.8692828713752486
Mean absolute error for predictions: 0.04716420777977098
Mean squared error for predictions: 0.004111132250429837
Root mean square error for predictions: 0.0641181117191534
Mean absolute percentage error: 0.9079348950994829


# ElasticNet Regression

In [45]:
from sklearn.linear_model import ElasticNetCV

# Choose the elastic-net penalty by 5-fold cross-validation over the shared
# `alphas` grid, using the training split only.
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)

elasticnet_cv.fit(X_train, Y_train)

y_preds = elasticnet_cv.predict(X_test)

# Error metrics are called as (actual, predicted), the documented argument
# order, and the percentage error takes the absolute value of the whole
# ratio. Both match the form already used in the Lasso cell. All metrics are
# on the log10 scale of the target.
print('Best alpha value: {}'.format(elasticnet_cv.alpha_))
print('R-squared for training set: {}'.format(elasticnet_cv.score(X_train, Y_train)))
print('--------------')
print('R-squared for test set: {}'.format(elasticnet_cv.score(X_test, Y_test)))
print('Mean absolute error for predictions: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for predictions: {}'.format(mse(Y_test, y_preds)))
print('Root mean square error for predictions: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error: {}'.format(np.mean(np.abs((Y_test - y_preds) / Y_test)) * 100))

Best alpha value: 0.0001
R-squared for training set: 0.8703539684961262
--------------
R-squared for test set: 0.8684922870973173
Mean absolute error for predictions: 0.04733219675181365
Mean squared error for predictions: 0.00413599660107677
Root mean square error for predictions: 0.06431171433787759
Mean absolute percentage error: 0.9110973084685327


According to the results, all four models perform essentially the same on the test set, with a very slight edge to the OLS model on R-squared and RMSE.