In [1]:
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import scipy.stats as stats
%matplotlib inline

In [2]:
# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to be edited into the notebook; the
# fallbacks are the values this cell previously hardcoded, so the notebook
# still runs unchanged when no variable is set.
# NOTE(review): a password literal is still present as a fallback. If this
# account is not meant to be public, remove the default and require
# HOUSEPRICES_DB_PASSWORD to be set.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PASSWORD', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

# Percent-encode the password so characters such as '@', '/' or ':' cannot be
# misread as URL delimiters when the connection string is parsed.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, quote(postgres_pw, safe=''), postgres_host, postgres_port, postgres_db))

# Only a single query is needed, so the engine is disposed straight away;
# the finally clause releases it even when the query raises.
try:
    df = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    engine.dispose()

In [3]:
# Display the first rows of the loaded table (head() shows five by default).
df.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Combined square footage: start from the above-grade living area and add the
# remaining area columns one at a time, left to right.
# NOTE(review): grlivarea presumably already contains the first/second floor
# areas, in which case they are counted twice here — confirm this is intended.
extra_area_columns = ['totalbsmtsf', 'firstflrsf', 'secondflrsf', 'wooddecksf', 'openporchsf']
total_sf = df['grlivarea']
for area_column in extra_area_columns:
    total_sf = total_sf + df[area_column]
df['total_sf'] = total_sf

# Bathroom count in which every half bath contributes 0.5.
half_bath_weight = 0.5
df['total_bathrooms'] = (
    df['fullbath']
    + half_bath_weight * df['halfbath']
    + df['bsmtfullbath']
    + half_bath_weight * df['bsmthalfbath']
)

# Interaction term: overall quality multiplied by the combined square footage.
df['int_totalsf_qual'] = df['overallqual'] * df['total_sf']

In [6]:
# Put the sale price and the size-by-quality interaction on a base-10 log scale.
df['log_saleprice'] = df['saleprice'].pipe(np.log10)
df['log_int_totalsf_qual'] = df['int_totalsf_qual'].pipe(np.log10)

# Numeric predictors used as-is, and the log-price target.
continuous_columns = [
    'log_int_totalsf_qual',
    'garagecars',
    'total_bathrooms',
    'yearbuilt',
    'yearremodadd',
]
continuous_features = df[continuous_columns]
target = df['log_saleprice']

In [7]:
# Indicator-encode the categorical predictors; drop_first=True omits the first
# level of each variable so it acts as the reference category.
# NOTE(review): get_dummies does not create a column for missing values by
# default, so if any of these columns contain NaN those rows encode exactly
# like the dropped reference level — verify that is acceptable.
categorical_columns = ['exterqual', 'bsmtqual', 'kitchenqual', 'garagefinish', 'neighborhood']
categorical_features = pd.get_dummies(df[categorical_columns], drop_first=True)

# Place the numeric and indicator columns side by side in one design matrix.
model_features = pd.concat(
    [continuous_features, categorical_features],
    axis='columns',
)

In [8]:
# Short aliases for the design matrix and the target used by the model cells.
X, Y = model_features, target

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=200,
)
X_train, X_test, Y_train, Y_test = split_parts

# OLS model

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

# Fit ordinary least squares on the training split and predict the test split.
lrm = LinearRegression()

lrm.fit(X_train, Y_train)

y_preds = lrm.predict(X_test)

# Y is log10(saleprice), so every error below is expressed in log10 units
# (and the percentage error is relative to the log price), not in dollars.
# Metric calls pass the true values first and the predictions second, which
# is the argument order scikit-learn documents; the printed values are the
# same as before because these metrics are symmetric in their arguments.
print('R-squared for training set: {}'.format(lrm.score(X_train, Y_train)))
print('---------------------------')
print('R_squared for test set: {}'.format(lrm.score(X_test, Y_test)))
print('Mean absolute error for test set: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for test set: {}'.format(mse(Y_test, y_preds)))
print('Root mean squared error for test set: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error for test set: {}'.format(np.mean(np.abs(Y_test - y_preds) / Y_test) * 100))

R-squared for training set: 0.8781564824632325
---------------------------
R_squared for test set: 0.8611604037591601
Mean absolute error for test set: 0.0454075206092834
Mean squared error for test set: 0.004216450405464482
Root mean squared error for test set: 0.0649342005838563
Mean absolute percentage error for test set: 0.8706591172689219


# KNN model (n_neighbors=7, weighted)

In [42]:
from sklearn.neighbors import KNeighborsRegressor

# Distance-weighted KNN with 7 neighbours, fit on the training split.
# NOTE(review): X reaches this cell without any scaling step, so columns with
# large numeric ranges will dominate the neighbour distances — consider
# standardising the features before fitting a distance-based model.
knn = KNeighborsRegressor(n_neighbors=7, weights='distance')

knn.fit(X_train, Y_train)

y_preds = knn.predict(X_test)

# NOTE(review): with weights='distance' a training row is presumably its own
# zero-distance neighbour, which would explain a training R-squared near 1;
# judge this model by its test-set scores.
# Y is log10(saleprice), so the errors below are in log10 units.
# Metric calls pass the true values first and the predictions second, which
# is the argument order scikit-learn documents; the printed values are the
# same as before because these metrics are symmetric in their arguments.
print('R-squared for training set: {}'.format(knn.score(X_train, Y_train)))
print('---------------------------')
print('R_squared for test set: {}'.format(knn.score(X_test, Y_test)))
print('Mean absolute error for test set: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for test set: {}'.format(mse(Y_test, y_preds)))
print('Root mean squared error for test set: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error for test set: {}'.format(np.mean(np.abs(Y_test - y_preds) / Y_test) * 100))

R-squared for training set: 0.9999574021317116
---------------------------
R_squared for test set: 0.7074276716759087
Mean absolute error for test set: 0.06777758995370604
Mean squared error for test set: 0.008885193747249838
Root mean squared error for test set: 0.09426130567337712
Mean absolute percentage error for test set: 1.2953973286565525


# KNN model (n_neighbors=3, unweighted)

In [43]:
# Unweighted KNN with 3 neighbours, fit on the training split.
# NOTE(review): X reaches this cell without any scaling step, so columns with
# large numeric ranges will dominate the neighbour distances — consider
# standardising the features before fitting a distance-based model.
knn = KNeighborsRegressor(n_neighbors=3)

knn.fit(X_train, Y_train)

y_preds = knn.predict(X_test)

# Y is log10(saleprice), so the errors below are in log10 units.
# Metric calls pass the true values first and the predictions second, which
# is the argument order scikit-learn documents; the printed values are the
# same as before because these metrics are symmetric in their arguments.
print('R-squared for training set: {}'.format(knn.score(X_train, Y_train)))
print('---------------------------')
print('R_squared for test set: {}'.format(knn.score(X_test, Y_test)))
print('Mean absolute error for test set: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for test set: {}'.format(mse(Y_test, y_preds)))
print('Root mean squared error for test set: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error for test set: {}'.format(np.mean(np.abs(Y_test - y_preds) / Y_test) * 100))

R-squared for training set: 0.824403493925931
---------------------------
R_squared for test set: 0.6965764895817144
Mean absolute error for test set: 0.06820756273228105
Mean squared error for test set: 0.009214735696230068
Root mean squared error for test set: 0.09599341485867699
Mean absolute percentage error for test set: 1.304888807066875


# KNN model (n_neighbors=10, unweighted)

In [48]:
# Unweighted KNN with 10 neighbours, fit on the training split.
# NOTE(review): X reaches this cell without any scaling step, so columns with
# large numeric ranges will dominate the neighbour distances — consider
# standardising the features before fitting a distance-based model.
knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train, Y_train)

y_preds = knn.predict(X_test)

# Y is log10(saleprice), so the errors below are in log10 units.
# Metric calls pass the true values first and the predictions second, which
# is the argument order scikit-learn documents; the printed values are the
# same as before because these metrics are symmetric in their arguments.
print('R-squared for training set: {}'.format(knn.score(X_train, Y_train)))
print('---------------------------')
print('R_squared for test set: {}'.format(knn.score(X_test, Y_test)))
print('Mean absolute error for test set: {}'.format(mean_absolute_error(Y_test, y_preds)))
print('Mean squared error for test set: {}'.format(mse(Y_test, y_preds)))
print('Root mean squared error for test set: {}'.format(rmse(Y_test, y_preds)))
print('Mean absolute percentage error for test set: {}'.format(np.mean(np.abs(Y_test - y_preds) / Y_test) * 100))

R-squared for training set: 0.6854836985430106
---------------------------
R_squared for test set: 0.6809649168082899
Mean absolute error for test set: 0.07101160011193995
Mean squared error for test set: 0.009688847002605947
Root mean squared error for test set: 0.09843194096738084
Mean absolute percentage error for test set: 1.3576309922066923


# Summary
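The OLS model performs better than every KNN model on every test-set metric. It does not appear to be overfit: its training R-squared (0.878) is close to its test R-squared (0.861). None of the metrics are outstanding, but they are satisfactory. Note that all error metrics are computed on log10(saleprice), not on dollar amounts.<br>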
<br>
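None of the KNN models performed nearly as well as the OLS model. The distance-weighted model (k=7) appears to be extremely overfit: its training R-squared is almost 1.0, while its test R-squared is only 0.71 (a near-perfect training score is expected with distance weighting, because each training point is its own closest neighbor). Lower k values also result in overfitting (k=3, unweighted: training R-squared of 0.82 versus test R-squared of 0.70). The unweighted k=10 model was not overfit; however, all of its metrics were much worse than those of our OLS model.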