In [9]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

# Display preferences.
%matplotlib inline
pd.options.display.float_format = '{:.3f}'.format

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
hp_df = pd.read_sql_query('select * from houseprices',con=engine)

# no need for an open connection, as we're only doing a single query
engine.dispose()

## OLS and Test/Train Split

In [10]:
# Predictor columns that make up the feature set.
feature_cols = [
    'grlivarea', 'overallqual', 'garagecars', 'fullbath',
    'fireplaces', 'yearbuilt', 'yearremodadd', 'totrmsabvgrd',
]

# X holds the features, Y the target (sale price).
X = hp_df[feature_cols]
Y = hp_df['saleprice']

# Create scikit-learn's LinearRegression and estimate its coefficients by
# OLS on every row; fit() returns the fitted estimator itself.
lrm = linear_model.LinearRegression().fit(X, Y)

# Show the fitted parameters.
print('\nCoefficients: \n', lrm.coef_)
print('\nIntercept: \n', lrm.intercept_)


Coefficients: 
 [   58.64977899 20526.59671243 15617.48454101 -8280.94448954
 11563.95865355   344.45117267   298.66280008  -910.88281567]

Intercept: 
 -1320759.3485210259


In [11]:
# Hold out 20% of the rows as a test set; the fixed seed makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=465,
)

In [12]:
# Row counts of the two partitions, as a quick sanity check on the split.
n_train = len(X_train)
n_test = len(X_test)

print("The number of observations in training set is {}".format(n_train))
print("The number of observations in test set is {}".format(n_test))

The number of observations in training set is 1168
The number of observations in test set is 292


In [13]:
# Build the design matrix with an explicit constant column so the OLS fit
# below includes an intercept. It gets its own name so that X_train
# remains the plain feature frame and this cell can be re-run safely.
X_train_const = sm.add_constant(X_train)

# We fit an OLS model on the training split using statsmodels
results = sm.OLS(y_train, X_train_const).fit()

# We print the summary results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:              saleprice   R-squared:                       0.762
Model:                            OLS   Adj. R-squared:                  0.760
Method:                 Least Squares   F-statistic:                     463.5
Date:                Mon, 24 Jun 2019   Prob (F-statistic):               0.00
Time:                        11:30:04   Log-Likelihood:                -13988.
No. Observations:                1168   AIC:                         2.799e+04
Df Residuals:                    1159   BIC:                         2.804e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         -1.26e+06   1.48e+05     -8.532   

## KNN Regression

In [14]:
# Re-create the train/test split with the same seed, so this section starts
# from freshly built X_train / X_test regardless of what earlier cells did
# to those names.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=465,
)

In [15]:
# NOTE: from here on the notebook-level names X and Y refer to the training
# split only, not the full data set; cells below see them that way.
X, Y = X_train, y_train

# 15-nearest-neighbours regressor fitted on the training rows
# (fit() hands back the fitted estimator).
knn = neighbors.KNeighborsRegressor(n_neighbors=15).fit(X, Y)

# Points to predict for: the held-out test features.
T = X_test

# A trailing underscore is a common convention for predicted values.
Y_ = knn.predict(T)

In [16]:
# 5-fold cross-validation of the KNN regressor on the training split.
# The split is passed explicitly instead of through the notebook-level
# X / Y names, so the result does not depend on how those were last bound.
score = cross_val_score(knn, X_train, y_train, cv=5)

# With no `scoring` argument, cross_val_score uses the estimator's own
# score method, which for a regressor is R^2 -- not an accuracy.
# Reported as the mean over folds +/- two standard deviations.
print("Cross-validated R^2: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))


Unweighted Accuracy: 0.63 (+/- 0.05)


## Comparison

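The OLS model seems to be superior, with an $R^2$ value of 0.76, compared to 0.63 for the KNN regression. Note, however, that the two figures are not measured the same way: the OLS value is the in-sample $R^2$ on the training set, whereas the KNN value is the mean 5-fold cross-validated $R^2$ on the training set. In-sample fit is optimistic, so the gap probably overstates the advantage of OLS; scoring both models on the held-out test set (or cross-validating both) would give a fair comparison.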