In [33]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, GridSearchCV, StratifiedKFold
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [34]:
# Load dataset
ames = pd.read_csv('ames_model.csv', index_col=0)
ames.head(2)


Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,...,YrSold,SaleType,SaleCondition,LogSalePrice,Age,NewerH_SubClass,Slope_Gentle,TotalArea,RR_prox,BsmtLivArea
1,909176150,856,126000,6,2,68.52,7890,1,3,3,...,2010,9,4,5.100371,71,6,1,11023.0,0,238.0
2,905476230,1049,139500,0,2,42.0,4235,1,3,3,...,2009,9,4,5.144574,25,0,1,7753.0,0,945.0


In [35]:
ames.info()
#list(ames.columns)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2499 entries, 1 to 763
Data columns (total 78 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PID              2499 non-null   int64  
 1   GrLivArea        2499 non-null   int64  
 2   SalePrice        2499 non-null   int64  
 3   MSSubClass       2499 non-null   int64  
 4   MSZoning         2499 non-null   int64  
 5   LotFrontage      2499 non-null   float64
 6   LotArea          2499 non-null   int64  
 7   Street           2499 non-null   int64  
 8   LotShape         2499 non-null   int64  
 9   LandContour      2499 non-null   int64  
 10  LotConfig        2499 non-null   int64  
 11  Neighborhood     2499 non-null   int64  
 12  Condition1       2499 non-null   int64  
 13  BldgType         2499 non-null   int64  
 14  HouseStyle       2499 non-null   int64  
 15  OverallQual      2499 non-null   int64  
 16  OverallCond      2499 non-null   int64  
 17  YearBuilt      

In [36]:
# Split-out validation dataset
features = ['GrLivArea', 'MSSubClass', 'MSZoning', 'LotFrontage',
 'LotArea', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
  'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
 'Electrical', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
 'YrSold', 'SaleType', 'SaleCondition', 'Age', 'NewerH_SubClass',
 'Slope_Gentle', 'TotalArea', 'RR_prox', 'BsmtLivArea']

X = ames.loc[:, features]

In [37]:
# Correaltion analysis
# create correlation  matrix
corr_matrix = X.corr().abs()
corr_matrix

Unnamed: 0,GrLivArea,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,Neighborhood,...,MoSold,YrSold,SaleType,SaleCondition,Age,NewerH_SubClass,Slope_Gentle,TotalArea,RR_prox,BsmtLivArea
GrLivArea,1.000000,0.343789,0.131743,0.282676,0.244892,0.034338,0.167041,0.045263,0.057181,0.114357,...,0.054335,0.002106,0.009855,0.035500,0.220874,0.343789,0.021641,0.389053,0.012187,0.065992
MSSubClass,0.343789,1.000000,0.054675,0.259298,0.147163,0.016330,0.043648,0.020385,0.078343,0.026979,...,0.039654,0.014708,0.058924,0.042356,0.183773,1.000000,0.029425,0.170248,0.056455,0.167891
MSZoning,0.131743,0.054675,1.000000,0.141304,0.062024,0.021984,0.149782,0.053961,0.005741,0.277132,...,0.022333,0.019640,0.036822,0.077418,0.426844,0.054675,0.029499,0.091612,0.014615,0.109920
LotFrontage,0.282676,0.259298,0.141304,1.000000,0.347366,0.052564,0.124278,0.021259,0.176080,0.113107,...,0.011027,0.001421,0.001170,0.003403,0.081749,0.259298,0.055664,0.385012,0.019214,0.134489
LotArea,0.244892,0.147163,0.062024,0.347366,1.000000,0.223488,0.186035,0.114533,0.142065,0.006051,...,0.018048,0.022824,0.009305,0.014520,0.008656,0.147163,0.240206,0.986004,0.032146,0.154151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NewerH_SubClass,0.343789,1.000000,0.054675,0.259298,0.147163,0.016330,0.043648,0.020385,0.078343,0.026979,...,0.039654,0.014708,0.058924,0.042356,0.183773,1.000000,0.029425,0.170248,0.056455,0.167891
Slope_Gentle,0.021641,0.029425,0.029499,0.055664,0.240206,0.106296,0.076277,0.403907,0.021911,0.104761,...,0.021829,0.005940,0.042429,0.010311,0.029514,0.029425,1.000000,0.231854,0.017057,0.138864
TotalArea,0.389053,0.170248,0.091612,0.385012,0.986004,0.206530,0.213113,0.110327,0.145293,0.034491,...,0.026119,0.020421,0.008116,0.004150,0.070468,0.170248,0.231854,1.000000,0.026241,0.188348
RR_prox,0.012187,0.056455,0.014615,0.019214,0.032146,0.085882,0.032543,0.014522,0.024464,0.027235,...,0.002351,0.029778,0.035160,0.012282,0.037310,0.056455,0.017057,0.026241,1.000000,0.027383


In [38]:
# select upper traingle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

In [39]:
# Find index of columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
to_drop

['NewerH_SubClass', 'TotalArea']

In [40]:
# Split-out validation dataset
features = ['GrLivArea', 'MSSubClass', 'MSZoning', 'LotFrontage',
 'LotArea', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
 'Condition1', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
  'YearRemodAdd', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation',
 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
 'Electrical', '1stFlrSF', '2ndFlrSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
 'YrSold', 'SaleType', 'SaleCondition', 'Age', 'Slope_Gentle', 'RR_prox', 'BsmtLivArea']

X = ames.loc[:, features]
#Y = ames.loc[:, ['SalePrice']]
y = ames.loc[:, ['LogSalePrice']]

scaler = StandardScaler()  #scaler object

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22)  #splitting into training and test
kfold = KFold(n_splits=10, random_state=99, shuffle=True)   #kfold cross validation object with 10 splits

X_train = scaler.fit_transform(X_train)  #scaling training set
X_test = scaler.transform(X_test)  #scaling test set


### Linear Regression

In [41]:
# run linear regression
reg = LinearRegression() 
start = datetime.now()
reg.fit(X_train, y_train)
pred = reg.predict(X_test)
stop = datetime.now()
delta = stop - start

print('Linear Regression\n')
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

linear_reg = ('Linear Regression', r2, err, seconds)

Linear Regression

R2:  0.9147080695613219
Root Mean Squared Error:  0.0452996537210795
Time to compute:  0.007017 seconds


In [42]:
#display regression coefficients and R-squared value of model
print(reg.intercept_, reg.coef_, reg.score(X, y))

[5.21098107] [[ 2.22698078e-02  5.86922760e-03 -1.06977719e-02  2.73254228e-03
   7.98650842e-03  2.68131521e-03 -1.53971274e-03 -8.15430885e-04
  -5.29925246e-04 -2.59993908e-03  1.09625151e-02 -2.78607234e-03
  -2.95612891e-03  3.49873155e-02  2.01777189e-02  6.66212557e-03
   9.76920556e-04 -3.89978380e-03  6.12260431e-03  2.78582785e-03
   2.13751030e-03 -3.94671215e-03  2.44164607e-03  2.06727048e-03
  -3.11343446e-03 -3.53917841e-04 -3.42247324e-03 -2.10976760e-03
  -2.74622692e+10 -1.19842676e-03 -1.13304953e+10 -7.24522446e+09
   6.53398878e+09 -1.75269124e-05 -5.57405283e-03  8.04808034e-03
  -1.29190160e-03  3.16759835e-02  3.10121335e-02  4.46217786e-03
   6.96327386e-04  7.37255746e-04 -9.48465167e-04 -4.73439425e-03
  -6.96121585e-03 -5.70736850e-03  1.72009114e-03  8.37403329e-03
   5.39377309e-03 -3.04497023e-03 -3.90341730e-03  1.93555073e-03
  -2.17441598e-03  1.24334872e-02 -1.67213642e-03  1.17867933e-04
   6.52569693e-03  3.15166468e-03  2.62538070e-03  1.84825270e-

### K-Nearest Neighbors Regression

In [43]:
# run KNN model
params = {
    'n_neighbors': [9],  #'n_neighbors': [3,4,5,6,7,8,9,10,11,12], Number of neighbor points to consider for prediction
    'weights': ['distance'],  #'weights': ['uniform', 'distance'], weight function used in prediction
    'p': [1]  #'p': [1,2] # p=1 compute manhattan distance, p=2 compute euclidean distance
    }

knn = KNeighborsRegressor()
rs = GridSearchCV(estimator=knn, param_grid=params, cv=10, n_jobs=-1, scoring='neg_mean_squared_error')
rs.fit(X_train, y_train)
print(rs.best_estimator_)
knn = rs.best_estimator_
start = datetime.now()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
stop = datetime.now()
delta = stop - start

print('-'*30)
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

knn_reg = ('KNN', r2, err, seconds)

KNeighborsRegressor(n_neighbors=9, p=1, weights='distance')
------------------------------
R2:  0.8449950659704658
Root Mean Squared Error:  0.061067996422695586
Time to compute:  0.146547 seconds


### Decision Tree Regression¶


In [44]:
# run Decision Tree regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22) #getting not scaled data

params = {'max_depth': [7], # The maximum depth of the tree
          'max_features': ['auto', 'sqrt'], #The number of features to consider
          'min_samples_leaf': [7], # The minimum number of samples required to be at a leaf node
          'min_samples_split': [0.1], #The minimum number of samples required to split an internal node
          'criterion': ['mse'] #The function to measure the quality of a split
         }

tree = DecisionTreeRegressor()
rs = GridSearchCV(estimator=tree, param_grid=params, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
rs.fit(X_train, y_train)
print(rs.best_estimator_)

tree = rs.best_estimator_
start = datetime.now()
tree.fit(X_train, y_train)
pred = tree.predict(X_test)
stop = datetime.now()
delta = stop - start

print('-'*30)
r2 = r2_score(y_test, pred)
print('R2: ', r2)
err = np.sqrt(mean_squared_error(y_test, pred))
print('Root Mean Squared Error: ', err)
seconds = delta.seconds + delta.microseconds/1E6
print('Time to compute: ', seconds, 'seconds')

decision_tree = ('Tree', r2, err, seconds)

DecisionTreeRegressor(criterion='mse', max_depth=7, max_features='auto',
                      min_samples_leaf=7, min_samples_split=0.1)
------------------------------
R2:  0.7171121162128382
Root Mean Squared Error:  0.08249888588101263
Time to compute:  0.015615 seconds


In [45]:
# Compare scores of three ML models
df_results = pd.DataFrame([linear_reg, knn_reg, decision_tree], columns=['model', 'R2','RMSE','comp_time'])
df_results.sort_values('R2',ascending=False)

Unnamed: 0,model,R2,RMSE,comp_time
0,Linear Regression,0.914708,0.0453,0.007017
1,KNN,0.844995,0.061068,0.146547
2,Tree,0.717112,0.082499,0.015615
