In [1]:
# Dataset switch read by the loading/splitting helpers:
# True -> Kaggle train/test CSVs, False -> the single Ames CSV.
KAGGLE = False

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import sklearn.preprocessing

from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, StratifiedKFold,KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import catboost as cb

from xgboost import XGBRegressor
import pickle


In [3]:
# Allow up to 300 rows and 300 columns in displayed frames before pandas truncates.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300

In [4]:
def load_data(tree=None):
    """Read the modelling dataset into a single DataFrame.

    When the module-level ``KAGGLE`` flag is true, ``data/train.csv`` and
    ``data/test.csv`` are read and stacked (train rows first) under a fresh
    0..n-1 index. Otherwise the single Ames CSV in ``./data`` is read; the
    original author's note says that file is already cleaned (see the
    appendix files).

    Parameters
    ----------
    tree : any, optional
        Unused. Kept only so existing calls that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        The loaded (and, in Kaggle mode, concatenated) data.
    """
    if KAGGLE:
        train = pd.read_csv('data/train.csv')
        test = pd.read_csv('data/test.csv')
        # DataFrame.append was removed in pandas 2.0; pd.concat with
        # ignore_index=True builds the same stacked frame.
        return pd.concat([train, test], ignore_index=True)
    return pd.read_csv('./data/Ames_HousePriceLocsDistNoDpl.csv')


alldata = load_data()

In [5]:
def label_encode():
    """Replace every object-dtype column of the global ``alldata`` with integer codes.

    A single LabelEncoder instance is re-fitted on each object column in turn
    and the column is overwritten in place with the fitted codes. Columns of
    any other dtype are left untouched.
    """
    from sklearn.preprocessing import LabelEncoder

    encoder = LabelEncoder()
    # Collect the object columns first; overwriting one column does not
    # affect the dtype of the others.
    object_columns = [
        name for name in alldata.columns if alldata[name].dtype == object
    ]
    for name in object_columns:
        alldata[name] = encoder.fit_transform(alldata[name])


label_encode()

In [6]:
def split_into_train_test(alldata, kaggle=None):
    """Separate the ``SalePrice`` target from the features and split both.

    Parameters
    ----------
    alldata : pandas.DataFrame
        Frame holding a ``SalePrice`` column plus the feature columns.
    kaggle : bool, optional
        Overrides the module-level ``KAGGLE`` flag when given. The default
        (``None``) reads ``KAGGLE``, as the original function did.

    Returns
    -------
    X_train, X_test, y_train, y_test
        In Kaggle mode the train parts are the rows with a ``SalePrice`` and
        the test parts are the rows without one (so ``y_test`` is all NaN).
        Otherwise an 80/20 ``train_test_split`` with ``random_state=42``.

    Raises
    ------
    KeyError
        If ``alldata`` has no ``SalePrice`` column.
    """
    use_kaggle = KAGGLE if kaggle is None else kaggle
    y = alldata['SalePrice']
    X = alldata.drop(['SalePrice'], axis=1)
    if use_kaggle:
        # The original branch called an undefined split_train_test() with an
        # undefined num_train, so it could only raise NameError. Rows that
        # lack a SalePrice are treated as the unlabeled test rows instead.
        # NOTE(review): assumes test.csv carries no SalePrice values and
        # train.csv has none missing - confirm against the Kaggle files.
        labelled = y.notna()
        return X[labelled], X[~labelled], y[labelled], y[~labelled]
    # Imported lazily: only this branch needs scikit-learn.
    from sklearn.model_selection import train_test_split
    return train_test_split(X, y, test_size=0.2, random_state=42)
# Unpack the feature/target pieces of the split; the evaluation cells below read these names.
X_train, X_test, y_train, y_test = split_into_train_test(alldata)  

In [7]:
from catboost import CatBoostRegressor

# Restore the three previously saved regressors from ./models.
from_file = CatBoostRegressor()
cbm = from_file.load_model('./models/model_cb_locs_dist_nodpl')

# NOTE: pickle.load can execute arbitrary code, so only load model files
# from a trusted source. The `with` blocks close the file handles that the
# original bare open() calls leaked.
with open('./models/model_rf_locs_dist_nodpl', 'rb') as model_file:
    rf = pickle.load(model_file)
with open('./models/model_xgb_locs_dist_nodpl', 'rb') as model_file:
    xgb = pickle.load(model_file)

### CatBoost

In [8]:
# CatBoost on the test split: report RMSE and R^2.
from sklearn.metrics import r2_score

pred = cbm.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Testing performance')
print('RMSE: {:.5f}'.format(rmse), 'R2: {:.5f}'.format(r2), sep='\n')

Testing performance
RMSE: 21380.97680
R2: 0.92275


In [10]:
from sklearn.metrics import r2_score

# Score CatBoost on the training split (X_train / y_train) so the result can
# be compared with the test-split numbers to gauge overfitting.
pred = cbm.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, pred))
r2 = r2_score(y_train, pred)
# These are training-split metrics; the original heading wrongly said "Testing".
print('Training performance')
print('RMSE: {:.5f}'.format(rmse))
print('R2: {:.5f}'.format(r2))

Testing performance
RMSE: 5164.75691
R2: 0.99517


### Random Forest

In [16]:
# Random forest (rf) on the test split: report RMSE and R^2.
from sklearn.metrics import r2_score

pred = rf.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Testing performance')
print('RMSE: {:.5f}'.format(rmse), 'R2: {:.5f}'.format(r2), sep='\n')

Testing performance
RMSE: 23494.85731
R2: 0.90672


In [17]:
from sklearn.metrics import r2_score

# Score the random forest on the training split (X_train / y_train) so the
# result can be compared with the test-split numbers to gauge overfitting.
pred = rf.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, pred))
r2 = r2_score(y_train, pred)
# These are training-split metrics; the original heading wrongly said "Testing".
print('Training performance')
print('RMSE: {:.5f}'.format(rmse))
print('R2: {:.5f}'.format(r2))

Testing performance
RMSE: 9376.04747
R2: 0.98409


### XGBoost

In [18]:
# XGBoost (xgb) on the test split: report RMSE and R^2.
from sklearn.metrics import r2_score

pred = xgb.predict(X_test)
r2 = r2_score(y_test, pred)
rmse = np.sqrt(mean_squared_error(y_test, pred))
print('Testing performance')
print('RMSE: {:.5f}'.format(rmse), 'R2: {:.5f}'.format(r2), sep='\n')

Testing performance
RMSE: 20098.27249
R2: 0.93174


In [19]:
from sklearn.metrics import r2_score

# Score XGBoost on the training split (X_train / y_train) so the result can
# be compared with the test-split numbers to gauge overfitting.
pred = xgb.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, pred))
r2 = r2_score(y_train, pred)
# These are training-split metrics; the original heading wrongly said "Testing".
print('Training performance')
print('RMSE: {:.5f}'.format(rmse))
print('R2: {:.5f}'.format(r2))

Testing performance
RMSE: 2754.65451
R2: 0.99863


- All models overfit, as expected: the gap between train and test R^2 ranges from roughly 6.7 to 7.7 percentage points. Given the scenario of predicting a house price, this amount of overfitting is arguably acceptable. XGBoost, the most heavily used model, overfits by about 6.7 points, while random forest overfits by about 7.7 points, but little importance