# Modelling

In [15]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression,Lasso
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline,Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import pickle
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from math import sqrt

import warnings
warnings.filterwarnings("ignore")

In [16]:
X_train=pd.read_csv('C:/Users/Administrator/ML/House Prices Advanced Regression Techniques/Storage/X_train_selected.csv')
X_test=pd.read_csv('C:/Users/Administrator/ML/House Prices Advanced Regression Techniques/Storage/X_test_selected.csv')
y_train=pd.read_csv('C:/Users/Administrator/ML/House Prices Advanced Regression Techniques/Storage/y_train_no_scale.csv',names = ['SalePrice'])
y_test=pd.read_csv('C:/Users/Administrator/ML/House Prices Advanced Regression Techniques/Storage/y_test_no_scale.csv',names = ['SalePrice'])

In [17]:
print(X_train.shape)
print(X_test.shape)

# to convert dataframe to series

y_train = y_train.squeeze()
y_test = y_test.squeeze()

print(y_train.shape)
print(y_test.shape)

(1314, 16)
(146, 16)
(1314,)
(146,)


In [18]:
X_test.columns

Index(['TotalSF', 'TotalBsmtSF', 'OverallQual', 'GarageCars', 'BsmtFinSF1',
       '1stFlrSF', 'YearRemodAdd', 'LotArea', 'KitchenQual', 'GrLivArea',
       'GarageFinish', 'GarageArea', 'FullBath', 'ExterQual', 'BsmtQual',
       '2ndFlrSF'],
      dtype='object')

In [39]:
X_train.dtypes

TotalSF         float64
TotalBsmtSF       int64
OverallQual       int64
GarageCars        int64
BsmtFinSF1        int64
1stFlrSF        float64
YearRemodAdd      int64
LotArea         float64
KitchenQual       int64
GrLivArea       float64
GarageFinish      int64
GarageArea        int64
FullBath          int64
ExterQual         int64
BsmtQual          int64
2ndFlrSF          int64
dtype: object

In [40]:
X_test.dtypes

TotalSF         float64
TotalBsmtSF       int64
OverallQual       int64
GarageCars        int64
BsmtFinSF1        int64
1stFlrSF        float64
YearRemodAdd      int64
LotArea         float64
KitchenQual       int64
GrLivArea       float64
GarageFinish      int64
GarageArea        int64
FullBath          int64
ExterQual         int64
BsmtQual          int64
2ndFlrSF          int64
dtype: object

In [20]:
def metric_score(y_train, y_train_pred, y_test, y_pred):
    
     # The Root mean squared error
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    print("Train : RMSE: %.4f" % rmse_train)
    
    # Explained variance score: 1 is perfect prediction
    r2_train = r2_score(y_train, y_train_pred)
    print('Train : R_2 Score: %.2f' % r2_train)
    
    # The Root mean squared error
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Test : RMSE: %.4f" % rmse_test)
    
    # Explained variance score: 1 is perfect prediction
    r2_test = r2_score(y_test, y_pred)
    print('Test : R_2 Score: %.2f' % r2_test)
    return rmse_train, r2_train, rmse_test, r2_test

## Linear Regression

In [21]:
scaler=MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [24]:
linear_regr = LinearRegression(normalize=False)

# Train the model using the training sets
linear_regr.fit(X_train_scaled, y_train)

# Make predictions using the testing set
y_pred = linear_regr.predict(X_test_scaled)
y_train_pred = linear_regr.predict(X_train_scaled)

rmse_train, r2_train, rmse_test, r2_test = metric_score(y_train, y_train_pred, y_test, y_pred)

Train : RMSE: 34849.0033
Train : R_2 Score: 0.80
Test : RMSE: 36802.3898
Test : R_2 Score: 0.81


In [25]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.tail()

Unnamed: 0,Actual,Predicted
141,190000,196545.064204
142,130500,187653.842662
143,253293,293987.275631
144,219500,185000.343918
145,139400,158924.283119


## Random Forest

In [83]:
%%time 

# Create a pipeline
pipe = make_pipeline((RandomForestRegressor()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestregressor": [RandomForestRegressor()],
                 "randomforestregressor__n_estimators": [10, 100, 200, 500],
                 "randomforestregressor__max_depth":[5,8,15,25,30,None],
                 "randomforestregressor__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestregressor__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

Wall time: 9min 3s


In [86]:
y_pred = best_model.predict(X_test)
y_train_pred = best_model.predict(X_train)

rmse_train, r2_train, rmse_test, r2_test = metric_score(y_train, y_train_pred, y_test, y_pred)

RMSE: 41034.98
R_2 Score: 0.77


In [90]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.tail()

Unnamed: 0,Actual,Predicted
141,190000,196467.581188
142,130500,150039.347089
143,253293,344693.006903
144,219500,193715.203105
145,139400,143445.900358


## Grid Search Pipelines for various Regression Alogorithms

In [28]:
%%time

# Create a pipeline
grid_pipe = Pipeline([("regressor", RandomForestRegressor())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"regressor": [LinearRegression(normalize=True)]
                },
                {"regressor": [SVR()],
                 "regressor__C":[0.001,0.1,1,10,100],
                 "regressor__kernel":['linear','rbf']
                },
                {"regressor": [xgb.XGBRegressor()],
                 "regressor__n_estimators": [80, 175, 350],
                 "regressor__learning_rate": [0.125,0.145,0.165],
                 "regressor__subsample": [0.5,0.6,0.7],
                 "regressor__max_depth":[2,3,4,5,6],
                 "regressor__min_child_weight": [3,4],
                 "regressor__colsample_bytree": [0.5,0.7,0.9]
                 },
                {"regressor": [RandomForestRegressor()],
                 "regressor__n_estimators": [100,300,500],
                 "regressor__max_depth":[5,8,15,25,None],
                 "regressor__min_samples_leaf":[1,2,5,10,15,100],
                 "regressor__max_leaf_nodes": [2,5,10]}
                 ]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(grid_pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
grid_best_model = gridsearch.fit(X_train,y_train)

Wall time: 25min 42s


In [29]:
print(grid_best_model.best_estimator_)
print("==============================================")
y_train_pred_grid = grid_best_model.predict(X_train)
y_pred_grid = grid_best_model.predict(X_test)
rmse_train_grid, r2_train_grid, rmse_test_grid, r2_test_grid = metric_score(y_train, y_train_pred_grid, y_test, y_pred_grid)

Pipeline(memory=None,
         steps=[('regressor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=0.7, gamma=0,
                              importance_type='gain', learning_rate=0.125,
                              max_delta_step=0, max_depth=6, min_child_weight=3,
                              missing=None, n_estimators=80, n_jobs=1,
                              nthread=None, objective='reg:linear',
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, seed=None, silent=None,
                              subsample=0.6, verbosity=1))],
         verbose=False)
Train : RMSE: 12535.0840
Train : R_2 Score: 0.97
Test : RMSE: 31631.9871
Test : R_2 Score: 0.86


In [67]:
data=X_train.copy()
data_test = X_test.copy()

In [68]:
data.TotalSF = data.TotalSF.astype('int64')
data['1stFlrSF'] = data['1stFlrSF'].astype('int64')
data.GrLivArea = data.GrLivArea.astype('int64')
data.LotArea = data.LotArea.astype('int64')

data_test.TotalSF = data_test.TotalSF.astype('int64')
data_test['1stFlrSF'] = data_test['1stFlrSF'].astype('int64')
data_test.GrLivArea = data_test.GrLivArea.astype('int64')
data_test.LotArea = data_test.LotArea.astype('int64')

In [78]:
y_test.head()

0    175000
1    149000
2    235000
3     55000
4    155000
Name: SalePrice, dtype: int64

In [99]:
data_test.head(5)

Unnamed: 0,TotalSF,TotalBsmtSF,OverallQual,GarageCars,BsmtFinSF1,1stFlrSF,YearRemodAdd,LotArea,KitchenQual,GrLivArea,GarageFinish,GarageArea,FullBath,ExterQual,BsmtQual,2ndFlrSF
0,1491,1484,5,2,998,7,1971,9,5,7,3,487,2,5,6,0
1,1582,1042,5,1,0,6,1950,8,5,7,3,225,1,5,5,534
2,1597,795,7,2,795,7,1994,9,6,7,5,539,2,5,5,795
3,725,0,4,0,0,5,1950,8,4,6,0,0,2,1,0,720
4,1554,804,6,2,556,6,1976,7,6,7,3,440,2,5,6,744


In [79]:
xg_reg = xgb.XGBRegressor(colsample_bytree=0.7,learning_rate=0.125,max_depth=6, min_child_weight=3,
                               n_estimators=80, n_jobs=1,subsample=0.6)

xgb_best_model = xg_reg.fit(X_train.values,y_train.values)

y_train_pred_xgb = xgb_best_model.predict(X_train.values)
y_pred_xgb = xgb_best_model.predict(X_test.values)

rmse_train_grid, r2_train_grid, rmse_test_grid, r2_test_grid = metric_score(y_train, y_train_pred_xgb, y_test, y_pred_xgb)

Train : RMSE: 12535.0840
Train : R_2 Score: 0.97
Test : RMSE: 31631.9871
Test : R_2 Score: 0.86


In [97]:
X_test.loc[0]

TotalSF         1491.302496
TotalBsmtSF     1484.000000
OverallQual        5.000000
GarageCars         2.000000
BsmtFinSF1       998.000000
1stFlrSF           7.302496
YearRemodAdd    1971.000000
LotArea            9.412710
KitchenQual        5.000000
GrLivArea          7.302496
GarageFinish       3.000000
GarageArea       487.000000
FullBath           2.000000
ExterQual          5.000000
BsmtQual           6.000000
2ndFlrSF           0.000000
Name: 0, dtype: float64

In [95]:
data_1 = [1491.302496,1484.000000,5.000000,2.000000, 998.000000,7,1971.000000,9.412710,5.000000,7.302496,3.000000,487.000000,2.000000,5.000000,6.000000,0.000000]
xgb_best_model.predict([np.array(data_1)])

array([169217.16], dtype=float32)

In [89]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_xgb})
df.head()

Unnamed: 0,Actual,Predicted
0,175000,164974.921875
1,149000,124751.359375
2,235000,224504.015625
3,55000,56766.65625
4,155000,172302.90625


In [81]:
filename = 'xgb_model.pkl'
pickle.dump(xgb_best_model, open("./Storage/"+filename, 'wb'))

In [31]:
# save the model to disk
#filename = 'finalized_grid_model_1.sav'
#pickle.dump(grid_best_model, open("./Storage/"+filename, 'wb'))
 
# some time later...
 
# load the model from disk
#grid_best_model = pickle.load(open("./Storage/"+filename, 'rb'))
#y_pred_grid = grid_best_model.predict(X_test)

## Random Search Pipelines for various Regression Algorithms

In [33]:
%%time

# Create a pipeline
pipe = Pipeline([("regressor", RandomForestRegressor())])
# Create dictionary with candidate learning algorithms and their hyperparameters
params = [
                {"regressor": [LinearRegression(normalize=True)]
                },
                {"regressor": [SVR()],
                 "regressor__C":[0.001,0.1,1,10,100],
                 "regressor__kernel":['linear','rbf']
                },
                {"regressor": [xgb.XGBRegressor()],
                 "regressor__n_estimators": [80, 175, 350,500,750,1000],
                 "regressor__learning_rate": [0.125,0.135,0.165,0.135],
                 "regressor__subsample": [0.4,0.5,0.6,0.7],
                 "regressor__max_depth":[2,3,4,5,6,7,8],
                 "regressor__min_child_weight": [2,3,4,5],
                 "regressor__colsample_bytree": [0.5,0.7,0.8,0.9]
                 },
                {"regressor": [RandomForestRegressor()],
                 "regressor__n_estimators": [100,300,500],
                 "regressor__max_depth":[5,8,15,25,None],
                 "regressor__min_samples_leaf":[1,2,5,10,15,100],
                 "regressor__max_leaf_nodes": [2,5,10]}
                 ]
# create a gridsearch of the pipeline, the fit the best model
rdmsearch = RandomizedSearchCV(pipe, params, cv=10, verbose=0,n_jobs=-1) # Fit grid search
best_model = rdmsearch.fit(X_train,y_train)

Wall time: 55.8 s


In [34]:
print(best_model.best_estimator_)
print("==============================================")
y_pred_train= best_model.predict(X_train)
y_pred = best_model.predict(X_test)
rmse_train, r2_train, rmse_test, r2_test = metric_score(y_train, y_train_pred, y_test, y_pred)

Pipeline(memory=None,
         steps=[('regressor',
                 XGBRegressor(base_score=0.5, booster='gbtree',
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=0.8, gamma=0,
                              importance_type='gain', learning_rate=0.125,
                              max_delta_step=0, max_depth=2, min_child_weight=2,
                              missing=None, n_estimators=175, n_jobs=1,
                              nthread=None, objective='reg:linear',
                              random_state=0, reg_alpha=0, reg_lambda=1,
                              scale_pos_weight=1, seed=None, silent=None,
                              subsample=0.7, verbosity=1))],
         verbose=False)
Train : RMSE: 34849.0033
Train : R_2 Score: 0.80
Test : RMSE: 33152.8522
Test : R_2 Score: 0.85


In [111]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head()

Unnamed: 0,Actual,Predicted
0,175000,186466.671875
1,149000,130902.28125
2,235000,229039.625
3,55000,52140.90625
4,155000,159590.40625


In [118]:
# save the model to disk
filename = 'finalized_random_model.sav'
pickle.dump(best_model, open("./Storage/"+filename, 'wb'))

In [120]:
# load the model from disk
# loaded_model = pickle.load(open("./Storage/"+filename, 'rb'))
# result = loaded_model.predict(X_test)
# rmse_r, r2_r = metric_score(y_test, result)