In [None]:
!pip install vecstack

from vecstack import stacking
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score #works
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Collecting vecstack
  Downloading https://files.pythonhosted.org/packages/d0/a1/b9a1e9e9e5a12078da1ab9788c7885e4c745358f7e57d5f94d9db6a4e898/vecstack-0.4.0.tar.gz
Building wheels for collected packages: vecstack
  Building wheel for vecstack (setup.py) ... [?25l[?25hdone
  Created wheel for vecstack: filename=vecstack-0.4.0-cp36-none-any.whl size=19880 sha256=868d3a4087b3ddf4bb0d01d9c7b1e3b00e02806353e5879461b71740422bfbae
  Stored in directory: /root/.cache/pip/wheels/5f/bb/4e/f6488433d53bc0684673d6845e5bf11a25240577c8151c140e
Successfully built vecstack
Installing collected packages: vecstack
Successfully installed vecstack-0.4.0
Mounted at /gdrive
/gdrive


In [None]:
trainfile = r'/gdrive/Projects/House price prediction/TrainFinal.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/Projects/House price prediction/TestFinal.csv'
testData = pd.read_csv(testfile)  #creates a dataframe


print(trainData.shape)
print(testData.shape)



(1460, 273)
(1459, 272)


In [None]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Columns: 273 entries, Unnamed: 0 to SalePrice
dtypes: float64(11), int64(262)
memory usage: 3.0 MB


In [None]:
testData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Columns: 272 entries, Unnamed: 0 to SaleCondition_Partial
dtypes: float64(11), int64(261)
memory usage: 3.0 MB


In [None]:
#Copy Train data excluding target
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.copy()
TargetData_copy = trainData["SalePrice"]


In [None]:
# Perform a test_train_split 
Xtrain = trainData_Copy
Xtest = testData_Copy
Ytrain = TargetData_copy 
print(Xtrain.shape)
print(Ytrain.shape)

X_train, X_test, y_train, y_test = train_test_split(Xtrain, Ytrain, test_size = .30, random_state = 1)

print(X_train.shape)
print(X_test.shape)

(1460, 272)
(1460,)
(1022, 272)
(438, 272)


In [None]:
#Decision Tree Regressor ========================================================================
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE RMSE 
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)
clf_predict_Train=clf.predict(X_train)

print(clf.feature_importances_)

mean_squared_error(y_train,clf_predict_Train)
print("RMSE (training) for Decision Tree:{0:10f}".format(mean_squared_error(y_train,clf_predict_Train)))

clf_predict_Test=clf.predict(X_test)
mean_squared_error(y_test,clf_predict_Test)
print("RMSE (Test Data) for Decision Tree:{0:10f}".format(mean_squared_error(y_test,clf_predict_Test)))


[9.16114293e-04 6.95660995e-04 6.55757936e-04 2.45792250e-02
 2.53700825e-02 6.23105248e-01 4.91949752e-03 5.70444279e-03
 7.67782156e-03 8.53586555e-05 5.62888142e-02 8.75229692e-06
 4.86390510e-03 2.79679405e-02 9.56308818e-03 6.17552558e-03
 0.00000000e+00 8.85240008e-02 1.42554482e-06 1.16367259e-03
 7.69327269e-04 6.02069370e-04 3.71155130e-04 0.00000000e+00
 2.04358738e-03 4.60776512e-04 4.55201929e-04 3.46173640e-03
 1.50122363e-02 2.38083811e-02 3.04682902e-03 3.83657420e-04
 1.31588753e-06 1.96987626e-03 1.24869505e-04 2.24318371e-04
 1.22102553e-03 1.12814479e-03 0.00000000e+00 9.91858449e-07
 0.00000000e+00 1.54177881e-04 1.61143824e-03 0.00000000e+00
 0.00000000e+00 4.09739348e-04 9.84600677e-04 0.00000000e+00
 8.06639055e-06 6.66168061e-06 0.00000000e+00 1.44781896e-07
 4.07251427e-05 0.00000000e+00 0.00000000e+00 4.49746272e-06
 2.90722137e-05 0.00000000e+00 4.85648756e-05 1.10533620e-04
 7.60747477e-07 3.00030581e-05 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.000000

In [None]:
# Cross Validation evaluation for Decision Tree Regressor
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(clf, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-2185680364.7494597


In [None]:
#Save predictions
clf_predict_Test=clf.predict(Xtest)
df_DT = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":clf_predict_Test})
df_DT.to_csv('/gdrive/Projects/House price prediction/DT_Test.csv', index = False)


In [None]:
#Hyperparameter tuning done for decision tree Regressor 
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(1,50,10),'max_depth': 
            range(1,100,10),'criterion':['mse', 'friedman_mse', 'mae']}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=25,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_ # gives the best tree 
print(grid_parm)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 11, 'max_depth': 61, 'criterion': 'mae'}


In [None]:
#Using the parameters obtained from HyperParameterTuning in the Decision Tree Classifier 
clfRand = DecisionTreeRegressor(**grid_parm)
clfRand.fit(X_train, y_train)
clfRand_predict_Train = clfRand.predict(X_train)

mean_squared_error(y_train,clf_predict_Train)
print("RMSE (training) for Decision Tree with best parameter :{0:10f}".format(mean_squared_error(y_train,clf_predict_Train)))

clfRand_predict_Test= clfRand.predict(X_test)

mean_squared_error(y_test,clfRand_predict_Test)
print("RMSE (Test Data) for Decision Tree with best parameter{0:10f}".format(mean_squared_error(y_test,clfRand_predict_Test)))
clfRand_predict = clfRand.predict(Xtest)


RMSE (training) for Decision Tree with best parameter :  0.000000
RMSE (Test Data) for Decision Tree with best parameter1533704657.659817


In [None]:
# Cross Validation evaluation for Decision Tree Regressor with hyperparameter tuning 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(clfRand, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-1650334385.9160533


In [None]:
# Save prediction after hyperparameter tuning of the decision tree regressor 
clf_predict_Test=clf.predict(Xtest)
df_DT = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":clf_predict_Test})
df_DT.to_csv('/gdrive/Projects/House price prediction/DT2_Test.csv', index = False)


In [None]:
#Random Forest Regressor==============================================================================
#=================================================================================================

rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)

# print(rfc.feature_importances_)

rfc_predict_Train=rfc.predict(X_train)
mean_squared_error(y_train,rfc_predict_Train)
print("RMSE (training) for Random Forest Regressor:{0:10f}".format(mean_squared_error(y_train,rfc_predict_Train)))

rfc_predict_Test=rfc.predict(X_test)
mean_squared_error(y_test,rfc_predict_Test)
print("RMSE (Test Data) for Random Forest Regressor:{0:10f}".format(mean_squared_error(y_test,rfc_predict_Test)))



RMSE (training) for Random Forest Regressor:160077771.878988
RMSE (Test Data) for Random Forest Regressor:699046983.509769


In [None]:
# Cross Validation evaluation for Random Forest
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(rfc, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-1061608176.6549772


In [None]:

#Save predictions
rfc_predict_Test=rfc.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":rfc_predict_Test})
df_RF.to_csv('/gdrive/Projects/House price prediction/RF_Test.csv', index = False)


In [None]:
#Hyperparameter tuning done for random forest regressor 

#RANDOM SEARCH--------------------------------------------

print("RandomizedSearchCV-Random forest")
rand_parameters={'min_samples_leaf' : range(10,100,10),'max_depth': 
            range(1,10,2),'max_features':[10,20,30],'n_estimators':[20,30,40]}
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=25,cv=5)
rfc_random.fit(X_train, y_train)
grid_parm=rfc_random.best_params_
print(grid_parm)


RandomizedSearchCV-Random forest
{'n_estimators': 30, 'min_samples_leaf': 10, 'max_features': 30, 'max_depth': 7}


In [None]:
#Using the parameters obtained from HyperParameterTuning in the Random Forest Regressor 
rfcRand = RandomForestRegressor(**grid_parm)
rfcRand.fit(X_train, y_train)

rfcRand_predict_Train = rfcRand.predict(X_train)
mean_squared_error(y_train,clf_predict_Train)
print("RMSE (training) for Random Forest Regressor with best parameter:{0:10f}".format(mean_squared_error(y_train,rfcRand_predict_Train)))

rfcRand_predict_Test=rfcRand.predict(X_test)
mean_squared_error(y_test,rfcRand_predict_Test)
print("RMSE (Test Data) for Random Forest with best parameter:{0:10f}".format(mean_squared_error(y_test,rfcRand_predict_Test)))

rfcRand_predict = rfcRand.predict(Xtest)


RMSE (training) for Random Forest Regressor with best parameter:1003330602.932782
RMSE (Test Data) for Random Forest with best parameter:1322621813.961111


In [None]:
# Cross Validation evaluation for Random Forest Regressor with hyperparameter tuning 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(rfcRand, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-1265517793.3795311


In [None]:
# Save prediction 
rfcRand_predict = rfcRand.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":rfcRand_predict})
df_RF.to_csv('/gdrive/Projects/House price prediction/RF2_Test.csv', index = False)


In [None]:
#Gradient Boosting Regressor================================================================================

abc =GradientBoostingRegressor()
abc.fit(X_train, y_train)
abc_predict_Train=abc.predict(X_train)


# print(abc.feature_importances_)

mean_squared_error(y_train,abc_predict_Train)
print("RMSE (training) for Gradient boosting regressor:{0:10f}".format(mean_squared_error(y_train,abc_predict_Train)))
abc_predict_Test=rfc.predict(X_test)
mean_squared_error(y_test,abc_predict_Test)
print("RMSE (Test Data) for Gradient boosting regressor:{0:10f}".format(mean_squared_error(y_test,abc_predict_Test)))




RMSE (training) for Gradient boosting regressor:177211170.303055
RMSE (Test Data) for Gradient boosting regressor:721061068.700645


In [None]:
# Cross Validation evaluation for Gradient boosting regressor 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(abc, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-827174204.9193505


In [None]:
#Save predictions

abc_predict=abc.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":abc_predict})
df_RF.to_csv('/gdrive/Projects/House price prediction/GB_Test.csv', index = False)

In [None]:
#Randomized Search for hyperparameter tuning
abc = GradientBoostingRegressor()
search_grid={'n_estimators':[5,10,20, 30, 50],'learning_rate':[0.01,.1]}
abc_random = RandomizedSearchCV(abc,search_grid,n_iter=15)
abc_random.fit(X_train, y_train)
grid_parm_abc=abc_random.best_params_
print(grid_parm_abc)

{'n_estimators': 50, 'learning_rate': 0.1}


In [None]:
#CONTRUCT A GRADIENT BOOSTING MODEL using the best parameters========
abc = GradientBoostingRegressor(**grid_parm_abc)  
abc.fit(X_train, y_train)
abc_predict=abc.predict(Xtest)

abc_predict_Train=abc.predict(X_train)
mean_squared_error(y_train,abc_predict_Train)
print("RMSE (training) for Gradient boosting Regressor with hyperparameter tuning {0:10f}".format(mean_squared_error(y_train,abc_predict_Train)))

abc_predict_Test=abc.predict(X_test)
mean_squared_error(y_test,abc_predict_Test)
print("RMSE (Test Data) for Gradient boosting Regressor with hyperparameter tuning :{0:10f}".format(mean_squared_error(y_test,abc_predict_Test)))



RMSE (training) for Gradient boosting Regressor with hyperparameter tuning 304826335.641936
RMSE (Test Data) for Gradient boosting Regressor with hyperparameter tuning :682669586.669524


In [None]:
# Cross Validation evaluation for Gradient boosting Regressor with hyperparameter tuning 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(abc, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-904151025.0088466


In [None]:
#Save predictions

abc_predict=abc.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":abc_predict})
df_RF.to_csv('/gdrive/Projects/House price prediction/GB2_Test.csv', index = False)

In [None]:
#CONTRUCT A Support Vector Regressor MODEL ========
from sklearn.svm import SVR
SVReg = SVR(kernel = 'rbf')
SVReg.fit(X_train, y_train)
Y_Predict_Test = SVReg.predict(X_test)

SVReg_predict_Train=SVReg.predict(X_train)
mean_squared_error(y_train,SVReg_predict_Train)
print("RMSE (training) for Support Vector Regressor:{0:10f}".format(mean_squared_error(y_train,SVReg_predict_Train)))

SVReg_predict_Test=SVReg.predict(X_test)
mean_squared_error(y_test,SVReg_predict_Test)
print("RMSE (Test Data) for Support Vector Regressor:{0:10f}".format(mean_squared_error(y_test,SVReg_predict_Test)))



RMSE (training) for Support Vector Regressor:6277799254.266168
RMSE (Test Data) for Support Vector Regressor:7409541653.591960


In [None]:
# Cross Validation evaluation for Support Vector Regressor
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(SVReg, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-6263042680.2809515


In [None]:
#Save Prediction 
SVReg_predict=SVReg.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":SVReg_predict})
df_RF.to_csv('/gdrive/Projects/House price prediction/SVR_Test.csv', index = False)

In [None]:
#CONTRUCT A K-Nearest Neighbor Regression MODEL ========
from sklearn.neighbors import KNeighborsRegressor
KNNR = KNeighborsRegressor(n_neighbors=3)
KNNR.fit(X_train, y_train)
Y_Predict_Test = KNNR.predict(X_test)

KNNR_predict_Train=KNNR.predict(X_train)
mean_squared_error(y_train,KNNR_predict_Train)
print("RMSE (training) for K-nearest Neighbour :{0:10f}".format(mean_squared_error(y_train,KNNR_predict_Train)))

KNNR_predict_Test=SVReg.predict(X_test)
mean_squared_error(y_test,KNNR_predict_Test)
print("RMSE (Test Data) for K-nearest Neighbour:{0:10f}".format(mean_squared_error(y_test,KNNR_predict_Test)))



RMSE (training) for K-nearest Neighbour :1163407025.573168
RMSE (Test Data) for K-nearest Neighbour:7409541653.591960


In [None]:
# Cross Validation evaluation for K-nearest Neighbour 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(KNNR, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-2508783794.648517


In [None]:

KNNR_predict=KNNR.predict(Xtest)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":KNNR_predict})
df_RF.to_csv('/gdrive/Projects/House price prediction/KNNR_Test.csv', index = False)

In [None]:
#STACKING MODELS =====================================================================
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

models = [ GradientBoostingRegressor(), RandomForestRegressor(), DecisionTreeRegressor(), GradientBoostingRegressor(**grid_parm_abc)]
      
S_Train, S_Test = stacking(models,                   
                           X_train, y_train, X_test,   
                           regression=True, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
                                        
                           n_folds=4, 
                                                    
                           verbose=2)



___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [4]

model  0:     [GradientBoostingRegressor]
    fold  0:  [18238.80702654]
    fold  1:  [18998.05810385]
    fold  2:  [18359.24972924]
    fold  3:  [15065.34610512]
    ----
    MEAN:     [17665.36524119] + [1528.60074398]
    FULL:     [17667.23034358]

model  1:     [RandomForestRegressor]
    fold  0:  [19941.91242188]
    fold  1:  [19017.60140625]
    fold  2:  [18999.70478431]
    fold  3:  [16233.01796078]
    ----
    MEAN:     [18548.05914331] + [1389.84678658]
    FULL:     [18549.88242661]

model  2:     [DecisionTreeRegressor]
    fold  0:  [27677.49218750]
    fold  1:  [28888.32421875]
    fold  2:  [26996.64313725]
    fold  3:  [26253.20000000]
    ----
    MEAN:     [27453.9

In [None]:
#STACKING - CONTRUCT A GRADIENT BOOSTING MODEL==============================
model = GradientBoostingRegressor()
    
model = model.fit(S_Train, y_train)
y_pred_train = model.predict(S_Train)
y_pred_test = model.predict(S_Test)



In [None]:
mean_squared_error(y_train,y_pred_train)
print("RMSE (training) for GRADIENT BOOSTING:{0:10f}".format(mean_squared_error(y_train,y_pred_train)))
mean_squared_error(y_test,y_pred_test)
print("RMSE (Test Data) for GRADIENT BOOSTING:{0:10f}".format(mean_squared_error(y_test,y_pred_test)))

RMSE (training) for GRADIENT BOOSTING:327519238.346160
RMSE (Test Data) for GRADIENT BOOSTING:1701932038.078007


In [None]:
# Cross Validation evaluation for Training after stacking 
print("Printing the cross validation score")
CrossValidation_Score = cross_val_score(model, S_Train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

Printing the cross validation score
-1129043680.7722962


In [None]:
#STACKING MODELS =====================================================================
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

models = [ GradientBoostingRegressor(), RandomForestRegressor(), DecisionTreeRegressor(), GradientBoostingRegressor(**grid_parm_abc) ]
      
S_Train, S_Test = stacking(models,                   
                           X_train, y_train, Xtest,   
                           regression=True, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
                                        
                           n_folds=4, 
                                                    
                           verbose=2)


___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [20217.41148437]
    fold  1:  [19306.86671875]
    fold  2:  [18785.77047059]
    fold  3:  [16313.54725490]
    ----
    MEAN:     [18655.89898215] + [1446.16376202]
    FULL:     [18658.06383562]

model  1:     [DecisionTreeRegressor]
    fold  0:  [29293.35156250]
    fold  1:  [29387.45703125]
    fold  2:  [27297.39607843]
    fold  3:  [26407.04705882]
    ----
    MEAN:     [28096.31293275] + [1283.72912808]
    FULL:     [28098.74755382]

model  2:     [GradientBoostingRegressor]
    fold  0:  [19548.02922346]
    fold  1:  [19799.34903789]
    fold  2:  [18860.75854166]
    fold  3:  [16085.33388000]
    ----
    MEAN:     [18573.3

In [None]:
model = GradientBoostingRegressor()
model.fit(S_Train, y_train)
Test_Pred = model.predict(S_Test)

In [None]:
# Cross Validation evaluation for Random Forest Regressor with hyperparameter tuning 
CrossValidation_Score = cross_val_score(model, S_Train, y_train, scoring="neg_mean_squared_error", cv=5)
print(CrossValidation_Score.mean())

-978582777.2040167


In [None]:
#writing the predicted values in a file to submit in kaggle and check the score
model = GradientBoostingRegressor(**grid_parm_abc) 
model.fit(S_Train, y_train)

y_pred_train = model.predict(S_Train)
mean_squared_error(y_train,y_pred_train)
print("RMSE (training) for Gradient Boosting after Stacking:{0:10f}".format(mean_squared_error(y_train,y_pred_train)))



RMSE (training) for Gradient Boosting after Stacking:470451390.046705


In [None]:

Test_Pred = model.predict(S_Test)

df_RF = pd.DataFrame({"Id": Xtest["Id"], "SalePrice":Test_Pred})
df_RF.to_csv('/gdrive/Projects/House price prediction/stacking_Test.csv', index = False)