In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import pickle
%matplotlib inline

In [None]:
# Read the train and test datasets
df_train = pd.read_csv("dataset/train_final.csv")
df_test = pd.read_csv("dataset/test_final.csv")

cols_train = df_train.columns.tolist()
cols_test = df_test.columns.tolist()

# Train the model with columns that exist both in train and test set
cols_to_train = [col for col in cols_train if col in cols_test]
cols_to_train.remove('Id')

In [None]:
X_train_i = df_train[cols_to_train]
Y_train = df_train['SalePrice']
X_test_i = df_test[cols_to_train]
X_Id = df_test['Id']

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_i)
X_test = scaler.fit_transform(X_test_i)

In [None]:
print("Training(+validation) set shape : {}".format(X_train.shape))
print("Y_train shape : {}".format(Y_train.shape))
print("Test set shape : {}".format(X_test.shape))

In [6]:
param = {
#     'loss': ['deviance', 'exponential'], 
    'learning_rate': [.05,.1,.2],
    'n_estimators': [600]
#     'criterion': ['mae'], 
#     'max_depth': [3,4,5],
#     'random_state': [0]
}
estmtr = GradientBoostingRegressor(verbose=1)
cv_split = ShuffleSplit(n_splits = 10, test_size = .20, train_size = .80, random_state = 0 )
best_model = GridSearchCV(estimator = estmtr, param_grid = param, cv = cv_split,\
                          scoring = 'neg_mean_absolute_error')

In [7]:
best_model.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1  5710262525.7064            4.78s
         2  5307113392.1186            5.38s
         3  4942187359.0342            5.17s
         4  4610289000.9261            4.91s
         5  4303764366.2733            4.87s
         6  4027765473.2464            4.74s
         7  3773380888.1083            4.73s
         8  3536570467.2657            4.73s
         9  3321567744.1399            4.65s
        10  3120291226.0258            4.59s
        20  1784984463.1845            4.28s
        30  1137360580.3940            4.21s
        40   795238229.0446            4.11s
        50   602559002.7925            3.99s
        60   487638836.0371            3.91s
        70   410614789.9575            3.86s
        80   361211656.4160            3.78s
        90   325608995.5620            3.72s
       100   298468663.1140            3.65s
       200   184168346.9873            2.90s
       300   134491093.0693            2.19s
       40

        30  1201556438.8193            4.34s
        40   846664541.4956            4.26s
        50   644035859.7189            4.20s
        60   516833943.8136            4.13s
        70   434133446.1598            4.02s
        80   379401171.3165            3.92s
        90   338749417.3588            3.82s
       100   309842872.8143            3.73s
       200   193219802.7818            2.99s
       300   145798818.0233            2.23s
       400   114368142.0414            1.51s
       500    91723676.5089            0.75s
       600    74313801.9460            0.00s
      Iter       Train Loss   Remaining Time 
         1  5756597763.9434            4.18s
         2  5347698384.7114            4.47s
         3  4976472835.5206            4.37s
         4  4634086854.7266            4.46s
         5  4325961772.2335            4.27s
         6  4042296115.6355            4.34s
         7  3787173133.6910            4.22s
         8  3549125759.8602            4.28s
         

       200   119250435.4637            3.02s
       300    76469650.1536            2.33s
       400    51937601.9862            1.59s
       500    36232096.9486            0.81s
       600    26021315.1626            0.00s
      Iter       Train Loss   Remaining Time 
         1  5119861780.9079            5.38s
         2  4431895929.3408            4.47s
         3  3863983406.2010            4.36s
         4  3387035373.8993            4.60s
         5  2975138301.5792            4.50s
         6  2644701166.8528            4.54s
         7  2349693619.7304            4.47s
         8  2102493461.5716            4.42s
         9  1892366235.4531            4.45s
        10  1718662481.6410            4.35s
        20   769418408.2134            4.13s
        30   480797649.0283            4.02s
        40   360866877.3082            3.92s
        50   299465650.8322            3.85s
        60   258575472.0360            3.78s
        70   228207660.2678            3.73s
        8

        30   274112529.8793            4.02s
        40   230559134.5884            3.99s
        50   191818193.4847            3.93s
        60   171167450.8507            3.88s
        70   150092798.9389            3.81s
        80   134262730.2250            3.75s
        90   123979680.6049            3.70s
       100   114012947.0371            3.64s
       200    49903242.9160            2.94s
       300    23455219.6051            2.19s
       400    12035417.5542            1.49s
       500     6502451.3396            0.75s
       600     3694473.4888            0.00s
      Iter       Train Loss   Remaining Time 
         1  4527696802.0054            4.18s
         2  3422133602.5443            4.47s
         3  2644479967.4272            4.37s
         4  2092652851.0577            4.46s
         5  1679469065.5177            4.39s
         6  1374600383.5575            4.44s
         7  1152584357.5278            4.56s
         8   979591024.2914            4.50s
         

       200    49870910.9224            2.86s
       300    24293854.4718            2.13s
       400    12767936.8294            1.42s
       500     6684294.5956            0.72s
       600     3786943.7064            0.00s
      Iter       Train Loss   Remaining Time 
         1  5848989195.3991            5.97s
         2  5435694060.4798            5.96s
         3  5062301191.7565            5.56s
         4  4721305237.4000            5.50s
         5  4409444041.6985            5.46s
         6  4121930505.1983            5.53s
         7  3864756012.6645            5.66s
         8  3623426217.9001            5.61s
         9  3404868142.1142            5.63s
        10  3201149420.8081            5.65s
        20  1855134763.0673            5.67s
        30  1184939251.5364            5.36s
        40   838480394.6591            5.11s
        50   647751393.8714            4.93s
        60   531788116.0207            4.78s
        70   454853086.3930            4.71s
        8

GridSearchCV(cv=ShuffleSplit(n_splits=10, random_state=0, test_size=0.2, train_size=0.8),
             estimator=GradientBoostingRegressor(verbose=1),
             param_grid={'learning_rate': [0.05, 0.1, 0.2],
                         'n_estimators': [600]},
             scoring='neg_mean_absolute_error')

In [8]:
best_param = best_model.best_params_
print("Best parameters are : {}".format(best_param))
estmtr.set_params(**best_param)
estmtr.fit(X_train, Y_train)
preds_train = estmtr.predict(X_train)
preds_test = estmtr.predict(X_test)

result = pd.DataFrame({
    'Id':X_Id,
    'SalePrice':preds_test
})

result.to_csv("dataset/result_gb_CV.csv",index=False)

Best parameters are : {'learning_rate': 0.05, 'n_estimators': 600}
      Iter       Train Loss   Remaining Time 
         1  5848989195.3991            7.19s
         2  5435694060.4798            6.87s
         3  5062301191.7565            6.55s
         4  4721305237.4000            6.24s
         5  4409444041.6985            5.93s
         6  4121930505.1983            5.92s
         7  3864756012.6645            5.75s
         8  3623426217.9001            5.61s
         9  3404868142.1142            5.44s
        10  3201149420.8081            5.41s
        20  1855134763.0673            5.18s
        30  1184939251.5364            5.31s
        40   838480394.6591            5.29s
        50   647751393.8714            5.26s
        60   531788116.0207            5.19s
        70   454853086.3930            5.12s
        80   403200349.4254            5.04s
        90   366274320.3329            4.96s
       100   338610527.6294            4.87s
       200   218396893.7546     

In [None]:
# TODO:
#     1. Polynomial features
#     2. Grid Search
#     3. Improve ordinal variables
#     4. Feature Selection
#     5. Remove outliers