In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import seaborn as sns
import h2o
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
import os, time, sys
import warnings
warnings.filterwarnings("ignore")

In [6]:
path = "C:\\Users\\Gil\\Documents\\GitHub\\K_House_Prices"
os.chdir(path)

In [24]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
test_original = test
print(train.shape)
print(train.columns)

(1460, 81)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 

In [25]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Clean Data

In [26]:
# Identify the columns containing NA values in both train and test dataset
train.isnull().sum().sort_values(ascending = False)
test.isnull().sum().sort_values(ascending = False)
# Remove the columns which contain NA with more than 100 rows
train.drop( ['PoolQC', 'MiscFeature','Alley','Fence','FireplaceQu','LotFrontage'], inplace = True, axis = 'columns')
test.drop( ['PoolQC', 'MiscFeature','Alley','Fence','FireplaceQu','LotFrontage'], inplace = True, axis = 'columns')
print(train.shape)
print(test.shape)
# Identify the columns which contain numeric values
train_numericCol = train.select_dtypes(include=[np.number]).columns.values
print(train_numericCol)
# Fill missing values in the numeric columns.
train.fillna(train.mean(),inplace = True)
test.fillna(test.mean(),inplace = True)
# check if still any NA's are available in the numerical coulumns of train data
train[train_numericCol].isnull().sum().sort_values(ascending = False)

(1460, 75)
(1459, 74)
['Id' 'MSSubClass' 'LotArea' 'OverallQual' 'OverallCond' 'YearBuilt'
 'YearRemodAdd' 'MasVnrArea' 'BsmtFinSF1' 'BsmtFinSF2' 'BsmtUnfSF'
 'TotalBsmtSF' '1stFlrSF' '2ndFlrSF' 'LowQualFinSF' 'GrLivArea'
 'BsmtFullBath' 'BsmtHalfBath' 'FullBath' 'HalfBath' 'BedroomAbvGr'
 'KitchenAbvGr' 'TotRmsAbvGrd' 'Fireplaces' 'GarageYrBlt' 'GarageCars'
 'GarageArea' 'WoodDeckSF' 'OpenPorchSF' 'EnclosedPorch' '3SsnPorch'
 'ScreenPorch' 'PoolArea' 'MiscVal' 'MoSold' 'YrSold' 'SalePrice']


SalePrice        0
BsmtHalfBath     0
GrLivArea        0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
TotalBsmtSF      0
BsmtUnfSF        0
BsmtFinSF2       0
BsmtFinSF1       0
MasVnrArea       0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
OverallQual      0
LotArea          0
MSSubClass       0
BsmtFullBath     0
FullBath         0
YrSold           0
HalfBath         0
MoSold           0
MiscVal          0
PoolArea         0
ScreenPorch      0
3SsnPorch        0
EnclosedPorch    0
OpenPorchSF      0
WoodDeckSF       0
GarageArea       0
GarageCars       0
GarageYrBlt      0
Fireplaces       0
TotRmsAbvGrd     0
KitchenAbvGr     0
BedroomAbvGr     0
Id               0
dtype: int64

## Initialize h2o server

In [27]:
## Initiate the h2o Server
h2o.init(ip="localhost", port=54321)

Checking whether there is an H2O instance running at http://localhost:54321 .. connected.


0,1
H2O cluster uptime:,1 min 01 secs
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_Gil_y1wixq
H2O cluster total nodes:,1
H2O cluster free memory:,239.3 Mb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


In [28]:
h2o.cluster_info()

0,1
H2O cluster uptime:,1 min 05 secs
H2O cluster timezone:,Europe/London
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_Gil_y1wixq
H2O cluster total nodes:,1
H2O cluster free memory:,239.3 Mb
H2O cluster total cores:,12
H2O cluster allowed cores:,12


## Convert data into h2o Frame

In [29]:
train = h2o.H2OFrame(train)
test = h2o.H2OFrame(test)
test_original = h2o.H2OFrame(test_original)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [30]:
# Split the train dataset
train, valid, test = train.split_frame(ratios=[0.7, 0.15], seed=42)

In [31]:
# Seperate the target data and store it into y variable
y = 'SalePrice'
Id = test['Id']

In [32]:
# remove target and Id column from the dataset and store rest of the columns in X variable
X = list(train.columns)
X.remove(y)
X.remove('Id')


## H2o Machine Learning models

We will now perform training of the models using below H2o supervised algorithms


Gradient Boosting Machine (RF)

Random Forest (RF)

Deep Learning (DL)

### 1. Gradient Boosting Machine (GBM)

In [33]:
# Prepare the hyperparameters
gbm_params = {
                'learn_rate': [0.01, 0.1], 
                'max_depth': [4, 5, 7],
                'sample_rate': [0.6, 0.8],               # Row sample rate
                'col_sample_rate': [0.2, 0.5, 0.9]       # Column sample rate per split (from 0.0 to 1.0)
                }



# Prepare the grid object
gbm_grid = H2OGridSearch(model=H2OGradientBoostingEstimator,   # Model to be trained
                          grid_id='gbm_grid1',                  # Grid Search ID
                          hyper_params=gbm_params,              # Dictionary of parameters
                          search_criteria={"strategy": "Cartesian"}   # RandomDiscrete
                          )



# Train the Model
start = time.time() 
gbm_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                ntrees=100,      # Specify other GBM parameters not in grid
                score_tree_interval=5,     # For early stopping
                stopping_rounds=3,         # For early stopping
                stopping_tolerance=0.0005,
                seed=1)

end = time.time()
(end - start)/60



# Find the Model grid performance 
gbm_gridperf = gbm_grid.get_grid(sort_by='RMSE',decreasing = False)
gbm_gridperf



# Identify the best model generated with least error
best_gbm_model = gbm_gridperf.models[0]
best_gbm_model


gbm Grid Build progress: |████████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_grid1_model_22


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,100.0,100.0,17988.0,4.0,4.0,4.0,5.0,15.0,9.64




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 192448448.05491593
RMSE: 13872.578997969913
MAE: 9325.56769820039
RMSLE: 0.08161451461137707
Mean Residual Deviance: 192448448.05491593

ModelMetricsRegression: gbm
** Reported on validation data. **

MSE: 657476598.2695568
RMSE: 25641.306485231144
MAE: 16028.875448104653
RMSLE: 0.1454765044707938
Mean Residual Deviance: 657476598.2695568

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
0,,2020-01-12 17:46:15,11 min 5.037 sec,0.0,79105.276761,57707.735026,6257645000.0,73752.022782,55662.658844,5439361000.0
1,,2020-01-12 17:46:15,11 min 5.059 sec,5.0,55018.54878,38995.032557,3027041000.0,52442.465318,37721.888027,2750212000.0
2,,2020-01-12 17:46:15,11 min 5.077 sec,10.0,40477.34958,27574.202851,1638416000.0,40535.087232,27401.794572,1643093000.0
3,,2020-01-12 17:46:15,11 min 5.093 sec,15.0,31890.511498,20858.092253,1017005000.0,33448.423919,22240.271306,1118797000.0
4,,2020-01-12 17:46:15,11 min 5.109 sec,20.0,26744.957209,17032.599989,715292700.0,30134.378726,19535.582159,908080800.0
5,,2020-01-12 17:46:15,11 min 5.125 sec,25.0,23558.259194,14991.106092,554991600.0,28542.601209,18441.480556,814680100.0
6,,2020-01-12 17:46:15,11 min 5.142 sec,30.0,21580.232302,13641.37975,465706400.0,27529.827944,17921.492059,757891400.0
7,,2020-01-12 17:46:15,11 min 5.158 sec,35.0,20040.18181,12704.208502,401608900.0,26915.193119,17517.225578,724427600.0
8,,2020-01-12 17:46:15,11 min 5.174 sec,40.0,18962.152397,11999.101692,359563200.0,26547.450559,17293.565977,704767100.0
9,,2020-01-12 17:46:15,11 min 5.191 sec,45.0,18163.065348,11446.290271,329896900.0,26151.6846,16921.959189,683910600.0



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,GrLivArea,4895372000000.0,1.0,0.162914
1,OverallQual,3456765000000.0,0.706129,0.115038
2,TotalBsmtSF,2522601000000.0,0.515303,0.08395
3,GarageArea,2428393000000.0,0.496059,0.080815
4,Neighborhood,2375330000000.0,0.48522,0.079049
5,ExterQual,2048164000000.0,0.418388,0.068161
6,GarageYrBlt,1839523000000.0,0.375768,0.061218
7,2ndFlrSF,1369977000000.0,0.279851,0.045592
8,YearBuilt,1228946000000.0,0.251042,0.040898
9,1stFlrSF,1208796000000.0,0.246926,0.040228



See the whole table with table.as_data_frame()




### 2. Random Forest Algorithm

In [20]:
# Prepare the hyperparameters
nfolds = 5
rf_params = {
                'max_depth': [3, 4,5],
                'sample_rate': [0.8, 1.0],               # Row sample rate
                'mtries' : [2,4,3]
                }



# Search criteria for parameter space
search_criteria = {'strategy': "RandomDiscrete",
                   "seed": 1,
                   'stopping_metric': "AUTO",
                   'stopping_tolerance': 0.0005
                   }



# Prepare the grid object
rf_grid = H2OGridSearch(model=H2ORandomForestEstimator,   # Model to be trained
                          grid_id='rf_grid',                  # Grid Search ID
                          hyper_params=rf_params,              # Dictionary of parameters
                          search_criteria=search_criteria,   # RandomDiscrete
                          )



# Train the Model
start = time.time() 
rf_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                ntrees=100,      
                score_each_iteration=True,
                nfolds = nfolds,
                fold_assignment= "Modulo",
                seed=1
                )

end = time.time()
(end - start)/60



# Find the Model performance 
rf_gridperf = rf_grid.get_grid(sort_by='RMSE',decreasing = False)
rf_gridperf



# Identify the best model generated with least error
best_rf_model = rf_gridperf.models[0]
best_rf_model

drf Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Grid Build progress: |███████████████████████████████████████| 100%

ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 1054309023.8360233
RMSE: 32470.125097326363
MAE: 17025.989023451173
RMSLE: 0.13806784851292303
Mean Residual Deviance: 1054309023.8360233
<bound method MetricsBase.gini of >

ModelMetricsRegression: drf
** Reported on test data. **

MSE: 1574512914.8187523
RMSE: 39680.132494974765
MAE: 22922.108386165648
RMSLE: 0.18141083618205037
Mean Residual Deviance: 1574512914.8187523
<bound method MetricsBase.gini of >

ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 1110693925.9693353
RMSE: 33327.07496869978
MAE: 18165.986252073893
RMSLE: 0.1370899001618085
Mean Residual Deviance: 9082.993126036947
<bound method MetricsBase.gini of >
gbm prediction progress: |████████████████████████████████████████████████| 100%
drf prediction progress: |██████

H2OConnectionError: Unexpected HTTP error: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

### 3. Deep Learning Algorithm

In [None]:

activation_opt = ["RectifierWithDropout",
                  "TanhWithDropout"]
#L1 & L2 regularization
l1_opt = [0, 0.00001,
          0.0001,
          0.001,
          0.01,
          0.1]

l2_opt = [0, 0.00001,
          0.0001,
          0.001,
          0.01,
          0.1]



# Create the Hyperparameters
dl_params = {
             'activation': activation_opt,
             "input_dropout_ratio" : [0,0.05, 0.1],  # input layer dropout ratio to improve generalization. Suggested values are 0.1 or 0.2.
             'l1': l1_opt,
             'l2': l2_opt,
             'hidden_dropout_ratios':[[0.1,0.2,0.3], # hidden layer dropout ratio to improve generalization: one value per hidden layer.
                                      [0.1,0.5,0.5],
                                      [0.5,0.5,0.5]]
             }



search_criteria = {
                   'strategy': 'RandomDiscrete',
                   'max_runtime_secs': 1000,
                   'seed':1
                   }



# Prepare the grid object
dl_grid = H2OGridSearch(model=H2ODeepLearningEstimator(
                                                    epochs = 1000,   ## hopefully converges earlier...
                                                    adaptive_rate = True,  # http://cs231n.github.io/neural-networks-3/#sgd
                                                    stopping_metric="AUTO",
                                                    stopping_tolerance=1e-2,    ## stop when misclassification does not improve by >=1% for 2 scoring events
                                                    stopping_rounds=3,
                                                    hidden=[128,128,128],      ## more hidden layers -> more complex interactions
                                                    balance_classes= False,
                                                    standardize = True,  # If enabled, automatically standardize the data (mean 0, variance 1). If disabled, the user must provide properly scaled input data.
                                                    loss = "quantile"  # quantile for regression
                                                    ),
                        grid_id='dl_grid',
                        hyper_params=dl_params,
                        search_criteria=search_criteria)



# Train the Model
start = time.time() 
dl_grid.train(x=X,y=y, 
                training_frame=train,
                validation_frame=valid,
                stopping_rounds=2,
                stopping_tolerance=0.0005,
                seed=1
                )

end = time.time()
(end - start)/60



# Find the Model performance 
dl_gridperf = dl_grid.get_grid(sort_by='RMSE',decreasing = False)
dl_gridperf



# Identify the best model generated with least error
best_dl_model = dl_gridperf.models[0]
best_dl_model


## Compare Model Performances

In [None]:
best_gbm_perf= best_gbm_model.model_performance(test)  # GBM Model
best_rf_perf = best_rf_model.model_performance(test)   # Random Forest Model
best_dl_perf = best_dl_model.model_performance(test)   #deep Learning Model

### Retreive test set AUC
print(best_gbm_perf.gini)
print(best_rf_perf.gini)
print(best_dl_perf.gini)

## Prediction of Model
gbm_pred= best_gbm_model.predict(test_original).as_data_frame()
rf_pred = best_rf_model.predict(test_original).as_data_frame()
dl_pred = best_dl_model.predict(test_original).as_data_frame()


## Submission into kaggle

In [34]:
sub = pd.DataFrame()
sub['Id'] = gbm_pred.index + 1461
sub['SalePrice'] = gbm_pred
sub.head()
sub.to_csv('gbm_h2o.csv', index=False)