In [58]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

In [59]:
train_raw = pd.read_csv("/Users/reetkanjilal/Hack/AVD_WNS/train_mod.csv")
test_raw = pd.read_csv("/Users/reetkanjilal/Hack/AVD_WNS/test_mod.csv")

recruitment_channel

In [65]:
train_raw['recruitment_channel'].value_counts()

other       24400
sourcing    18545
referred      907
Name: recruitment_channel, dtype: int64

In [66]:
train_raw['recruitment_channel_sourcing'] = np.where(train_raw['recruitment_channel'] == 'sourcing',1,0)

test_raw['recruitment_channel_sourcing'] = np.where(test_raw['recruitment_channel'] == 'sourcing',1,0)

train_raw['recruitment_channel_sourcing'].value_counts()

0    25307
1    18545
Name: recruitment_channel_sourcing, dtype: int64

In [67]:
train_raw['recruitment_channel_referred'] = np.where(train_raw['recruitment_channel'] == 'referred',1,0)

test_raw['recruitment_channel_referred'] = np.where(test_raw['recruitment_channel'] == 'referred',1,0)

train_raw['recruitment_channel_referred'].value_counts()

0    42945
1      907
Name: recruitment_channel_referred, dtype: int64

education

In [70]:
train_raw['education'].value_counts()

Bachelor's          29336
Master's & above    11918
Below Secondary       640
Name: education, dtype: int64

In [71]:
train_raw['education_MS'] = np.where(train_raw['education'] == "Master's & above",1,0)

test_raw['education_MS'] = np.where(test_raw['education'] == "Master's & above",1,0)

print ('MS count:',train_raw['education_MS'].value_counts())

train_raw['education_Sec'] = np.where(train_raw['education'] == 'Below Secondary',1,0)

test_raw['education_Sec'] = np.where(test_raw['education'] == 'Below Secondary',1,0)

print ('Sec count:', train_raw['education_Sec'].value_counts())

MS count: 0    31934
1    11918
Name: education_MS, dtype: int64
Sec count: 0    43212
1      640
Name: education_Sec, dtype: int64


gender

In [74]:
print ('Gender cnt:', train_raw['gender'].value_counts())

train_raw['gender_m'] = np.where(train_raw['gender'] == "m",1,0)

test_raw['gender_m'] = np.where(test_raw['gender'] == "m",1,0)

print ('gender_m cnt',train_raw['gender_m'].value_counts())


Gender cnt: m    30788
f    13064
Name: gender, dtype: int64
gender_m cnt 1    30788
0    13064
Name: gender_m, dtype: int64


In [76]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43852 entries, 0 to 43851
Data columns (total 31 columns):
Unnamed: 0                      43852 non-null int64
employee_id                     43852 non-null int64
department                      43852 non-null object
region                          43852 non-null object
education                       41894 non-null object
gender                          43852 non-null object
recruitment_channel             43852 non-null object
no_of_trainings                 43852 non-null int64
age                             43852 non-null int64
previous_year_rating            43852 non-null int64
length_of_service               43852 non-null int64
KPIs_met >80%                   43852 non-null int64
awards_won?                     43852 non-null int64
avg_training_score              43852 non-null int64
is_promoted                     43852 non-null int64
age_group_1                     43852 non-null int64
age_group_2                     43852 

In [77]:
train_mod = train_raw.drop(['employee_id','Unnamed: 0','department','region','education','recruitment_channel','gender',], axis=1)

test_mod= test_raw.drop(['employee_id','Unnamed: 0','department','region','education','recruitment_channel','gender',], axis=1)

In [78]:
train_mod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43852 entries, 0 to 43851
Data columns (total 24 columns):
no_of_trainings                 43852 non-null int64
age                             43852 non-null int64
previous_year_rating            43852 non-null int64
length_of_service               43852 non-null int64
KPIs_met >80%                   43852 non-null int64
awards_won?                     43852 non-null int64
avg_training_score              43852 non-null int64
is_promoted                     43852 non-null int64
age_group_1                     43852 non-null int64
age_group_2                     43852 non-null int64
age_group_3                     43852 non-null int64
avg_traiing_score_1             43852 non-null int64
avg_traiing_score_2             43852 non-null int64
avg_traiing_score_3             43852 non-null int64
avg_traiing_score_4             43852 non-null int64
avg_traiing_score_5             43852 non-null int64
avg_traiing_score_6             43852 non-n

In [79]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


In [80]:
Y = train_mod['is_promoted'].values
X = train_mod.drop(['is_promoted'], axis=1)

In [81]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

In [90]:
xgb = XGBClassifier(learning_rate=0.02, n_estimators=500, objective='binary:logistic',
                    silent=True, nthread=1)

In [101]:
folds = 3
param_comb = 100

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,Y), verbose=3, random_state=1001 )

In [102]:
# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X, Y)
timer(start_time) # timing ends here for "start_time" variable

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3 
[CV] subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3 
[CV] subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3 
[CV] subsample=0.6, min_child_weight=1, gamma=1.5, colsample_bytree=0.8, max_depth=5 
[CV]  subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3, score=0.8386547226567144, total=  11.3s
[CV] subsample=0.6, min_child_weight=1, gamma=1.5, colsample_bytree=0.8, max_depth=5 
[CV]  subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3, score=0.8404491428301374, total=  11.4s
[CV] subsample=0.6, min_child_weight=1, gamma=1.5, colsample_bytree=0.8, max_depth=5 
[CV]  subsample=1.0, min_child_weight=5, gamma=5, colsample_bytree=1.0, max_depth=3, score=0.8324852367868386, total=  11.7s
[CV] subsample=0.8, min_child_weight=5, gamma=1, col

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.6min


[CV] subsample=0.8, min_child_weight=1, gamma=1.5, colsample_bytree=0.6, max_depth=4 
[CV]  subsample=0.8, min_child_weight=1, gamma=0.5, colsample_bytree=0.6, max_depth=5, score=0.8444040572313001, total=  13.8s
[CV] subsample=0.8, min_child_weight=1, gamma=1.5, colsample_bytree=0.6, max_depth=4 
[CV]  subsample=0.8, min_child_weight=1, gamma=0.5, colsample_bytree=0.6, max_depth=5, score=0.8342604675310926, total=  13.8s
[CV] subsample=0.8, min_child_weight=1, gamma=1.5, colsample_bytree=0.6, max_depth=4 
[CV]  subsample=0.8, min_child_weight=1, gamma=0.5, colsample_bytree=0.6, max_depth=5, score=0.843393702856557, total=  14.0s
[CV] subsample=1.0, min_child_weight=1, gamma=0.5, colsample_bytree=0.8, max_depth=4 
[CV]  subsample=0.8, min_child_weight=1, gamma=1.5, colsample_bytree=0.6, max_depth=4, score=0.8429977020049859, total=  11.4s
[CV] subsample=1.0, min_child_weight=1, gamma=0.5, colsample_bytree=0.8, max_depth=4 
[CV]  subsample=0.8, min_child_weight=1, gamma=1.5, colsample_b

[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  8.6min


[CV] subsample=0.8, min_child_weight=1, gamma=5, colsample_bytree=1.0, max_depth=5 
[CV]  subsample=1.0, min_child_weight=10, gamma=1.5, colsample_bytree=1.0, max_depth=5, score=0.8440883044459411, total=  21.7s
[CV] subsample=0.8, min_child_weight=1, gamma=5, colsample_bytree=1.0, max_depth=5 
[CV]  subsample=1.0, min_child_weight=10, gamma=1.5, colsample_bytree=1.0, max_depth=5, score=0.8345433961309973, total=  21.7s
[CV] subsample=0.8, min_child_weight=1, gamma=5, colsample_bytree=1.0, max_depth=5 
[CV]  subsample=1.0, min_child_weight=10, gamma=1.5, colsample_bytree=1.0, max_depth=5, score=0.840805960724855, total=  21.4s
[CV] subsample=0.6, min_child_weight=5, gamma=1.5, colsample_bytree=0.6, max_depth=3 
[CV]  subsample=0.6, min_child_weight=5, gamma=1.5, colsample_bytree=0.6, max_depth=3, score=0.8398480310495162, total=  10.0s
[CV] subsample=0.6, min_child_weight=5, gamma=1.5, colsample_bytree=0.6, max_depth=3 
[CV]  subsample=0.8, min_child_weight=1, gamma=5, colsample_bytree

[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed: 20.2min


[CV] subsample=0.8, min_child_weight=10, gamma=1, colsample_bytree=1.0, max_depth=4 
[CV]  subsample=0.6, min_child_weight=1, gamma=1.5, colsample_bytree=1.0, max_depth=4, score=0.8358527310851391, total=  18.1s
[CV] subsample=0.8, min_child_weight=10, gamma=1, colsample_bytree=1.0, max_depth=4 
[CV]  subsample=0.8, min_child_weight=10, gamma=1, colsample_bytree=1.0, max_depth=4, score=0.8440413993549074, total=  16.6s
[CV] subsample=0.8, min_child_weight=10, gamma=5, colsample_bytree=0.6, max_depth=5 
[CV]  subsample=0.6, min_child_weight=1, gamma=1.5, colsample_bytree=1.0, max_depth=4, score=0.8421139664049315, total=  18.2s
[CV] subsample=0.8, min_child_weight=10, gamma=5, colsample_bytree=0.6, max_depth=5 
[CV]  subsample=0.8, min_child_weight=10, gamma=1, colsample_bytree=1.0, max_depth=4, score=0.8347223754990094, total=  16.6s
[CV] subsample=0.8, min_child_weight=10, gamma=5, colsample_bytree=0.6, max_depth=5 
[CV]  subsample=0.8, min_child_weight=10, gamma=5, colsample_bytree=0

[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed: 21.3min finished



 Time taken: 0 hours 21 minutes and 34.49 seconds.


In [105]:
random_search.best_params_

{'colsample_bytree': 0.8,
 'gamma': 5,
 'max_depth': 5,
 'min_child_weight': 1,
 'subsample': 0.8}

In [107]:
# A parameter grid for XGBoost
params = {'colsample_bytree': 0.8,
            'gamma': 5,
             'max_depth': 5,
             'min_child_weight': 1,
             'subsample': 0.8}

In [115]:
xgb = XGBClassifier(
    learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 5,
 min_child_weight= 1,
 gamma=1,
#  gamma=5,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,verbose=3, random_state=1001 ).fit(X, Y)

In [113]:
pred_test = pd.DataFrame(xgb.predict(test_mod))

  if diff:


In [114]:
pred_test['employee_id'] = test_raw['employee_id']
pred_test.to_csv('xgb1.csv')


# New Model stacking with h2o

In [116]:
import pandas as pd 
import json
import h2o
# h2o.init(ip="10.0.21.30", port=54321)
from h2o.automl import H2OAutoML
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.grid.grid_search import H2OGridSearch
import numpy as np

In [118]:
h2o.init(max_mem_size="12G")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_112"; Java(TM) SE Runtime Environment (build 1.8.0_112-b16); Java HotSpot(TM) 64-Bit Server VM (build 25.112-b16, mixed mode)
  Starting server from /Users/reetkanjilal/ml/IIMB_kaggle/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/pg/3ws7rkfj09d1lc0slmsvt_tw0000gn/T/tmpeewy7r8l
  JVM stdout: /var/folders/pg/3ws7rkfj09d1lc0slmsvt_tw0000gn/T/tmpeewy7r8l/h2o_reetkanjilal_started_from_python.out
  JVM stderr: /var/folders/pg/3ws7rkfj09d1lc0slmsvt_tw0000gn/T/tmpeewy7r8l/h2o_reetkanjilal_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.7
H2O cluster version age:,15 days
H2O cluster name:,H2O_from_python_reetkanjilal_781sz8
H2O cluster total nodes:,1
H2O cluster free memory:,10.67 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [152]:
# GBM hyperparameters

gbm_params1 = {'learn_rate': [0.01, 0.1],
                'max_depth': [3, 5, 9],
                'sample_rate': [0.8, 1.0],
                'col_sample_rate': [0.2, 0.5, 1.0]}

# Train and validate a cartesian grid of GBMs
gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator,
                          grid_id='gbm_grid1',
                          hyper_params=gbm_params1)
gbm_grid1.train(x=x, y="is_promoted",
                training_frame=train,
                validation_frame=valid,
                ntrees=500,
                seed=1)




gbm Grid Build progress: |█████████████████████████████████████████████████| 100%


In [157]:
# gbm_grid1.get_grid()
# gbm_grid1.models[0]
best_gbm1 = gbm_grid1.models[0]
preds = best_gbm1.predict(testh2o).as_data_frame()
preds['employee_id'] = test_raw['employee_id']
preds.to_csv("gbm_tune.csv")

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [150]:
# Get the grid results, sorted by validation MAE
gbm_gridperf1 = gbm_grid1.get_grid()

NameError: name 'gbm_grid1' is not defined

In [126]:
y = 'is_promoted'
drop_clmn = ['employee_id','Unnamed: 0','department','region','education','recruitment_channel','gender']


train_data = train_raw.drop(drop_clmn,axis=1)
test_data = test_raw.drop(drop_clmn,axis=1)

trainh2o = h2o.H2OFrame(train_data)
testh2o = h2o.H2OFrame(test_data)

train, valid = trainh2o.split_frame(ratios=[0.9], seed=1)



  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [132]:
y = 'is_promoted'
x = list(train.columns)
x.remove(y)


# For binary classification, response should be a factor
train[y] = train[y].asfactor()
valid[y] = valid[y].asfactor()

In [133]:
def plot_perf(grid, test):
    print('best')
    for index, model in enumerate(grid.models):
        print ('model',index)
        print ('id',model.model_id)
        perf_grid = model.model_performance(test)
        print ('gini:',perf_grid.gini())
        print ('auc:',perf_grid.auc())
        print ('--------\n\n')

#for ensamble
nfolds=2

#same random seed or ensemble wont work
#set in all model :'keep_cross_validation_predictions':True, and 'fold_assignment':"Modulo",
seed=1
search_criteria = {"strategy": "RandomDiscrete", "max_models": 2, "seed": seed}

In [148]:
hyper_parameters = {
                    'max_depth':[4,6,8,10],
                    "ntrees":[300, 250, 350, 400, 500],
                    "sample_rate": [0.8, 0.7,0.6],
}

params = {
          "balance_classes": True,
          "seed":1234,
          "fold_assignment":"Modulo",
          "nfolds":nfolds,
          "keep_cross_validation_predictions":True,
          "stopping_rounds":10,
          "stopping_metric":'AUC'}
          

grid_3 = H2OGridSearch(H2ORandomForestEstimator(**params
                                   
                                   ), hyper_params=hyper_parameters,
                                    search_criteria=search_criteria,)

grid_3.train(x=x, y=y, training_frame=train,validation_frame=valid)
grid_3.show()

drf Grid Build progress: |███ (cancelled)


H2OJobCancelled: Job<$03017f00000132d4ffffffff$_a73b3caa5b7c7245cf0eabd311ea4656> was cancelled by the user.

In [135]:
plot_perf(grid_3, valid)


best
model 0
id Grid_DRF_py_34_sid_b1b6_model_python_1537101360347_2_model_1
gini: 0.5252403189688133
auc: 0.7626201594844066
--------


model 1
id Grid_DRF_py_34_sid_b1b6_model_python_1537101360347_2_model_0
gini: 0.525619913703643
auc: 0.7628099568518215
--------




NameError: name 'test' is not defined

In [136]:
hyper_parameters = {#
    'activation':["Maxout"],#["Rectifier","Tanh","Maxout","RectifierWithDropout","TanhWithDropout","MaxoutWithDropout"],
    'epochs': [12],
    'hidden':[ [10, 5] ],
    'input_dropout_ratio': [0.01, 0.05],
    'rate': [0.01, 0.05],
    #'rate_annealing':[1e-9, 1e-8, 1e-7]    
}

params = {
                    
          'score_interval':1,
          'stopping_rounds':5,
          'stopping_metric':'AUC',
          'balance_classes' : True,
          'nfolds':nfolds,
          'seed':seed,
          'keep_cross_validation_predictions':True,
          'fold_assignment':"Modulo",

          }

grid_4 = H2OGridSearch(H2ODeepLearningEstimator(**params),hyper_params=hyper_parameters,
                                    search_criteria=search_criteria)

grid_4.train(x=x, y=y, training_frame=train,validation_frame=valid)
grid_4.show()
plot_perf(grid_4, valid)

deeplearning Grid Build progress: |███████████████████████████████████████| 100%
    activation              epochs   hidden input_dropout_ratio  rate  \
0       Maxout  12.297120359927648  [10, 5]                0.01  0.05   
1       Maxout  12.297120359927648  [10, 5]                0.05  0.01   

                                                               model_ids  \
0  Grid_DeepLearning_py_34_sid_b1b6_model_python_1537101360347_77_mod...   
1  Grid_DeepLearning_py_34_sid_b1b6_model_python_1537101360347_77_mod...   

               logloss  
0  0.23532383010838226  
1  0.24084975443019707  
best
model 0
id Grid_DeepLearning_py_34_sid_b1b6_model_python_1537101360347_77_model_0
gini: 0.6245384783439838
auc: 0.8122692391719919
--------


model 1
id Grid_DeepLearning_py_34_sid_b1b6_model_python_1537101360347_77_model_1
gini: 0.6100223933584576
auc: 0.8050111966792288
--------




In [137]:
hyper_parameters = { 'alpha': [0.2, 0.3, 0.4,  0.5, 0.6, 0.7, 0.8], 
                     'lambda': [1e-10, 1e-9, 1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0] }
params = {
          'family':'binomial',
          'nfolds':nfolds,
          'seed':seed, 
          'fold_assignment':"Modulo",
          'keep_cross_validation_predictions':True,
          }
grid_5 = H2OGridSearch( H2OGeneralizedLinearEstimator(**params), hyper_params=hyper_parameters,
                                    search_criteria=search_criteria)
grid_5.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_5.show()
plot_perf(grid_5, valid)

glm Grid Build progress: |████████████████████████████████████████████████| 100%
     alpha   lambda  \
0    [0.2]  [0.001]   
1    [0.8]    [0.1]   

                                                        model_ids  \
0  Grid_GLM_py_34_sid_b1b6_model_python_1537101360347_128_model_1   
1  Grid_GLM_py_34_sid_b1b6_model_python_1537101360347_128_model_0   

               logloss  
0   0.2380114346878762  
1  0.29650758828966534  
best
model 0
id Grid_GLM_py_34_sid_b1b6_model_python_1537101360347_128_model_1
gini: 0.5782722977770496
auc: 0.7891361488885248
--------


model 1
id Grid_GLM_py_34_sid_b1b6_model_python_1537101360347_128_model_0
gini: 0.0
auc: 0.5
--------




In [138]:
hyper_parameters = {"ntrees":[50],
                "learn_rate": [0.05, 0.01],
                "max_depth": [4],
                "sample_rate": [0.8, 0.9, 0.7],
                "col_sample_rate": [ 0.6, 0.7, 0.8]}

params = {
          "balance_classes": True,
          "seed":seed,
          "fold_assignment":"Modulo",
          "nfolds":nfolds,
          "keep_cross_validation_predictions":True,
          "stopping_tolerance":0.0005,
          "stopping_metric":'AUC',
          "stopping_rounds":10,
          "score_each_iteration":True,
          "score_tree_interval":50          
          }

# Train the grid
grid_1 = H2OGridSearch(model=H2OGradientBoostingEstimator(**params),
                     hyper_params=hyper_parameters,
                     search_criteria=search_criteria,
                     grid_id="gbm_grid_binomial")
grid_1.train(x=x, y=y, training_frame=train, validation_frame=valid)
grid_1.show()
plot_perf(grid_1, valid)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%
    col_sample_rate learn_rate max_depth ntrees sample_rate  \
0               0.7       0.05         4     50         0.8   
1               0.7       0.01         4     29         0.7   

                   model_ids             logloss  
0  gbm_grid_binomial_model_0  0.2276483814928897  
1  gbm_grid_binomial_model_1  0.2735887995540484  
best
model 0
id gbm_grid_binomial_model_0
gini: 0.6494087607187722
auc: 0.8247043803593861
--------


model 1
id gbm_grid_binomial_model_1
gini: 0.6261647277295319
auc: 0.813082363864766
--------




In [139]:
base_models=[grid_1.model_ids[0],grid_3.model_ids[0],grid_4.model_ids[0], grid_5.model_ids[0]]
ensemble = H2OStackedEnsembleEstimator(model_id="my_ensemble_1",base_models=base_models)


In [140]:
ensemble.train(x=x, y=y, training_frame=train)

stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [142]:
test_pred = ensemble.predict(testh2o)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [147]:
pred_ens = ensemble.predict(testh2o).as_data_frame()
# test_pred = test_pred.as_data_frame()
pred_ens['employee_id'] = test_raw['employee_id']
pred_ens.to_csv("ensmbl1.csv")

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [145]:


test_id = test_raw['employee_id']
submission = pd.concat((h2o.as_list(test_id), h2o.as_list(test_pred['predict'])), axis=1, ignore_index=True)


H2OTypeError: Argument `data` should be an H2OFrame, got Series 0         8724
1        74430
2        72255
3        38562
4        64486
5        46232
6        54542
7        67269
8        66174
9        76303
10       60245
11       42639
12       30963
13       54055
14       42996
15       12737
16       27561
17       26622
18       31582
19       29793
20       72735
21        5677
22       60889
23       51498
24        8566
25       53151
26       16203
27       61083
28       74849
29       13259
         ...  
23460    72190
23461    27155
23462    15959
23463    72900
23464    57683
23465    55174
23466    51115
23467    50134
23468    59775
23469    14408
23470     4351
23471    59625
23472    29379
23473    67979
23474    61451
23475    73871
23476    52090
23477    18488
23478    59673
23479    39410
23480    11837
23481    78278
23482    27284
23483    49291
23484    47537
23485    53478
23486    25600
23487    45409
23488     1186
23489     5973
Name: employee_id, Length: 23490, dtype: int64

In [125]:
drop_clmn = ['employee_id','Unnamed: 0','department','region','education','recruitment_channel','gender']
train_raw.drop(drop_clmn,axis=1).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43852 entries, 0 to 43851
Data columns (total 24 columns):
no_of_trainings                 43852 non-null int64
age                             43852 non-null int64
previous_year_rating            43852 non-null int64
length_of_service               43852 non-null int64
KPIs_met >80%                   43852 non-null int64
awards_won?                     43852 non-null int64
avg_training_score              43852 non-null int64
is_promoted                     43852 non-null int64
age_group_1                     43852 non-null int64
age_group_2                     43852 non-null int64
age_group_3                     43852 non-null int64
avg_traiing_score_1             43852 non-null int64
avg_traiing_score_2             43852 non-null int64
avg_traiing_score_3             43852 non-null int64
avg_traiing_score_4             43852 non-null int64
avg_traiing_score_5             43852 non-null int64
avg_traiing_score_6             43852 non-n

In [49]:
# from h2o.estimators import H2OXGBoostEstimator
from h2o.estimators.xgboost import H2OXGBoostEstimator

In [50]:
# GBM hyperparameters
x = trainh2o.columns
y = "is_promoted"
x.remove(y)
ID = 'employee_id'
x.remove(ID)
x.remove(x[0])




param = {
      "ntrees" : 10
    , "min_rows" : 5
    , "seed": 4241
    , "score_tree_interval": 1
}



xgb_params1 = {'learn_rate': [i * 0.01 for i in range(1, 3)],
               'max_depth': list(range(4, 7)),
               'sample_rate': [i * 0.1 for i in range(8, 10)],
               'col_sample_rate': [i * 0.1 for i in range(8, 10)]}

# Train and validate a cartesian grid of GBMs
xgb_grid1 = H2OGridSearch(H2OXGBoostEstimator(**param),
                          grid_id='xgb_grid1',
                          hyper_params=xgb_params1)



In [None]:
{'learn_rate': [i * 0.01 for i in range(1, 3)],
               'max_depth': list(range(4, 7)),
               'sample_rate': [i * 0.1 for i in range(8, 10)],
               'col_sample_rate': [i * 0.1 for i in range(8, 10)]}



In [45]:
[i * 0.1 for i in range(8, 10)]

[0.8, 0.9]

['department',
 'region',
 'education',
 'gender',
 'recruitment_channel',
 'no_of_trainings',
 'age',
 'previous_year_rating',
 'length_of_service',
 'KPIs_met >80%',
 'awards_won?',
 'avg_training_score',
 'age_group_1',
 'age_group_2',
 'age_group_3',
 'avg_traiing_score_1',
 'avg_traiing_score_2',
 'avg_traiing_score_3',
 'avg_traiing_score_4',
 'avg_traiing_score_5',
 'avg_traiing_score_6',
 'avg_traiing_score_7',
 'avg_traiing_score_8']

In [54]:
train, valid = trainh2o.split_frame(ratios=[0.9], seed=123)
# For binary classification, response should be a factor
# y = "is_promoted"

train['is_promoted'] = train['is_promoted'].asfactor()
valid['is_promoted'] = valid['is_promoted'].asfactor()

In [56]:
xgb_grid1.train(x=x, y='is_promoted', training_frame=train,validation_frame=valid)

xgboost Grid Build progress: |████████████████████████████████████████████| 100%


H2OResponseError: Server error water.exceptions.H2OIllegalArgumentException:
  Error: Failed to find ModelMetrics for criterion: logloss
  Request: GET /99/Grids/xgb_grid1


In [55]:
list(range(1, train.shape[1]))

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25]

In [16]:
xgb_grid1.get_grid(sort_by='residual_deviance', decreasing=True)

     col_sample_rate learn_rate max_depth sample_rate           model_ids  \
0                1.0        0.1         5         0.8   xgb_grid1_model_7   
1                0.8        0.1         5         0.8   xgb_grid1_model_6   
2                0.8        0.1         5         1.0  xgb_grid1_model_14   
3                1.0        0.1         5         1.0  xgb_grid1_model_15   
4                0.8        0.1         3         0.8   xgb_grid1_model_2   
5                1.0        0.1         3         1.0  xgb_grid1_model_11   
6                1.0        0.1         3         0.8   xgb_grid1_model_3   
7                0.8        0.1         3         1.0  xgb_grid1_model_10   
8                1.0       0.01         5         0.8   xgb_grid1_model_5   
9                1.0       0.01         5         1.0  xgb_grid1_model_13   
10               0.8       0.01         5         0.8   xgb_grid1_model_4   
11               0.8       0.01         5         1.0  xgb_grid1_model_12   



In [15]:
xgb_grid1.models[0]

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  xgb_grid1_model_7


ModelMetricsRegression: xgboost
** Reported on train data. **

MSE: 0.08251473989140551
RMSE: 0.28725379003836576
MAE: 0.25291253033385175
RMSLE: 0.2250450924014752
Mean Residual Deviance: 0.08251473989140551

ModelMetricsRegression: xgboost
** Reported on validation data. **

MSE: 0.08253213353006826
RMSE: 0.2872840641770237
MAE: 0.2526148645632865
RMSLE: 0.2249142672601103
Mean Residual Deviance: 0.08253213353006826
Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2018-09-16 13:14:54,6.565 sec,0.0,0.5,0.5,0.25,0.5,0.5,0.25
,2018-09-16 13:14:54,6.717 sec,1.0,0.4625531,0.4620585,0.2139554,0.4625056,0.4620128,0.2139115
,2018-09-16 13:14:54,6.790 sec,2.0,0.4301599,0.4282733,0.1850375,0.4300923,0.4281968,0.1849794
,2018-09-16 13:14:54,6.860 sec,3.0,0.4020010,0.3978799,0.1616048,0.4018213,0.3976968,0.1614603
,2018-09-16 13:14:54,6.926 sec,4.0,0.3773379,0.3702481,0.1423839,0.3771466,0.3700177,0.1422395
,2018-09-16 13:14:54,6.996 sec,5.0,0.3562779,0.3455445,0.1269339,0.3560119,0.3452459,0.1267445
,2018-09-16 13:14:54,7.066 sec,6.0,0.3382658,0.3233156,0.1144237,0.3380691,0.3230370,0.1142907
,2018-09-16 13:14:54,7.133 sec,7.0,0.3227125,0.3031866,0.1041434,0.3226687,0.3029526,0.1041151
,2018-09-16 13:14:54,7.202 sec,8.0,0.3088720,0.2846401,0.0954019,0.3087941,0.2843498,0.0953538


Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
avg_training_score,88.0,1.0,0.3333333
previous_year_rating,34.0,0.3863636,0.1287879
awards_won?,22.0,0.25,0.0833333
KPIs_met>80%,19.0,0.2159091,0.0719697
Unnamed:0,14.0,0.1590909,0.0530303
---,---,---,---
region.region_23,1.0,0.0113636,0.0037879
recruitment_channel.other,1.0,0.0113636,0.0037879
region.region_26,1.0,0.0113636,0.0037879



See the whole table with table.as_data_frame()




In [21]:
# Get the grid results, sorted by validation MAE
xgb_gridperf1 = xgb_grid1.get_grid()


In [22]:
best_xgb1 = xgb_gridperf1.models[0]

In [24]:
pred_xgb = best_xgb1.predict(testh2o).as_data_frame()
pred_xgb['employee_id'] = testh2o['employee_id'].as_data_frame()
# pred_xgb.to_csv("pred_xgb2.csv")

xgboost prediction progress: |████████████████████████████████████████████| 100%


In [25]:
pred_xgb

Unnamed: 0,predict,employee_id
0,0.299296,8724
1,0.175754,74430
2,0.175754,72255
3,0.197159,38562
4,0.190600,64486
5,0.197159,46232
6,0.255706,54542
7,0.197159,67269
8,0.197159,66174
9,0.287912,76303


In [3]:
train_raw.fillna(0,inplace=True)
test_raw.fillna(0,inplace=True)