In [35]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb

# Execute helper_functions.py in this notebook's namespace. Later cells call
# data_import, prep_data, col_check and predict_output, none of which are
# defined in the notebook itself -- presumably they come from this file.
helper_path = "helper_functions.py"
with open(helper_path) as helper_file:
    helper_source = helper_file.read()
exec(compile(helper_source, helper_path, 'exec'))

In [36]:
# Seeded RandomState. Nothing in this cell reads it; kept because code run
# from helper_functions.py shares these globals -- TODO confirm it is needed.
rng = np.random.RandomState(0)

print('Reading in train data..')
train = pd.read_csv('train.csv')
train['type'] = 'train'

print('Reading in test data..')
test = pd.read_csv('test.csv')
test['type'] = 'test'
# The test frame gets empty-string outcome columns (presumably so it carries
# the same columns as the train frame before the two are combined).
for outcome_col in ('OutcomeSubtype', 'OutcomeType'):
    test[outcome_col] = ''

# data_import / prep_data / col_check are not defined in this notebook.
df = data_import(train, test)

print('Running data preparation for train dataset')
train_prepared = prep_data(df, 'train')
X_train, y_train, le_train, X_train_cols = train_prepared

print('Running data preparation for test dataset')
test_prepared = prep_data(df, 'test')
X_test, y_test, le_test, X_test_cols = test_prepared

col_check(X_train_cols, X_test_cols)

Reading in train data..
Reading in test data..
Running feature extraction process..
Dropping unused variables..
Running data preparation for train dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Running data preparation for test dataset
Encoding labels of the outcome variable..
Using one hot encoding for predictor variables..
Columns are the same!!


In [38]:
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

# Pipeline: the imputation step feeds the XGBoost classifier.
# `pipe` is reused by later cells, so its name must stay as is.
imp = Imputer(missing_values=np.nan, axis=0)
gbm = xgb.XGBClassifier()
pipe = Pipeline(steps=[('imp', imp), ('gbm', gbm)])

# Candidate values for the search; only the median strategy is tried here.
strat = ['median']
n_estimators = [20, 50, 100, 200, 500]
max_depth = [3, 4, 5]
learning_rate = [0.1, 0.05, 0.01]
# Not consumed by the grid below -- NOTE(review): looks like a leftover.
n_comp = list(range(1, X_train.shape[1] + 1))

# Keys use the `<step>__<param>` form to address parameters of pipeline steps.
est_space = {
    'imp__strategy': strat,
    'gbm__n_estimators': n_estimators,
    'gbm__max_depth': max_depth,
    'gbm__learning_rate': learning_rate,
}

estimator_gbm = GridSearchCV(pipe, param_grid=est_space, n_jobs=-1, cv=5)

print('Fitting the model to the training dataset..')
estimator_gbm.fit(X_train, y_train)

best_score, best_est = estimator_gbm.best_score_, estimator_gbm.best_estimator_

print('Best score is %r for GBM' % best_score)
print('best estimators are as follows \n %r' % best_est)

# predict_output is defined outside this notebook.
# NOTE(review): the other grid-search cells pass the same 'xgboost_tuned' tag;
# if a file name is derived from it, the runs overwrite each other -- confirm.
predict_output(estimator_gbm, 'xgboost_tuned')

Fitting the model to the training dataset..
Best score is 0.67421153054734562 for GBM
best estimators are as follows 
 Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values=nan, strategy='median', verbose=0)), ('gbm', XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=200,
       nthread=-1, objective='multi:softprob', seed=0, silent=True,
       subsample=1))])
Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [40]:
# Same grid search as before, this time with mean imputation.
# `pipe` and GridSearchCV are not defined in this cell; they must already
# exist in the kernel from an earlier cell.
strat = ['mean']
n_estimators = [20, 50, 100, 200, 500]
max_depth = [3, 4, 5]
learning_rate = [0.1, 0.05, 0.01]
# Not consumed by the grid below -- NOTE(review): looks like a leftover.
n_comp = list(range(1, X_train.shape[1] + 1))

# Keys use the `<step>__<param>` form to address parameters of pipeline steps.
est_space = {
    'imp__strategy': strat,
    'gbm__n_estimators': n_estimators,
    'gbm__max_depth': max_depth,
    'gbm__learning_rate': learning_rate,
}

estimator_gbm = GridSearchCV(pipe, param_grid=est_space, n_jobs=-1, cv=5)

print('Fitting the model to the training dataset..')
estimator_gbm.fit(X_train, y_train)

best_score, best_est = estimator_gbm.best_score_, estimator_gbm.best_estimator_

print('Best score is %r for GBM' % best_score)
print('best estimators are as follows \n %r' % best_est)

# NOTE(review): same 'xgboost_tuned' tag as the other grid-search cells; if
# predict_output derives a file name from it, earlier results are replaced.
predict_output(estimator_gbm, 'xgboost_tuned')

Fitting the model to the training dataset..
Best score is 0.67353810468030972 for GBM
best estimators are as follows 
 Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)), ('gbm', XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=200,
       nthread=-1, objective='multi:softprob', seed=0, silent=True,
       subsample=1))])
Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [39]:
# Same grid search as before, this time with most-frequent-value imputation.
# `pipe` and GridSearchCV are not defined in this cell; they must already
# exist in the kernel from an earlier cell.
strat = ['most_frequent']
n_estimators = [20, 50, 100, 200, 500]
max_depth = [3, 4, 5]
learning_rate = [0.1, 0.05, 0.01]
# Not consumed by the grid below -- NOTE(review): looks like a leftover.
n_comp = list(range(1, X_train.shape[1] + 1))

# Keys use the `<step>__<param>` form to address parameters of pipeline steps.
est_space = {
    'imp__strategy': strat,
    'gbm__n_estimators': n_estimators,
    'gbm__max_depth': max_depth,
    'gbm__learning_rate': learning_rate,
}

estimator_gbm = GridSearchCV(pipe, param_grid=est_space, n_jobs=-1, cv=5)

print('Fitting the model to the training dataset..')
estimator_gbm.fit(X_train, y_train)

best_score, best_est = estimator_gbm.best_score_, estimator_gbm.best_estimator_

print('Best score is %r for GBM' % best_score)
print('best estimators are as follows \n %r' % best_est)

# NOTE(review): same 'xgboost_tuned' tag as the other grid-search cells; if
# predict_output derives a file name from it, earlier results are replaced.
predict_output(estimator_gbm, 'xgboost_tuned')

Fitting the model to the training dataset..
Best score is 0.67421153054734562 for GBM
best estimators are as follows 
 Pipeline(steps=[('imp', Imputer(axis=0, copy=True, missing_values=nan, strategy='most_frequent',
    verbose=0)), ('gbm', XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=5, min_child_weight=1, n_estimators=200,
       nthread=-1, objective='multi:softprob', seed=0, silent=True,
       subsample=1))])
Predicting outcomes..
Correct number of rows
Saving to CSV..
Done!


In [None]:
# TODO: unfinished cell. The original content was the incomplete assignment
# `X_train = ` (no right-hand side), which is a SyntaxError and stops a
# Restart & Run All at this cell. It is commented out until the intended
# value is known; X_train keeps the value assigned by the data-preparation cell.
# X_train =

In [None]:
# Grid search over the XGBoost hyper-parameters with the imputation strategy
# fixed to 'median' on the imputer itself.
# Imputer, Pipeline and GridSearchCV are imported in an earlier cell.
gbm = xgb.XGBClassifier()
imp = Imputer(missing_values=np.nan, strategy='median', axis=0)

pipe = Pipeline(steps=[('imp', imp), ('gbm', gbm)])

n_estimators = [20, 50, 100, 200, 500]
max_depth = [3, 4, 5]
learning_rate = [0.1, 0.05, 0.01]
# Not consumed by the grid below -- NOTE(review): looks like a leftover.
n_comp = list(range(1, X_train.shape[1] + 1))

# Only the booster is tuned here. The grid previously also contained
# `imp__strategy = strat`, but `strat` is never assigned in this cell: its
# value leaked from whichever earlier cell ran last and replaced the
# strategy='median' chosen above. Dropping the entry makes the cell
# self-consistent and independent of execution order.
est_space = dict(gbm__n_estimators=n_estimators,
                 gbm__max_depth=max_depth,
                 gbm__learning_rate=learning_rate)

estimator_gbm = GridSearchCV(pipe, param_grid=est_space, n_jobs=-1, cv=5)

print('Fitting the model to the training dataset..')
estimator_gbm.fit(X_train, y_train)

best_score = estimator_gbm.best_score_
best_est = estimator_gbm.best_estimator_

print('Best score is %r for GBM' % best_score)
print('best estimators are as follows \n %r' % best_est)

# predict_output is defined outside this notebook.
# NOTE(review): the other grid-search cells pass the same 'xgboost_tuned' tag;
# if a file name is derived from it, this run replaces their output -- confirm.
predict_output(estimator_gbm, 'xgboost_tuned')