In [None]:
import sys
!{sys.executable} -m pip install numpy pandas sklearn --user

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as sk
from sklearn.model_selection import GridSearchCV

In [None]:
# Read in the csv data
train_data = pd.read_csv("train_final.csv")
test_data = pd.read_csv("test_final.csv")
# Example the contents
# print(train_data.shape)
# print(train_data.head())
print(test_data.head())

In [None]:
Y = pd.Series(train_data["Y"])
X = train_data.loc[:, "f1":"f24"]

In [None]:
# model.predict_probability() for submission
# play with these params
params={
    'max_depth': [3, 5, 7, 9], #[3,4,5,6,7,8,9], # 5 is good but takes too long in kaggle env
    'subsample': [0.4, 0.6, 0.8, .95], #[0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'colsample_bytree': [0.5, 0.7, 0.9], #[0.5,0.6,0.7,0.8],
    'n_estimators': [100, 500, 1000, 2000], #[1000,2000,3000]
    'reg_alpha': [0.01, 0.03, 0.05], #[0.01, 0.02, 0.03, 0.04],
    'silent': [1],

}

In [32]:
import time
start_time = time.time()

# B.D.E.
A = xgb.XGBClassifier(
    n_estimators = 1000,
    max_depth = 7,
    subsample = 0.8,
    colsample_bytree = 0.7,
    reg_alpha = 0.03,
    silent = 1,
    update = 'grow_gpu',
    tree_method = 'gpu_hist',
    predictor = 'gpu_predictor'
)

A.fit(X, Y)

print("Training one model took: " + str(time.time() - start_time), " to run")

Training one model took: 3.482536792755127  to run


In [33]:
print('XGBoost with grid search')

xbgc_params = {
    'tree_method': ['gpu_hist'],
    'predictor': ['gpu_predictor'],
    'updater': ['grow_gpu']
}
xgb_clf = xgb.XGBClassifier(xbgc_params)

print('Begin GridSearchCV')
rs = GridSearchCV(xgb_clf,
                  params,
                  cv=20,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=False)
rs.fit(X, Y)
# best_est = rs.best_estimator_


XGBoost with grid search
Begin GridSearchCV


GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'tree_method': ['gpu_hist'], 'predictor': ['gpu_predictor'], 'updater': ['grow_gpu']},
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 5, 7, 9], 'subsample': [0.4, 0.6, 0.8, 0.95], 'colsample_bytree': [0.5, 0.7, 0.9], 'n_estimators': [100, 500, 1000, 2000], 'reg_alpha': [0.01, 0.03, 0.05], 'silent': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=False)

In [None]:


# Grab results into csv for checking
results = pd.DataFrame(rs.cv_results_)
results.sort_values(by='rank_test_score', inplace=True)
results.to_csv('training_results.csv', ',')

In [56]:
X_test = test_data.loc[:, "f1":"f24"]

In [96]:
best_model = rs.best_estimator_
print(best_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=1, subsample=0.95)


In [77]:
predictions = best_model.predict_proba(X_test)
formatted_predictions = np.array(predictions)
submission = pd.DataFrame({'Id':test_data.Id, 'Y': formatted_predictions[:,1]})
submission.to_csv('submissions.csv', index=False)

In [97]:
# Okay, lets try an average of models...
print(best_model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0.01, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=1, subsample=0.95)


In [103]:
# Get variables from cv results, grab actual parameters

# Create 5 sets of parameters (max_depth[X,X-1,X+1], subsample +- .05, colsample +- .05,
# n_estimators ((+- half of values), reg_alpha _- .05))

# Fine tune each of the 5 for n_estimators
# Take the test values, run them through all 5 models, take an average, submit and win
model1_ftparams ={
    'colsample_bytree': [0.45, 0.5, 0.55],
    'max_depth': [8, 9, 10],  
    'n_estimators': [250, 500, 750],
    'reg_alpha': [0.005, 0.01, 0.015],
    'subsample': [0.95],
    'silent': [1],
}

ftgrid_1 = GridSearchCV(xgb_clf,
                  model1_ftparams,
                  cv=20,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=False)
ftgrid_1.fit(X, Y)

GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'tree_method': ['gpu_hist'], 'predictor': ['gpu_predictor'], 'updater': ['grow_gpu']},
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'colsample_bytree': [0.45, 0.5, 0.55], 'max_depth': [8, 9, 10], 'n_estimators': [250, 500, 750], 'reg_alpha': [0.005, 0.01, 0.015], 'subsample': [0.95], 'silent': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=False)

In [104]:
model3_ftparams ={
    'colsample_bytree': [0.45, 0.5, 0.55],
    'max_depth': [8, 9, 10],  
    'n_estimators': [500, 1000, 1500],
    'reg_alpha': [0.005, 0.01, 0.015],
    'subsample': [0.95],
    'silent': [1],
}

ftgrid_3 = GridSearchCV(xgb_clf,
                  model3_ftparams,
                  cv=20,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=False)
ftgrid_3.fit(X, Y)

GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'tree_method': ['gpu_hist'], 'predictor': ['gpu_predictor'], 'updater': ['grow_gpu']},
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'colsample_bytree': [0.45, 0.5, 0.55], 'max_depth': [8, 9, 10], 'n_estimators': [500, 1000, 1500], 'reg_alpha': [0.005, 0.01, 0.015], 'subsample': [0.95], 'silent': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=False)

In [105]:
model4_ftparams ={
    'colsample_bytree': [0.45, 0.5, 0.55],
    'max_depth': [8, 9, 10],  
    'n_estimators': [250, 500, 750],
    'reg_alpha': [0.045, 0.05, 0.055],
    'subsample': [0.95],
    'silent': [1],
}

ftgrid_4 = GridSearchCV(xgb_clf,
                  model4_ftparams,
                  cv=20,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=False)
ftgrid_4.fit(X, Y)

GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'tree_method': ['gpu_hist'], 'predictor': ['gpu_predictor'], 'updater': ['grow_gpu']},
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'colsample_bytree': [0.45, 0.5, 0.55], 'max_depth': [8, 9, 10], 'n_estimators': [250, 500, 750], 'reg_alpha': [0.045, 0.05, 0.055], 'subsample': [0.95], 'silent': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=False)

In [106]:
model8_ftparams ={
    'colsample_bytree': [0.45, 0.5, 0.55],
    'max_depth': [8, 9, 10],  
    'n_estimators': [500, 1000, 1500],
    'reg_alpha': [0.025, 0.03, 0.035],
    'subsample': [0.95],
    'silent': [1],
}

ftgrid_8 = GridSearchCV(xgb_clf,
                  model8_ftparams,
                  cv=20,
                  scoring="roc_auc",
                  n_jobs=-1,
                  verbose=False)
ftgrid_8.fit(X, Y)

GridSearchCV(cv=20, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth={'tree_method': ['gpu_hist'], 'predictor': ['gpu_predictor'], 'updater': ['grow_gpu']},
       min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'colsample_bytree': [0.45, 0.5, 0.55], 'max_depth': [8, 9, 10], 'n_estimators': [500, 1000, 1500], 'reg_alpha': [0.025, 0.03, 0.035], 'subsample': [0.95], 'silent': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=False)

In [109]:
ftgrid_1_results = pd.DataFrame(ftgrid_1.cv_results_)
ftgrid_1_results.sort_values(by='rank_test_score', inplace=True)
ftgrid_1_results.to_csv('model1_ft.csv', ',')
ftgrid_3_results = pd.DataFrame(ftgrid_3.cv_results_)
ftgrid_3_results.sort_values(by='rank_test_score', inplace=True)
ftgrid_3_results.to_csv('model3_ft.csv', ',')
ftgrid_4_results = pd.DataFrame(ftgrid_4.cv_results_)
ftgrid_4_results.sort_values(by='rank_test_score', inplace=True)
ftgrid_4_results.to_csv('model4_ft.csv', ',')
ftgrid_8_results = pd.DataFrame(ftgrid_8.cv_results_)
ftgrid_8_results.sort_values(by='rank_test_score', inplace=True)
ftgrid_8_results.to_csv('model8_ft.csv', ',')



In [111]:
model1_ft_predictions = np.array(ftgrid_1.best_estimator_.predict_proba(X_test))[:,1]
model3_ft_predictions = np.array(ftgrid_3.best_estimator_.predict_proba(X_test))[:,1]
model4_ft_predictions = np.array(ftgrid_4.best_estimator_.predict_proba(X_test))[:,1]
model8_ft_predictions = np.array(ftgrid_8.best_estimator_.predict_proba(X_test))[:,1]

In [113]:
avg_ft_predictions = (model1_ft_predictions + model3_ft_predictions + model4_ft_predictions + model8_ft_predictions) / 4

In [115]:
print(avg_ft_predictions)

[0.9159833  0.97035176 0.9999771  ... 0.9998962  0.9978585  0.9926251 ]


In [116]:
submission = pd.DataFrame({'Id':test_data.Id, 'Y': avg_ft_predictions})
submission.to_csv('submissions1.csv', index=False)

In [None]:
# Okay, let's try to avoid overfitting now, calibrate learning curve
