In [None]:
!pip install sklearn

In [None]:
# Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics   # Additional scikit-learn functions
from sklearn.model_selection import GridSearchCV   # Performing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
# Default size (width, height) for every figure drawn in this notebook.
rcParams['figure.figsize'] = 12, 4

# Column names shared by the cells below.
target = 'outcome'   # label column
IDcol = 'org_uuid'   # identifier column, excluded from the predictors

# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
DATA_PATH = r'D:\msc-project\from-github\dissertations-2021-info\rahim_ghani_predict\data\features_with_outcome.csv'

# The codec name was misspelled as 'utf=8'; use the canonical spelling.
data = pd.read_csv(DATA_PATH, encoding='utf-8')

In [None]:
# Peek at the label column before splitting.
data.loc[:, 'outcome']

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
y_data = data['outcome']
X_data = data.drop(columns=['outcome'])

# Stratify on the label so both partitions keep the same class balance.
# random_state pins the shuffle so the split -- and every score computed from
# it -- is reproducible on re-run (27 mirrors the seed given to the classifiers).
# test_size is left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, stratify=y_data, random_state=27)

In [None]:
# Re-attach the labels to each partition so modelfit() can take one frame per split.
train, test = (
    pd.concat((features, labels), axis='columns')
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
def modelfit(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and print a train/test evaluation report.

    The label column name is read from the notebook-level ``target`` variable.

    Parameters
    ----------
    alg : XGBClassifier-like estimator
        Must provide ``get_xgb_params``, ``get_params``, ``set_params``, ``fit``,
        ``predict``, ``predict_proba`` and ``feature_importances_``. It is
        modified in place (parameters are updated, then it is fitted).
    dtrain, dtest : pandas.DataFrame
        Training and hold-out frames; each must contain every column listed in
        ``predictors`` plus the ``target`` column.
    predictors : list of str
        Names of the feature columns.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` on ``dtrain`` with AUC as the metric and set
        ``n_estimators`` to the number of rows of the CV result before fitting.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        The report is printed and a bar chart of the 20 largest feature
        importances is drawn.
    """
    # Use the shared `target` name everywhere instead of mixing it with a
    # hard-coded 'outcome' literal.
    train_X, train_y = dtrain[predictors], dtrain[target]
    test_X, test_y = dtest[predictors], dtest[target]

    if useTrainCV:
        xgtrain = xgb.DMatrix(train_X.values, label=train_y.values)
        cvresult = xgb.cv(
            alg.get_xgb_params(),
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # The number of rows in the CV result becomes the tree count for the final fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # eval_metric is configured on the estimator rather than passed to fit():
    # the fit() keyword was deprecated in XGBoost 1.6 and later removed.
    alg.set_params(eval_metric='auc')
    alg.fit(train_X, train_y)

    # Hard predictions for the threshold metrics, column 1 of predict_proba for AUC.
    train_pred = alg.predict(train_X)
    test_pred = alg.predict(test_X)
    train_prob = alg.predict_proba(train_X)[:, 1]
    test_prob = alg.predict_proba(test_X)[:, 1]

    # Print model report (same lines, same order as before).
    print("\nModel Report")
    for label, scorer in (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("f1", metrics.f1_score),
    ):
        print("Training %s : %.4g" % (label, scorer(train_y, train_pred)))
        print("Test %s : %.4g" % (label, scorer(test_y, test_pred)))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(train_y, train_prob))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(test_y, test_prob))
    print("Training Set Confusion Matrix\n")
    print(metrics.confusion_matrix(train_y, train_pred))
    print("Test Set Confusion Matrix\n")
    print(metrics.confusion_matrix(test_y, test_pred))

    # Plot the 20 largest importances.
    feat_imp = pd.Series(alg.feature_importances_, index=predictors).sort_values(ascending=False).nlargest(n=20)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Rebuild the feature matrix and label vector from the full data set.
X_data = data.drop('outcome', axis='columns')
y_data = data.loc[:, 'outcome']

In [None]:
#splitting data into test and full training set
from sklearn.model_selection import train_test_split

# NOTE(review): this rebinds X_test / y_test, while the `train` / `test` frames
# built earlier still hold the previous split -- confirm which partition the
# later cells are meant to use.
# random_state pins the stratified 80/20 split so it is reproducible on re-run.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_data, y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=27,
)

In [None]:
# Every column except the label and the ID column is a model input.
predictors = [col for col in train.columns if col not in (target, IDcol)]

# Baseline classifier; n_estimators is an upper bound that modelfit() resets
# from its cross-validation run.
xgb1 = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

modelfit(xgb1, train, test, predictors)

In [None]:
# Coarse grid over tree depth and minimum child weight.
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2),
)

# NOTE(review): n_jobs=4 grid workers each build a model with nthread=4 --
# check that this does not oversubscribe the CPU.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch1.fit(train[predictors], train[target])

# Full CV table, then the winning combination and its score.
print(pd.DataFrame(gsearch1.cv_results_), gsearch1.best_params_, gsearch1.best_score_, sep='\n')

In [None]:
# Second pass over depth / child weight with unit steps.
param_test2 = dict(
    max_depth=[8, 9, 10],
    min_child_weight=[2, 3, 4],
)

gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test2,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch2.fit(train[predictors], train[target])

# Full CV table, then the winning combination and its score.
print(pd.DataFrame(gsearch2.cv_results_), gsearch2.best_params_, gsearch2.best_score_, sep='\n')

In [None]:
# With max_depth fixed at 10, search min_child_weight alone.
param_test2b = dict(min_child_weight=[1, 2, 3, 4])

gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test2b,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch2b.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch2b.cv_results_), gsearch2b.best_params_, gsearch2b.best_score_, sep='\n')

In [None]:
# Search gamma from 0.0 to 0.4 in steps of 0.1.
param_test3 = dict(gamma=[tenth / 10.0 for tenth in range(5)])

gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch3.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch3.cv_results_), gsearch3.best_params_, gsearch3.best_score_, sep='\n')

In [None]:
# Re-run the CV-driven fit with max_depth=10 and min_child_weight=2.
xgb2 = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

modelfit(xgb2, train, test, predictors)

In [None]:
# Coarse search over the positive-class weight (odd values 1..9).
param_test4 = dict(scale_pos_weight=range(1, 10, 2))

gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test4,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch4.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch4.cv_results_), gsearch4.best_params_, gsearch4.best_score_, sep='\n')

In [None]:
# Narrow the positive-class weight search to 4, 5 and 6.
param_test4a = dict(scale_pos_weight=[4, 5, 6])

gsearch4a = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test4a,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch4a.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch4a.cv_results_), gsearch4a.best_params_, gsearch4a.best_score_, sep='\n')

In [None]:
# Fine search over the positive-class weight: 0.2 steps between 4.0 and 5.0.
param_test4b = dict(scale_pos_weight=[4.0, 4.2, 4.4, 4.6, 4.8, 5.0])

gsearch4b = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test4b,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch4b.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch4b.cv_results_), gsearch4b.best_params_, gsearch4b.best_score_, sep='\n')

In [None]:
# Re-run the CV-driven fit with the positive class weighted 5x.
xgb3 = XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=10,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=5,
    nthread=4,
    seed=27,
)

modelfit(xgb3, train, test, predictors)

In [None]:
# Coarse grid over row and column sampling: 0.6 to 0.9 in steps of 0.1.
param_test5 = dict(
    subsample=[tenth / 10.0 for tenth in range(6, 10)],
    colsample_bytree=[tenth / 10.0 for tenth in range(6, 10)],
)

gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=5,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test5,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5.fit(train[predictors], train[target])

# Full CV table, then the winning combination and its score.
print(pd.DataFrame(gsearch5.cv_results_), gsearch5.best_params_, gsearch5.best_score_, sep='\n')

In [None]:
# Fine grid over row and column sampling: 0.90 to 0.99 in steps of 0.01
# (10 x 10 combinations).
param_test5a = dict(
    subsample=[pct / 100.0 for pct in range(90, 100)],
    colsample_bytree=[pct / 100.0 for pct in range(90, 100)],
)

gsearch5a = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=5,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test5a,
    scoring='f1',
    n_jobs=4,
    cv=5,
)

gsearch5a.fit(train[predictors], train[target])

# Full CV table, then the winning combination and its score.
print(pd.DataFrame(gsearch5a.cv_results_), gsearch5a.best_params_, gsearch5a.best_score_, sep='\n')

In [None]:
# Search the L1 regularisation strength across several orders of magnitude.
param_test6 = dict(reg_alpha=[1e-5, 1e-2, 0.1, 1, 100])

# NOTE(review): this stage scores on recall whereas the earlier searches use
# F1, and colsample_bytree=0.68 is not a value from either of the
# subsample/colsample grids above -- confirm both are intentional.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.1,
        n_estimators=140,
        max_depth=10,
        min_child_weight=2,
        gamma=0,
        subsample=0.9,
        colsample_bytree=0.68,
        scale_pos_weight=5,
        nthread=4,
        seed=27,
    ),
    param_grid=param_test6,
    scoring='recall',
    n_jobs=4,
    cv=5,
)

gsearch6.fit(train[predictors], train[target])

# Full CV table, then the winning value and its score.
print(pd.DataFrame(gsearch6.cv_results_), gsearch6.best_params_, gsearch6.best_score_, sep='\n')