In [80]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [56]:
import GeneralModel as gm

In [57]:
va_split = pd.read_csv('../../DataPlus/va_split.csv')
dvd_split = pd.read_csv('../../DataPlus/dvd_split.csv')

In [58]:
va_df = gm.prepare_df(va_split, ['age'], ['gleason'], 'txgot_binary')
dvd_df = gm.prepare_df(dvd_split, ['age'], ['gleason'], 'txgot_binary')

# of Data Points: 216
# of Data Points: 176


## First XGBoost Model

In [59]:
va_xg = XGBClassifier()
dvd_xg = XGBClassifier()

In [60]:
va_X = va_df.drop(['txgot_binary'], axis=1).values
va_y = va_df['txgot_binary'].values
dvd_X = dvd_df.drop(['txgot_binary'], axis=1).values
dvd_y = dvd_df['txgot_binary'].values

In [61]:
train_X_va, test_X_va, train_y_va, test_y_va = train_test_split(va_X, va_y, test_size=0.6)
train_X_dvd, test_X_dvd, train_y_dvd, test_y_dvd = train_test_split(dvd_X, dvd_y, test_size=0.6)

In [62]:
va_xg.fit(train_X_va, train_y_va, verbose=False)
dvd_xg.fit(train_X_dvd, train_y_dvd, verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [63]:
predictions_va = va_xg.predict(test_X_va)
predictions_dvd = dvd_xg.predict(test_X_dvd)

In [64]:
print("VA confusion matrix:")
print(confusion_matrix(predictions_va, test_y_va))
print(classification_report(predictions_va, test_y_va))
print("DVD confusion matrix:")
print(confusion_matrix(predictions_dvd, test_y_dvd))
print(classification_report(predictions_dvd, test_y_dvd))

VA confusion matrix:
[[49 15]
 [20 46]]
             precision    recall  f1-score   support

        0.0       0.71      0.77      0.74        64
        1.0       0.75      0.70      0.72        66

avg / total       0.73      0.73      0.73       130

DVD confusion matrix:
[[62 10]
 [21 13]]
             precision    recall  f1-score   support

        0.0       0.75      0.86      0.80        72
        1.0       0.57      0.38      0.46        34

avg / total       0.69      0.71      0.69       106



## Playing Around with # of Estimators

In [65]:
#Creating a function to determin the best fit n_estimators.
def xgbr(n_estimator,X_train,X_test,y_train,y_test):
    my_model = XGBClassifier(n_estimators=n_estimator)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    return str(mean_absolute_error(pred, y_test))

In [66]:
for estimators in [100,500,1000,1500,2000,2500]:
    mae=xgbr(estimators,train_X_va,test_X_va,train_y_va,test_y_va)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))
print()
for estimators in [100,500,1000,1500,2000,2500]:
    mae=xgbr(estimators,train_X_dvd,test_X_dvd,train_y_dvd,test_y_dvd)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))

VA Evaluation:
Number of estimators: 100  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 500  		 Mean Absolute Error:  0.2923076923076923
VA Evaluation:
Number of estimators: 1000  		 Mean Absolute Error:  0.2923076923076923
VA Evaluation:
Number of estimators: 1500  		 Mean Absolute Error:  0.2923076923076923
VA Evaluation:
Number of estimators: 2000  		 Mean Absolute Error:  0.2923076923076923
VA Evaluation:
Number of estimators: 2500  		 Mean Absolute Error:  0.2923076923076923

VA Evaluation:
Number of estimators: 100  		 Mean Absolute Error:  0.29245283018867924
VA Evaluation:
Number of estimators: 500  		 Mean Absolute Error:  0.2830188679245283
VA Evaluation:
Number of estimators: 1000  		 Mean Absolute Error:  0.2830188679245283
VA Evaluation:
Number of estimators: 1500  		 Mean Absolute Error:  0.2830188679245283
VA Evaluation:
Number of estimators: 2000  		 Mean Absolute Error:  0.2830188679245283
VA Evaluation:
Number of estimators: 2500  		

Error doesn't change much after 500 estimators, probably even less. Then again, we have a really small dataset.

In [67]:
for estimators in [10, 20, 30, 40, 50]:
    mae=xgbr(estimators,train_X_va,test_X_va,train_y_va,test_y_va)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))
print()
for estimators in [10, 20, 30, 40, 50]:
    mae=xgbr(estimators,train_X_dvd,test_X_dvd,train_y_dvd,test_y_dvd)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))

VA Evaluation:
Number of estimators: 10  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 30  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 40  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 50  		 Mean Absolute Error:  0.2692307692307692

VA Evaluation:
Number of estimators: 10  		 Mean Absolute Error:  0.25471698113207547
VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.25471698113207547
VA Evaluation:
Number of estimators: 30  		 Mean Absolute Error:  0.2641509433962264
VA Evaluation:
Number of estimators: 40  		 Mean Absolute Error:  0.29245283018867924
VA Evaluation:
Number of estimators: 50  		 Mean Absolute Error:  0.29245283018867924


In [20]:
for estimators in [5, 10, 15, 20, 25]:
    mae=xgbr(estimators,train_X_va,test_X_va,train_y_va,test_y_va)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))
print()
for estimators in [5, 10, 15, 20, 25]:
    mae=xgbr(estimators,train_X_dvd,test_X_dvd,train_y_dvd,test_y_dvd)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))

VA Evaluation:
Number of estimators: 5  		 Mean Absolute Error:  0.41531904202241166
VA Evaluation:
Number of estimators: 10  		 Mean Absolute Error:  0.36579068371882806
VA Evaluation:
Number of estimators: 15  		 Mean Absolute Error:  0.3388300590790235
VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.3252392450204262
VA Evaluation:
Number of estimators: 25  		 Mean Absolute Error:  0.3214148065218559

VA Evaluation:
Number of estimators: 5  		 Mean Absolute Error:  0.4217099744193959
VA Evaluation:
Number of estimators: 10  		 Mean Absolute Error:  0.37527544627774434
VA Evaluation:
Number of estimators: 15  		 Mean Absolute Error:  0.3450969342353209
VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.33119347410381966
VA Evaluation:
Number of estimators: 25  		 Mean Absolute Error:  0.3261720993608799


In [68]:
for estimators in [20, 25, 30, 35, 40]:
    mae=xgbr(estimators,train_X_va,test_X_va,train_y_va,test_y_va)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))
print()
for estimators in [20, 25, 30, 35, 40]:
    mae=xgbr(estimators,train_X_dvd,test_X_dvd,train_y_dvd,test_y_dvd)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))

VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 25  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 30  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 35  		 Mean Absolute Error:  0.2692307692307692
VA Evaluation:
Number of estimators: 40  		 Mean Absolute Error:  0.2692307692307692

VA Evaluation:
Number of estimators: 20  		 Mean Absolute Error:  0.25471698113207547
VA Evaluation:
Number of estimators: 25  		 Mean Absolute Error:  0.25471698113207547
VA Evaluation:
Number of estimators: 30  		 Mean Absolute Error:  0.2641509433962264
VA Evaluation:
Number of estimators: 35  		 Mean Absolute Error:  0.2641509433962264
VA Evaluation:
Number of estimators: 40  		 Mean Absolute Error:  0.29245283018867924


It seems that between 35 and 40 is the sweet spot.

In [70]:
#Creating a function to determin the best fit n_estimators.
def xgbr(n_estimator,X_train,X_test,y_train,y_test):
    my_model = XGBClassifier(n_estimators=n_estimator)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    print("Results:")
    print(confusion_matrix(pred, y_test))
    print(classification_report(pred, y_test))

In [71]:
xgbr(35, train_X_va, test_X_va, train_y_va, test_y_va)

Results:
[[49 15]
 [20 46]]
             precision    recall  f1-score   support

        0.0       0.71      0.77      0.74        64
        1.0       0.75      0.70      0.72        66

avg / total       0.73      0.73      0.73       130



In [72]:
xgbr(35, train_X_dvd, test_X_dvd, train_y_dvd, test_y_dvd)

Results:
[[65 10]
 [18 13]]
             precision    recall  f1-score   support

        0.0       0.78      0.87      0.82        75
        1.0       0.57      0.42      0.48        31

avg / total       0.72      0.74      0.72       106



## Train on One Dataset -> Test on Other

In [73]:
xgbr(35, va_X, dvd_X, va_y, dvd_y)

Results:
[[84 11]
 [45 36]]
             precision    recall  f1-score   support

        0.0       0.65      0.88      0.75        95
        1.0       0.77      0.44      0.56        81

avg / total       0.70      0.68      0.66       176



In [74]:
xgbr(35, dvd_X, va_X, dvd_y, va_y)

Results:
[[101  49]
 [ 17  49]]
             precision    recall  f1-score   support

        0.0       0.86      0.67      0.75       150
        1.0       0.50      0.74      0.60        66

avg / total       0.75      0.69      0.71       216



In [89]:
def modelfit(alg, dtrain, target, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g") % metrics.accuracy_score(dtrain[target].values, dtrain_predictions)
    print("AUC Score (Train): %f") % metrics.roc_auc_score(dtrain[target], dtrain_predprob)
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x000002A130D37C18>>
Traceback (most recent call last):
  File "C:\Users\grant\Anaconda3\envs\dataplus\lib\site-packages\xgboost\core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [90]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [91]:
train_cols = ['age', 'gleason']

In [92]:
target_col = 'txgot_binary'

In [93]:
modelfit(xgb1, va_split, target_col, train_cols)

XGBoostError: b'[12:11:12] d:\\build\\xgboost\\xgboost-0.71.git\\src\\objective\\regression_obj.cc:103: Check failed: Loss::CheckLabel(y) label must be in [0,1] for logistic regression'