In [19]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [4]:
import GeneralModel as gm

In [5]:
va_split = pd.read_csv('../../DataPlus/va_split.csv')
dvd_split = pd.read_csv('../../DataPlus/dvd_split.csv')

In [6]:
va_df = gm.prepare_df(va_split, ['age'], ['gleason'], 'txgot_binary')
dvd_df = gm.prepare_df(dvd_split, ['age'], ['gleason'], 'txgot_binary')

# of Data Points: 216
# of Data Points: 176


In [9]:
va_xg = XGBRegressor()
dvd_xg = XGBRegressor()

In [11]:
va_X = va_df.drop(['txgot_binary'], axis=1).values
va_y = va_df['txgot_binary'].values
dvd_X = dvd_df.drop(['txgot_binary'], axis=1).values
dvd_y = dvd_df['txgot_binary'].values

In [13]:
train_X_va, test_X_va, train_y_va, test_y_va = train_test_split(va_X, va_y, test_size=0.6)
train_X_dvd, test_X_dvd, train_y_dvd, test_y_dvd = train_test_split(dvd_X, dvd_y, test_size=0.6)

In [14]:
va_xg.fit(train_X_va, train_y_va, verbose=False)
dvd_xg.fit(train_X_dvd, train_y_dvd, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [15]:
predictions_va = va_xg.predict(test_X_va)
predictions_dvd = dvd_xg.predict(test_X_dvd)

In [20]:
print("VA confusion matrix:")
print(confusion_matrix(predictions_va.round(), test_y_va))
print(classification_report(predictions_va.round(), test_y_va))
print("DVD confusion matrix:")
print(confusion_matrix(predictions_dvd.round(), test_y_dvd))
print(classification_report(predictions_dvd.round(), test_y_dvd))

VA confusion matrix:
[[55 21]
 [13 41]]
             precision    recall  f1-score   support

        0.0       0.81      0.72      0.76        76
        1.0       0.66      0.76      0.71        54

avg / total       0.75      0.74      0.74       130

DVD confusion matrix:
[[70 21]
 [ 6  9]]
             precision    recall  f1-score   support

        0.0       0.92      0.77      0.84        91
        1.0       0.30      0.60      0.40        15

avg / total       0.83      0.75      0.78       106



In [21]:
#Creating a function to determin the best fit n_estimators.
def xgbr(n_estimator,X_train,X_test,y_train,y_test):
    my_model = XGBRegressor(n_estimators=n_estimator)
    my_model.fit(X_train, y_train)
    pred = my_model.predict(X_test)
    return str(mean_absolute_error(pred, y_test))

In [22]:
for estimators in [100,500,1000,1500,2000,2500]:
    mae=xgbr(estimators,train_X_va,test_X_va,train_y_va,test_y_va)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))
for estimators in [100,500,1000,1500,2000,2500]:
    mae=xgbr(estimators,train_X_dvd,test_X_dvd,train_y_dvd,test_y_dvd)
    print("VA Evaluation:")
    print("Number of estimators: {}  \t\t Mean Absolute Error:  {}".format(estimators, mae))

VA Evaluation:
Number of estimators: 100  		 Mean Absolute Error:  0.2904790080510653
VA Evaluation:
Number of estimators: 500  		 Mean Absolute Error:  0.2881836182796038
VA Evaluation:
Number of estimators: 1000  		 Mean Absolute Error:  0.2881836182796038
VA Evaluation:
Number of estimators: 1500  		 Mean Absolute Error:  0.2881836182796038
VA Evaluation:
Number of estimators: 2000  		 Mean Absolute Error:  0.2881836182796038
VA Evaluation:
Number of estimators: 2500  		 Mean Absolute Error:  0.2881836182796038
VA Evaluation:
Number of estimators: 100  		 Mean Absolute Error:  0.29753750197167667
VA Evaluation:
Number of estimators: 500  		 Mean Absolute Error:  0.27988233065830087
VA Evaluation:
Number of estimators: 1000  		 Mean Absolute Error:  0.27988233122060885
VA Evaluation:
Number of estimators: 1500  		 Mean Absolute Error:  0.27988233122060885
VA Evaluation:
Number of estimators: 2000  		 Mean Absolute Error:  0.27988233122060885
VA Evaluation:
Number of estimators: 2500 