In [28]:
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder, scale, LabelEncoder
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV

In [47]:
# function to fit model, predict response, and compute performance metrics 
def fit_and_predict(clf, X_train, y_train, X_test, y_test, predict='test'):
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)
    y_test = np.array(y_test)
    clf.fit(X_train, y_train.ravel())
    if (predict == 'test'): # testing performance
        y_pred = clf.predict(X_test)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred)
        recall = metrics.recall_score(y_test, y_pred)
        auc = metrics.roc_auc_score(y_test, y_pred)
    else: # training performance
        y_pred = clf.predict(X_train)
        accuracy = metrics.accuracy_score(y_train, y_pred)
        precision = metrics.precision_score(y_train, y_pred)
        recall = metrics.recall_score(y_train, y_pred)
        auc = metrics.roc_auc_score(y_train, y_pred)
    return accuracy, precision, recall, auc

In [75]:
df = pd.read_csv('dvd_topics.csv')

In [None]:
# Use stratified k-fold to compute training and testing performance, and iterate many times to compute average performance
tr_accuracies = []
tr_precisions = []
tr_recalls = []
tr_aucs = []
ts_accuracies = []
ts_precisions = []
ts_recalls = []
ts_aucs = []
X = np.array(df[['Medical', 'Surveillance', 'Radiation', 'Surgery']])
y = df['txgot_binary']
for i in range(0,1000):
    kf = StratifiedKFold(n_splits=4, shuffle=True) # this will "randomly" split into stratified folds (maintaining class balances in each fold) 
    for train_index, test_index in kf.split(X,y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index] 
        tr_acc, tr_pre, tr_rec, tr_auc = fit_and_predict(XGBClassifier(), X_train, y_train, X_test, y_test, predict='train')
        tr_accuracies.append(tr_acc)
        tr_precisions.append(tr_pre)
        tr_recalls.append(tr_rec)
        tr_aucs.append(tr_auc)
        ts_acc, ts_pre, ts_rec, ts_auc = fit_and_predict(XGBClassifier(), X_train, y_train, X_test, y_test, predict='test')
        ts_accuracies.append(ts_acc)
        ts_precisions.append(ts_pre)
        ts_recalls.append(ts_rec)
        ts_aucs.append(ts_auc)

In [None]:
print('Training - accuracy: ', np.average(tr_accuracies), ', Testing - accuracy: ', np.average(ts_accuracies), '\n')
print('Training - precision: ', np.average(tr_precisions), ', Testing - precision: ', np.average(ts_precisions), '\n')
print('Training - recall: ', np.average(tr_recalls), ', Testing - recall: ', np.average(ts_recalls), '\n')
print('Training - auc: ', np.average(tr_aucs), ', Testing - auc: ', np.average(ts_aucs), '\n')

Preethi's topics:

Training - accuracy:  0.9729315890870137 , Testing - accuracy:  0.7079242063492063 

Training - precision:  0.9864957452511257 , Testing - precision:  0.3472598752554635 

Training - recall:  0.8979137500000001 , Testing - recall:  0.26141666666666663 

Training - auc:  0.9469874305555556 , Testing - auc:  0.5535277777777778 

Sammy's topics:

Training - accuracy:  0.9629580149803026 , Testing - accuracy:  0.7219116129785248 

Training - precision:  0.9938414122509832 , Testing - precision:  0.3704385815573316 

Training - recall:  0.8482908333333334 , Testing - recall:  0.22682986111111114 

Training - auc:  0.9233028626543209 , Testing - auc:  0.5507187054843304 