In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from data_loader import load_data, data_process
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix

In [10]:
# evaluate predictions
def eval_pred(y_test, predictions):
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    f1 = f1_score(y_test, predictions)
    print("f1 score: %.2f%%" % (f1 * 100.0))
    roc_auc = roc_auc_score(y_test, predictions)
    print("roc_auc score: %.2f%%" % (roc_auc * 100.0))
    confusion = confusion_matrix(y_test, predictions)
    print("confusion matrix: \n n = %d\n" %y_test.shape[0], confusion)
    print("precision score: %.2f%%" % precision_score(y_test, predictions))
    print("recall score: %.2f%%" % recall_score(y_test, predictions))

In [2]:
# load data
train_dir = '/Users/raina/Desktop/UHN/physionet/training_setB'
train_data, y = load_data(train_dir, limit=10000)
X = data_process(train_data, expand_dims=False)

In [3]:
y_ = np.hstack((yi for yi in y))
X_ = np.vstack(X).astype(np.float)
print(X_.shape, y_.shape)

(380176, 40) (380176,)


In [4]:
# split data into train and test sets
seed = 42
test_size = 0.4
X_train, X_test, y_train, y_test = train_test_split(X_, y_, 
                                                    test_size=test_size,
                                                    random_state=seed)

In [5]:
# fit the model
model = XGBClassifier(max_depth=6)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [11]:
# make predictions for test data
# loop over each patient and predict using only past and present data
y_pred = model.predict(X_test)
# since binay classification problem, can convert the result to 0 or 1
# using round()
predictions = [round(value) for value in y_pred]
eval_pred(y_pred, predictions)

Accuracy: 100.00%
f1 score: 100.00%
roc_auc score: 100.00%
confusion matrix: 
 n = 152071
 [[151904      0]
 [     0    167]]
precision score: 1.00%
recall score: 1.00%
