In [13]:
import os
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import *
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

from data_loader import load_data, data_process

In [14]:
# evaluate predictions
def eval_pred(y_test, predictions, y_score=None):
    """Print a binary-classification report for one set of predictions.

    Prints, in order: accuracy, F1, ROC AUC, the confusion matrix (with the
    sample count), precision and recall. Percentages are shown with two
    decimals.

    Args:
        y_test: True binary labels (any sized array-like: list, ndarray,
            Series).
        predictions: Predicted hard labels, aligned with ``y_test``.
        y_score: Optional scores or probabilities for the positive class.
            When given, ROC AUC is computed from these values. When omitted,
            ROC AUC is computed from ``predictions``, i.e. from hard labels,
            which is the historical behaviour of this function.

    Returns:
        None. All results are written to stdout.
    """
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))

    f1 = f1_score(y_test, predictions)
    print("f1 score: %.2f%%" % (f1 * 100.0))

    # ROC AUC is a ranking metric: feeding it hard labels collapses the curve
    # to a single operating point. Use the scores when the caller has them.
    auc_input = predictions if y_score is None else y_score
    roc_auc = roc_auc_score(y_test, auc_input)
    print("roc_auc score: %.2f%%" % (roc_auc * 100.0))

    confusion = confusion_matrix(y_test, predictions)
    # len() instead of .shape[0] so plain lists are accepted as well.
    print("confusion matrix: \n n = %d\n" % len(y_test), confusion)

    print("precision score: %.2f%%" % (precision_score(y_test, predictions) * 100.0))
    print("recall score: %.2f%%" % (recall_score(y_test, predictions) * 100.0))

In [15]:
# load data
def load_data(limit1, limit2,
              train_dir1='/Users/raina/Desktop/UHN/physionet/training',
              train_dir2='/Users/raina/Desktop/UHN/physionet/training_setB'):
    """Load both training directories and return stacked features and labels.

    Args:
        limit1: Passed as ``limit=`` to the loader for ``train_dir1``.
        limit2: Passed as ``limit=`` to the loader for ``train_dir2``.
        train_dir1: First training directory. Defaults to the path this
            notebook was originally run with.
        train_dir2: Second training directory. Defaults to the path this
            notebook was originally run with.

    Returns:
        Tuple ``(X_, y_)``: ``X_`` is the row-wise stack of the processed
        records cast to float64, ``y_`` is the concatenation of all label
        sequences. Both shapes are printed before returning.
    """
    # This function has the same name as data_loader.load_data. Once the cell
    # runs, the global name points here, so calling `load_data(dir, limit=...)`
    # would recurse into this function and raise TypeError on the unexpected
    # `limit` keyword. Import the real loader under a private alias instead.
    from data_loader import load_data as _load_raw_data

    train_data1, y1 = _load_raw_data(train_dir1, limit=limit1)
    train_data2, y2 = _load_raw_data(train_dir2, limit=limit2)

    # `+` is kept from the original; presumably both loaders return Python
    # lists so this concatenates them -- TODO confirm against data_loader.
    train_data = train_data1 + train_data2
    X = data_process(train_data, expand_dims=False)
    y = y1 + y2

    # np.hstack needs a real sequence; passing a generator is deprecated.
    y_ = np.hstack(list(y))
    # np.float was only an alias of the builtin float (float64) and has been
    # removed from NumPy; np.float64 is the same dtype.
    X_ = np.vstack(X).astype(np.float64)
    print(X_.shape, y_.shape)
    return X_, y_

In [32]:
def load_model(model_name, labels=None, model_dir='./xgboost_models/'):
    """Load a saved XGBoost classifier and re-attach a fitted label encoder.

    Args:
        model_name: File name of the saved model inside ``model_dir``.
        labels: Labels used to fit the ``LabelEncoder`` stored on the model.
            When omitted, the notebook-level ``y_train`` is used (historical
            behaviour), so the train/test split cell must have run first.
        model_dir: Directory holding the saved models.

    Returns:
        The loaded ``XGBClassifier`` with its ``_le`` attribute set.

    Raises:
        NameError: If ``labels`` is omitted and ``y_train`` is not defined.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook's global state.
        labels = y_train

    model = XGBClassifier()
    model.load_model(os.path.join(model_dir, model_name))
    # `_le` is a private attribute of the sklearn wrapper; presumably it is
    # not restored by load_model in the xgboost version this notebook was
    # written for -- TODO confirm against the installed version.
    model._le = LabelEncoder().fit(labels)
    return model

In [6]:
# Hold out part of the data for evaluation.
# NOTE(review): X_ and y_ are not assigned in any visible cell; presumably
# they come from an earlier `X_, y_ = load_data(...)` call -- confirm, since
# this cell fails on a fresh kernel without it.
seed = 42        # fixed so the split is repeatable
test_size = 0.4  # fraction of rows reserved for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, random_state=seed, test_size=test_size
)

In [25]:
# Booster settings shared by every run.
base_params = dict(booster='gbtree', objective='binary:logistic', seed=42)

# Hyper-parameters currently in use. An earlier candidate kept for reference:
# max_depth 10, learning_rate 0.3, gamma 0.1, colsample_bytree 0.6
# (subsample 1.0 and min_child_weight 1 unchanged).
best_params = dict(
    subsample=1.0,
    min_child_weight=1,
    max_depth=20,
    learning_rate=0.5,
    gamma=0.5,
    colsample_bytree=1.0,
)

# Merge the two; a key present in best_params would override base_params.
params = dict(base_params, **best_params)

In [27]:
# Train the classifier on the training split using the merged parameters.
# The bare fit(...) call stays last so the notebook echoes the estimator.
model = XGBClassifier(**params)
model.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1.0, gamma=0.5, learning_rate=0.5,
              max_delta_step=0, max_depth=20, min_child_weight=1, missing=None,
              n_estimators=100, n_jobs=1, nthread=None,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=42, silent=True,
              subsample=1.0)

In [28]:
# make predictions for test data
# The whole test matrix is scored in one batch call; nothing in this cell
# iterates per patient.
y_pred = model.predict(X_test)
# XGBClassifier.predict returns class labels for a binary problem, so the
# rounding below is only a safeguard. np.rint rounds half to even exactly like
# the builtin round(), but runs vectorised instead of looping in Python over
# every test row. `predictions` is therefore an integer ndarray, not a list.
predictions = np.rint(y_pred).astype(int)
# NOTE: eval_pred is given hard labels here, so the ROC AUC it reports is
# computed from 0/1 predictions rather than from predicted probabilities.
eval_pred(y_test, predictions)

Accuracy: 98.61%
f1 score: 40.36%
roc_auc score: 63.04%
confusion matrix: 
 n = 620884
 [[609307    384]
 [  8266   2927]]
precision score: 88.40%
recall score: 26.15%


In [29]:
# Persist the trained model in the directory that load_model() reads from
# ('./xgboost_models/<name>'), creating it on first use. Previously the file
# was written to the working directory, where load_model('004.model') would
# not find it.
model_dir = './xgboost_models/'
os.makedirs(model_dir, exist_ok=True)
model.save_model(os.path.join(model_dir, '004.model'))