In [61]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from data_loader import load_data, data_process
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import *

In [63]:
# load data
train_dir = '/Users/raina/Desktop/UHN/physionet/training_setB'
train_data, y = load_data(train_dir, limit=10000)
X = data_process(train_data, expand_dims=False)

In [64]:
y_ = np.hstack((yi for yi in y))
y_.shape

(380176,)

In [65]:
X_ = np.vstack(X).astype(np.float)
X_.shape

(380176, 40)

In [66]:
# split data into train and test sets
seed = 42
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X_, y_, 
                                                    test_size=test_size,
                                                    random_state=seed)

In [76]:
# fit the model
model = XGBClassifier(max_depth=6)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=6, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [78]:
# make predictions for test data
# loop over each patient and predict using only past and present data
y_pred = model.predict(X_test)

In [79]:
# since binay classification problem, can convert the result to 0 or 1
# using round()
predictions = [round(value) for value in y_pred]

In [84]:
# evaluate predictions
from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
f1 = f1_score(y_test, predictions)
print("f1 score: %.2f%%" % (f1 * 100.0))
roc_auc = roc_auc_score(y_test, predictions)
print("roc_auc score: %.2f%%" % (roc_auc * 100.0))
confusion = confusion_matrix(y_test, predictions)
print("confusion matrix: \n n = %d\n" %y_.shape[0], confusion)
print("precision score: %.2f%%" % precision_score(y_test, predictions))
print("recall score: %.2f%%" % recall_score(y_test, predictions))

Accuracy: 98.65%
f1 score: 11.46%
roc_auc score: 53.09%
confusion matrix: 
 n = 380176
 [[123650     39]
 [  1660    110]]
precision score: 0.74%
recall score: 0.06%


In [None]:
# base parameters
base_params = {
    'booster': 'gbtree',
    'objective': 'reg:linear',  # regression task
    'seed': 42} 

In [None]:
# tune the parameters
# Randomized Search
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 10, 20],
        'learning_rate': [0.01, 0.03, 0.1, 0.3]
}

In [None]:
folds = 5
param_comb = 6

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, 
                             param_distributions=params, 
                             n_iter=param_comb, 
                             scoring='roc_auc', 
                             n_jobs=4, 
                             cv=skf.split(X_train, y_train), 
                             verbose=3, 
                             random_state=42)
random_search.fit(X_train, y_train)

In [None]:
print(random_search.best_params_)

In [None]:
best_params_ = {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.3, 'gamma': 5, 'colsample_bytree': 0.6}
params_new = {**base_params, **best_params_}
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)
rounds = 15
model_final = xgb.train(params_new, dtrain, 
                        num_boost_round=rounds)
pred = model_final.predict(dtest)
pred_final = [round(value) for value in pred]


In [None]:
print(params_new)

In [None]:
final_model = XGBClassifier(**params_new)

In [None]:
final_model.fit(X_train, y_train)
final_y_pred = model.predict(X_test)
pred_final = [round(value) for value in final_y_pred]

In [None]:
accuracy2 = accuracy_score(y_test, pred_final)
print("Accuracy: %.2f%%" % (accuracy2 * 100.0))

In [None]:
final_model.save_model('002.model')

In [None]:
# analyze feature importance
importance = final_model.feature_importances_
print(importance)

In [None]:
vector = np.ones((1,40))
df = pd.DataFrame(importance, columns = ['fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
plt.figure()

In [None]:
df.plot(kind='barh', y='fscore',
            legend=False, figsize=(12, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.tight_layout()
plt.savefig('XGBoost Feature Importance' + '.png', dpi=300)
plt.show()

In [None]:
model = XGBClassifier()
model.load_model('002.model')
model._le = LabelEncoder().fit(y_test)

In [None]:
final_y_pred = model.predict(X_)
pred_final = [round(value) for value in final_y_pred]

In [None]:
any([value != 0 for value in final_y_pred])

In [None]:
# evaluations
from sklearn.metrics import confusion_matrix
accuracy = accuracy_score(y_, pred_final)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
f1 = f1_score(y_, pred_final)
print("f1 score: %.2f%%" % (f1 * 100.0))
roc_auc = roc_auc_score(y_, pred_final)
print("roc_auc score: %.2f%%" % (roc_auc * 100.0))
confusion = confusion_matrix(y_, pred_final)
print("confusion matrix: \n n = %d\n" %y_.shape[0], confusion)

In [None]:
print(precision_score(y_, pred_final))
print(recall_score(y_,pred_final))