In [1]:
import os
from os.path import join as oj
import sys, time
sys.path.insert(1, oj(sys.path[0], '..'))  # insert parent path
import seaborn as sns
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from copy import deepcopy
import pickle as pkl
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_validate, ShuffleSplit, train_test_split
from sklearn.linear_model import LogisticRegressionCV
import pandas as pd
import data 
from collections import Counter

# plt.style.use('dark_background')
# sns.set(style="white")
%matplotlib inline
%load_ext autoreload
%autoreload 2
NUM_PATIENTS = 12044

In [3]:
# Load the raw feature table and the outcome table from the project `data` module.
features = data.get_features()  # uses several forms
outcomes = data.get_outcomes()  # 2 outcomes: iai, and iai_intervention
df = pd.merge(features, outcomes, how='left', on='id')
X_feats_full = data.preprocess(features)

# Drop columns that cannot be used as predictors: the patient id and
# anything whose name mentions the outcome ('IAI').
ks = [k for k in X_feats_full.keys() if k != 'id' and 'IAI' not in k]
X_feats_full = X_feats_full[ks]

# Design matrix, its column names, and the prediction target.
feature_names = list(X_feats_full)
X_np = X_feats_full.values
# NOTE(review): the target is read straight from `outcomes` rather than from the
# merged `df`; this assumes `outcomes` rows are in the same order as the
# preprocessed feature rows -- TODO confirm.
y_np = outcomes['iai_intervention']

100%|██████████| 49/49 [00:01<00:00, 35.87it/s]
48it [00:03, 14.27it/s]


final shape (12044, 433)


In [18]:
# Base names of the candidate predictors. Every column whose name contains one
# of these substrings (e.g. 'AbdTrauma_1_2') is selected below.
useful = ['VomitWretch_1', 'RecodedMOI_1', 'GCSScore_1', 'ThoracicTender_1', 'ThoracicTrauma_1',
          'Costal_1', 'DecrBreathSound_1', 'AbdDistention_1', 'AbdomenPain_1', 'AbdTenderDegree_1',
          'AbdTrauma_1', 'SeatBeltSign_1', 'DistractingPain_1']
# TODO: also consider InjuryMechanism_1, hypotension?, femur fracture

# Keep the matching columns in the order they appear in `feature_names`.
# Building the array from a set would make the column order depend on string
# hashing, i.e. change between kernel restarts; dict.fromkeys de-duplicates
# while preserving first-seen order, so the selection is reproducible.
ks = np.array(list(dict.fromkeys(
    k for k in feature_names
    if any(k_useful in k for k_useful in useful)
)))


In [6]:
# Restrict the full feature table to the selected columns.
X_feats = X_feats_full.loc[:, ks]

In [7]:
# Sanity check: (number of rows, number of selected feature columns).
X_feats.shape

(12044, 53)

In [22]:
# Hold out 20% of the patients for testing.
# - random_state makes the split reproducible across kernel restarts.
# - stratify keeps the (rare) positive-class rate the same in both partitions,
#   which matters because only a small fraction of labels are positive.
X_train, X_test, y_train, y_test = train_test_split(
    X_feats.values, y_np, test_size=0.2, random_state=42, stratify=y_np
)

In [23]:
# Class balance of the training labels (label value -> number of rows).
Counter(y_train)

Counter({0: 9463, 1: 172})

# decision tree fitting

In [36]:
# Which model to cross-validate ('tree' or 'logistic') and its settings.
model_type = 'tree'
max_depth = 3
num_cv = 10

# Both models up-weight the positive class (label 1) via class_weight.
if model_type == 'tree':
    # random_state is fixed because the tree learner permutes features at each
    # split, so repeated fits can otherwise differ.
    m = DecisionTreeClassifier(max_depth=max_depth, class_weight={0: 1, 1: 500},
                               random_state=42)
elif model_type == 'logistic':
    m = LogisticRegressionCV(class_weight={0: 1, 1: 300}, cv=3, max_iter=100)
else:
    # Fail loudly instead of silently reusing a stale `m` from an earlier run.
    raise ValueError(f"unknown model_type {model_type!r}; expected 'tree' or 'logistic'")

# NOTE(review): this splitter is defined but NOT passed to cross_validate below,
# which receives the integer `num_cv` instead. Kept for any later cell that may
# use it; decide whether cross_validate should use `cv=cv` -- TODO confirm.
cv = ShuffleSplit(n_splits=num_cv, test_size=0.1, random_state=42)

# Cross-validate on the training split only; keep train scores and the fitted
# per-fold estimators so they can be inspected afterwards.
m_cv = cross_validate(m, X_train, y_train, cv=num_cv,
                      scoring=['precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc'],
                      return_train_score=True, return_estimator=True)

In [37]:
# Print the mean score over the CV folds for every held-out ('test_*') metric,
# one "<score>\t<metric>" line each. The matching 'train_*' entries are also
# stored in m_cv but are not printed here.
for metric_key, fold_scores in m_cv.items():
    if 'test' not in metric_key:
        continue
    mean_score = np.mean(fold_scores)
    print(f"{mean_score:0.2f}\t{metric_key.replace('test_', '')}")

0.03	precision
0.98	recall
0.05	f1
0.65	balanced_accuracy
0.82	roc_auc


In [38]:
def count_common_feats(m_cv, model_type='tree', feature_names=None, num_feats=5):
    """Count how often each feature is picked across cross-validated estimators.

    Parameters
    ----------
    m_cv : dict
        Mapping with an 'estimator' entry holding the fitted estimators
        (e.g. the result of ``cross_validate(..., return_estimator=True)``).
    model_type : str
        'tree'     -> a feature counts when its ``feature_importances_`` is non-zero.
        'logistic' -> a feature counts when it is among the ``num_feats``
                      coefficients with the largest absolute value.
    feature_names : sequence of str, optional
        Column names aligned with the estimators' input columns. Defaults to
        the notebook-level ``ks`` array when omitted.
    num_feats : int
        Number of top coefficients counted per estimator ('logistic' only).

    Returns
    -------
    list of (str, int)
        (feature name, number of estimators using it), most frequent first;
        ties keep first-seen order.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'tree' nor 'logistic'.
    """
    if feature_names is None:
        feature_names = ks  # fall back to the columns selected earlier in the notebook
    names = np.asarray(feature_names)

    if model_type == 'tree':
        def used_idxs(est):
            # features the fitted tree actually uses
            return np.flatnonzero(est.feature_importances_ != 0)
    elif model_type == 'logistic':
        def used_idxs(est):
            # indices of the num_feats largest absolute weights, biggest first
            return np.abs(est.coef_).flatten().argsort()[::-1][:num_feats]
    else:
        raise ValueError(f"unknown model_type {model_type!r}; expected 'tree' or 'logistic'")

    feats_used = []
    for est in m_cv['estimator']:
        # .tolist() yields plain Python strings rather than numpy scalars
        feats_used += names[used_idxs(est)].tolist()
    return Counter(feats_used).most_common()
        
# Tally feature usage across the cross-validated estimators for the current model_type.
count_common_feats(m_cv, model_type)

[('AbdomenPain_1_2', 10),
 ('RecodedMOI_1_2', 10),
 ('GCSScore_1', 10),
 ('AbdTrauma_1_2', 10),
 ('DecrBreathSound_1_4', 9),
 ('DistractingPain_1_1', 8),
 ('DistractingPain_1_2', 2),
 ('RecodedMOI_1_4', 1),
 ('AbdTenderDegree_1_1.0', 1),
 ('RecodedMOI_1_1', 1)]

In [None]:
# Plot the estimator fitted on the first cross-validation split.
# NOTE(review): only meaningful when model_type == 'tree'; plot_tree expects a
# fitted decision tree.
m_fit = m_cv['estimator'][0]
fig, ax = plt.subplots(dpi=300)
# Pass the column names as a plain list of str rather than the numpy array
# `ks`, which is the form plot_tree documents for feature_names.
plot_tree(m_fit, feature_names=[str(k) for k in ks], ax=ax)
plt.show()