In [1]:
import os
from os.path import join as oj
import sys, time
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from copy import deepcopy
import pickle as pkl
import pandas as pd
import data_pecarn
import matplotlib.gridspec as grd
import data

# sns.set(style="black")
plt.style.use('dark_background')
%matplotlib inline
%load_ext autoreload
%autoreload 2

# look at all dsets

In [26]:
from data_pecarn import *
df_features = get_features()  # read all features into df
df_outcomes = get_outcomes()  # 2 outcomes: iai, and iai_intervention
df = pd.merge(df_features, df_outcomes, on='id', how='left')
df = rename_values(df)  # rename the features by their meaning

In [27]:
ks_remap = ['Hispanic', 'VomitWretch', 'RecodedMOI', 
                'ThoracicTender', 'ThoracicTrauma', 'Costal', 
                'DecrBreathSound', 'AbdDistention', 'AbdTenderDegree',
                'AbdTrauma', 'SeatBeltSign', 'AbdTrauma_or_SeatBeltSign', 
                'DistractingPain', 'AbdomenPain']

In [28]:
# get without processing
'''
features = data_pecarn.get_features() # uses several forms
outcomes = data_pecarn.get_outcomes() # 2 outcomes: iai, and iai_intervention
d = pd.merge(features, outcomes, on='id', how='left')
'''

df_full = data_pecarn.get_data()
feat_names, pecarn_feat_names = data.get_feat_names(df_full)
# df = df_full[pecarn_feat_names + ['id', 'iai', 'iai_intervention']]
# df = rename_values(df)
# df = data.preprocess(features) # process the feats (should save into df)

computing pecarn preprocessing...


# look at demographics

In [10]:
d = df

In [11]:
np.sum(d['iai_intervention'][d['iai']==1])

203

In [22]:
d['Race'].unique()

array(['unknown', 'White', 'Black or African American', 'Asian',
       'American Indian or Alaska Native',
       'Native Hawaiian or other Pacific Islander'], dtype=object)

In [25]:
d = df_full
ids_np = df.iai_intervention == 1
# print(list(d.keys()))

# the mean age is the only one which is wrong
print('iai', np.sum(d['iai']), np.sum(d['iai_intervention']))
print('age', np.mean(d.Age[ids_np]), np.mean(d.Age))
print('age<2', np.sum(d.Age[ids_np] < 2), np.sum(d.Age < 2))
print('sex', np.sum(d.Sex[ids_np]=='M'), np.sum(d.Sex=='M'))
print('hispanic ethnicity', np.sum(d.Race ==  == 1), np.sum(d.Hispanic))

iai 761 203
age 9.339901477832512 9.767270009963468
age<2 10 1167
sex 125 7384
hispanic ethnicity 0 1010000100-100-10-1-10-1001010-11000110000-10100000000111010-111010110000-1010001010-1-10000011100-1001000010110110000100-110010000100-100010-100000110-1100011100-10000-101000000-111111010110000111011-111001000110001000001-1-100000100000100010100-11101000-1-1-10-101000000001000100110000-10010000000010000000000010-1001001000100001-1000100100-101000010-11000-1110010010110000000000-111-11100001-1000010-101100-1-11-1000001000100001000010010-1000000000100001-100100010001101110000000-1-111100100100-10010-100010010010-10100-1011010001000111-111001100010001010-1-1001101101110101-1000001000100001100000001010001110-11000100000-101101011-100000110-1-111-1010001-11000100-11-11000-1-100-110010100011-10101-10000000-10000000000-1101000001010010011010-10-1-111000001-10011010-100001000001-1-101000-1110-110100111000000100110000010100-11000-1101000001101-100-110100000110010-10-100000011-1010-111010101010-1

# correlations with outcome

In [73]:
df_filt = df.dropna(axis=1, thresh=1)

In [None]:
X = df_full[['SEX_M', 'ageinyrs', 'GCSScore', 'iai', 'iai_intervention']]
sns.pairplot(X, hue='iai_intervention')
plt.show()

# misc important vars

In [None]:
corrs_feat = df.corr(method='pearson')
cs = corrs_feat.iai_intervention.sort_values(ascending=False)
cs = cs[~cs.isnull()]
print(cs[:30], cs[-30:])

In [82]:
feat_names, pecarn_feat_names = data_pecarn.get_feat_names(df_full)

# dim reduction

In [9]:
df = data.get_data(use_processed=True)
feat_names, pecarn_feat_names = data.get_feat_names(df)
outcome_def = 'iai_intervention' # output

In [35]:
import prince
fca = prince.MCA(n_components=len(pecarn_feat_names)).fit(df[pecarn_feat_names])
comps = fca.transform(df[pecarn_feat_names]).head()

In [None]:
# ax = mca.plot_coordinates(
#     df[pecarn_feat_names],
# #     color_labels=df[outcome_def]
# )
# ax.grid(False)

In [None]:
def plot_fcs(fca, comps):

    # create a 2 X 2 grid 
    gs = grd.GridSpec(2, 2, height_ratios=[2,10], 
                      width_ratios=[12, 1], wspace=0.1, hspace=0)

#     try:
#     var_norm = pca.explained_variance_ / np.sum(pca.explained_variance_) * 100
    fca.explained_intertia
    # plot explained variance
    ax2 = plt.subplot(gs[0])
    ax2.bar(np.arange(0, comps.shape[1]), var_norm)
#     , color='black', width=0.8)
    plt.title('Explained variance (%)')
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
#         ax2.xaxis.set_visible(False)
    ax2.yaxis.set_ticks_position('left')
    ax2.set_yticks([0, max(var_norm)])
    plt.xlim((-0.5, comps.shape[1]-0.5))
#     except:
#         print('not pca')

    # plot pcs
    
    ax = plt.subplot(gs[2])
    p = ax.imshow(comps, interpolation='None', cmap='viridis',aspect='auto')
    plt.xlabel('PCA component number')

    # make colorbar
    colorAx = plt.subplot(gs[3])
    cb = plt.colorbar(p, cax = colorAx)

    plt.show()
    

# perform pca
# pca = decomposition.PCA()
# # pca = decomposition.NMF()
# pca.fit(df.transpose())
# comps = pca.components_.transpose()
# plt.figure(figsize=(6, 5))
plot_fcs(fca, comps)

# look at missing data

In [None]:
fnames = df.keys()
missing_arr = np.zeros((len(fnames), data_pecarn.NUM_PATIENTS + 1), dtype=np.bool)

for i, fname in enumerate(fnames):
#     df = r[fname]
    ids = np.unique(df.id.values)
#     print(np.min(ids), np.max(ids))
    missing_arr[i, ids] = 1
    
plt.figure(figsize=(5, 10))
sns.barplot(x=data_pecarn.NUM_PATIENTS - missing_arr.sum(axis=1), y=fnames, orient='h')
plt.xlabel('number of patients missing this feat')
plt.tight_layout()
plt.show()

In [None]:
# plt.imshow(not_missing)
plt.figure(figsize=(12, 20), dpi=300)
sns.clustermap(missing_arr, yticklabels=fnames)
# plt.tight_layout()
plt.show()

In [None]:
# Compute the correlation matrix
corr = df.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
# sns.clustermap(corr) #, mask=mask, cmap=cmap, vmax=.3, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()