In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import wrangle
import prepare_ravinder
from sklearn.model_selection import train_test_split
# ignore warnings
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, roc_auc_score
sns.set()
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
import matplotlib.gridspec as gridspec
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
import lightgbm as lgb
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import plot_tree
from lightgbm import plot_importance
import shap
#import colors
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 200)

params = {'axes.labelsize': 14,
          'axes.titlesize': 14,
         'xtick.labelsize': 14,
         'ytick.labelsize': 14}
plt.rcParams.update(params)

In [2]:
# Get raw data from csv file
df = wrangle.get_raw_data()

In [3]:
# Get data dictionary:
data_dict = pd.read_csv('data/WiDS Datathon 2020 Dictionary.csv')

In [4]:
# Get the list of categorical features

df_obj = data_dict[(data_dict['Data Type'] == 'string') | (data_dict['Data Type'] == 'binary')]
cat_features = list(df_obj['Variable Name'].values)
for c in cat_features:
    if c not in data_dict.columns or c == 'hospital_death':
        cat_features.remove(c)
cat_features.remove('bmi')

In [5]:
# Some data cleaning and processing done, but no imputation
df = wrangle.prepare_data(df)

In [6]:
# Create new features
df['gcs'] = df.gcs_eyes_apache + df.gcs_motor_apache + df.gcs_verbal_apache
df['almost_dead'] = np.where(((df.temp_apache <= 38) & ((df.ph_apache < 7.1) | (df.ph_apache < 7.3 )) & (df.temp_apache<35) |((df.ph_apache < 7.1 )) & (df.map_apache < 50) | ((df.ph_apache < 7.2 )) & (df.gcs < 5) | (df.pre_icu_los_days > 40)) | ((df.d1_lactate_max > 8) | (df.d1_lactate_min > 10)),True,False)
df['arterial_bp'] = np.where((df.d1_mbp_invasive_max.notnull()), True, False)
df['ventilated_apache'] = np.where(((df['ventilated_apache'].isnull()) & (df.h1_arterial_po2_min.notnull())), 1.0, df['ventilated_apache'])
df['ventilated_apache'] = np.where((df['ventilated_apache'].isnull()), 0.0, df['ventilated_apache'])
df['bool_'+ 'h1_lactate_max'] = np.where(pd.notnull(df['h1_lactate_max']), True, False)

In [7]:
#consolidate some categories for icu_type and 'hospital_admit_source'

df['icu_type']=df['icu_type'].replace({'CTICU':'CCU-CTICU',
                                              'Cardiac ICU':'CCT-CTICU',
                                              'CTICU':'CCT-CTICU',
                                              'CSICU':'SICU'})

df['hospital_admit_source']= df['hospital_admit_source'].replace({
                                        'Other ICU':"ICU",'ICU to SDU':"SDU",
                                       'Step-Down Unit (SDU)':"SDU",
                                      'Acute Care/Floor':"Floor",
                                      'Other Hospital':"Other"})

In [8]:
# list of variables missing more than 80% values
missing = []
for col in df.columns:
    if df[col].isnull().mean() > 0.8:
        missing.append(col)

In [20]:
# other columns to remove
to_remove=['icu_id','patient_id',
           'encounter_id',
           'apache_4a_hospital_death_prob',
           'readmission_status', 
           'apache_2_bodysystem', 
           'hospital_id' ]

In [10]:
# get list of diagnosis in data
diagnosis = df.apache_2_diagnosis.value_counts().index
diagnosis = diagnosis.astype(str)
diagnosis

Index(['113.0', '301.0', '302.0', '112.0', '308.0', '117.0', '124.0', '122.0',
       '303.0', '110.0', '304.0', '203.0', '202.0', '106.0', '305.0', '114.0',
       '121.0', '123.0', '102.0', '307.0', '119.0', '120.0', '118.0', '108.0',
       '214.0', '109.0', '105.0', '213.0', '207.0', '306.0', '218.0', '217.0',
       '209.0', '219.0', '101.0', '104.0', '103.0', '115.0', '116.0', '107.0',
       '215.0', '208.0', '212.0', '216.0'],
      dtype='object')

In [12]:
#list of variables identified, values of which will be identified against the mean for patients with same diagnosis
apache = [ 'age', 'bmi', 'height','weight', 'albumin_apache','arf_apache','bilirubin_apache','bun_apache',
          'creatinine_apache','fio2_apache','glucose_apache', 'heart_rate_apache','hematocrit_apache',
         'intubated_apache', 'map_apache','paco2_apache','pao2_apache','ph_apache','resprate_apache',
          'sodium_apache','temp_apache','urineoutput_apache','ventilated_apache', 'wbc_apache']

In [13]:
# create a empty dataframe (with NaNs) with index as apache variables and columns as diagnosis
df_means = pd.DataFrame(index=apache, columns= diagnosis)

In [14]:
# Fill the empty dataframe created above with mean values of for patients with same diagnosis:
for d in diagnosis:
    for a in apache:
        df_means[d][a] = df[df.apache_2_diagnosis == float(d)][a].mean()

df_means.head(2)

Unnamed: 0,113.0,301.0,302.0,112.0,308.0,117.0,124.0,122.0,303.0,110.0,304.0,203.0,202.0,106.0,305.0,114.0,121.0,123.0,102.0,307.0,119.0,120.0,118.0,108.0,214.0,109.0,105.0,213.0,207.0,306.0,218.0,217.0,209.0,219.0,101.0,104.0,103.0,115.0,116.0,107.0,215.0,208.0,212.0,216.0
age,64.6395,63.3042,63.2816,63.748,63.7793,69.7777,66.4985,41.9204,62.2462,68.8166,61.621,69.3032,68.8343,65.558,58.7136,62.5583,66.0796,43.7127,66.9973,60.3408,57.4853,52.1251,53.2238,63.3758,68.1451,56.4745,61.6461,66.2956,54.7947,61.2651,57.434,59.9051,64.62,61.2571,44.573,67.8795,59.6049,66.5285,65.6256,63.9492,68.6497,49.6942,60.7653,53.3462
bmi,29.1053,28.9512,29.7945,29.4347,30.5397,29.4761,28.0408,27.844,29.068,31.4086,29.5544,29.1144,28.468,28.8172,31.3788,29.9581,27.4484,27.2402,30.0142,28.502,26.4958,27.7413,27.9283,30.1016,28.0588,30.8019,32.7656,28.4348,27.7849,28.3742,30.0609,28.414,28.2362,30.5285,31.506,26.8084,31.486,29.1809,28.2941,25.8731,29.5407,27.0977,29.0655,27.1413


In [22]:
# create new columns named diff_col_name which is difference between apache variables and
# corresponding mean values for patients with same diagnosis:
for a in apache:
    for d in diagnosis:
        df['diff_'+ a] = np.where(df[a].notnull(), df_means[d][a]-df[a], 0)

In [15]:
# drop columns which are missing more than 80% values
df.drop(columns = missing,axis=1,inplace=True)

In [16]:
# Drop columns with 'invasive in their names since they are redundant'
inv_cols=[s for s in df.columns.tolist() if "invasive" in s]
df.drop(inv_cols,axis=1,inplace=True)

In [17]:
# all features/columns in df
features = [col for col in df.columns]

In [18]:
# Function which takes all numerical columns and returns columns with correlation higher than 0.99
def drop_correlated_features(df):
    num_feature = [col for col in features if col not in cat_features and df[col].dtype != 'object']
    drop_columns=[]
    corr = df[num_feature].corr()

    columns = np.full((corr.shape[0],), True, dtype=bool)

    for i in range(corr.shape[0]):
        for j in range(i+1, corr.shape[0]):
            if corr.iloc[i,j] >=0.99 :
                if columns[j] == True:
                    columns[j] = False
                    print('col1: {} col2: {} - Correlation: {}'.format(df[num_feature].columns[i] , df[num_feature].columns[j], corr.iloc[i,j]))
            elif corr.iloc[i,j] <= -0.99:
                if columns[j] == True:
                    columns[j] = False

    drop_columns = df[num_feature].columns[columns == False].values
    return drop_columns

In [19]:
#list of correlated columns returned from function above
drop_columns = drop_correlated_features(df)

col1: bilirubin_apache col2: d1_bilirubin_max - Correlation: 0.9965677029378415
col1: bun_apache col2: d1_bun_max - Correlation: 0.9911211474368951
col1: creatinine_apache col2: d1_creatinine_max - Correlation: 0.9941463711670832
col1: paco2_apache col2: paco2_for_ph_apache - Correlation: 1.0
col1: d1_inr_max col2: h1_inr_max - Correlation: 1.0
col1: d1_inr_min col2: h1_inr_min - Correlation: 1.0


In [23]:
#drop columns identified above
features = [col for col in features if col not in drop_columns]
df = df[features]

# drop diff_ventilated_apache columns since it is categorical
df = df.drop(columns = 'diff_ventilated_apache')
df = df.drop(columns = to_remove)

KeyError: "['diff_ventilated_apache'] not found in axis"