In [8]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.exceptions import ConvergenceWarning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Silence scikit-learn's ConvergenceWarning for the rest of the session.
# NOTE(review): presumably triggered by the saga LogisticRegression hitting its
# iteration limit — confirm the fits are acceptable before trusting the scores.
warnings.simplefilter("ignore", category=ConvergenceWarning)

In [2]:
from load_adult import EXPORTED_DATASET as adult
from load_mimic import EXPORTED_DATASET as mimic

# Each EXPORTED_DATASET is unpacked as a 3-tuple, used further down as:
#   (preprocess callable, data frame, name of the target column)
(adult_preprocess, adult_ds, adult_y) = adult
(mimic_preprocess, mimic_ds, mimic_y) = mimic

In [3]:
# (display name, estimator) pairs evaluated in every experiment below.
# All estimators share the seed 42; the ones that accept n_jobs run single-threaded.
models = [
    (
        "Logistic Regression",
        LogisticRegression(solver="saga", fit_intercept=True, random_state=42, n_jobs=1),
    ),
    (
        "SVC (Linear)",
        SVC(kernel="linear", random_state=42),
    ),
    (
        "SVC (RBF)",
        SVC(kernel="rbf", random_state=42),
    ),
    (
        "Random Forest",
        RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1),
    ),
]

# Adult Dataset

In [4]:
# Peek at the first five rows of the Adult frame before it is preprocessed.
adult_ds.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,198693,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


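Suppress one variable at a time, retrain each model, and compare its balanced accuracy against a control run that keeps every variable, to see that variable's effect on predictivity.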

In [11]:
# Leave-one-variable-out predictivity on the Adult dataset.
#
# For each input variable that has an entry in one_hot_map, drop its encoded
# columns, refit every model in `models`, and record balanced accuracy on a
# held-out split. A final "Control" run keeps all features as the baseline.
# The per-(model, variable) scores are written to adult_predictivity.csv.
dataset = adult_ds
preprocess = adult_preprocess
y_var = adult_y
N_trials = 1

# one_hot_map is used below as a lookup from an original variable name to the
# columns that stand in for it in preprocessed_ds; label_encoded is unused here.
preprocessed_ds, one_hot_map, label_encoded = preprocess(dataset)
independent_variables = list(dataset.columns.difference([y_var]))
independent_variables.append(None)  # sentinel: the run that suppresses nothing

# Seed the split so the partition, and therefore the CSV, is reproducible on
# Restart & Run All (the original call drew a fresh random split every run).
train, test = train_test_split(preprocessed_ds, random_state=42)
Y_train, Y_test = train[y_var].values.flatten(), test[y_var].values.flatten()

results = []
for model_type, model in models:
    for variable in independent_variables:
        if variable is None:
            # Baseline: every feature column except the target.
            label = "Control"
            inverse_index = train.columns.difference([y_var])
        elif variable in one_hot_map:
            # Drop all encoded columns of this variable, plus the target.
            label = variable
            inverse_index = train.columns.difference(one_hot_map[variable]).difference([y_var])
        else:
            # Variables without a one_hot_map entry are not suppressed.
            continue

        X_train, X_test = train[inverse_index].values, test[inverse_index].values

        # NOTE(review): every model is built with random_state=42 and is refit on
        # identical data, so repeated trials most likely reproduce the same score
        # and std_bal_acc stays 0 regardless of N_trials — confirm intent.
        trials = []
        for _ in range(N_trials):
            model.fit(X_train, Y_train)
            Y_pred = model.predict(X_test)
            trials.append(balanced_accuracy_score(Y_test, Y_pred))

        results.append([model_type, label, np.mean(trials), np.std(trials)])

results = pd.DataFrame(results, columns=["model", "variable", "mean_bal_acc", "std_bal_acc"])
results.to_csv("adult_predictivity.csv")
results.head()

Unnamed: 0,model,variable,mean_bal_acc,std_bal_acc
0,Logistic Regression,marital-status,0.753394,0.0
1,Logistic Regression,race,0.755932,0.0
2,Logistic Regression,relationship,0.750442,0.0
3,Logistic Regression,workclass,0.751151,0.0
4,Logistic Regression,Control,0.754044,0.0


In [5]:
# Peek at the first five rows of the MIMIC frame before it is preprocessed.
mimic_ds.head(n=5)

Unnamed: 0,first_careunit,admission_location,insurance,race,gender,anchor_age,value,elix_score,cci_score,mortality,disease
0,Surgical Intensive Care Unit (SICU),EMERGENCY ROOM,Private/Other,unknown,F,61,DNR,5.0,0.0,False,other
1,Medical Intensive Care Unit (MICU),TRANSFER FROM HOSPITAL,Private/Other,white,M,53,Full code,0.0,0.0,False,other
2,Surgical Intensive Care Unit (SICU),TRANSFER FROM HOSPITAL,Private/Other,white,M,30,Full code,5.0,0.0,False,other
3,Surgical Intensive Care Unit (SICU),EMERGENCY ROOM,Private/Other,unknown,F,91,Full code,10.0,3.0,False,congestive_heart_failure
4,Medical Intensive Care Unit (MICU),EMERGENCY ROOM,Medicare,white,F,73,Full code,13.0,0.0,False,other


In [9]:
# Leave-one-variable-out predictivity on the MIMIC dataset.
#
# Same procedure as the Adult experiment, except that the training split is
# randomly oversampled (minority class) before each fit. For each input
# variable that has an entry in one_hot_map, drop its encoded columns, refit
# every model in `models`, and record balanced accuracy on the untouched test
# split. A final "Control" run keeps all features as the baseline.
# The per-(model, variable) mean scores are written to mimic_predictivity.csv.
dataset = mimic_ds
preprocess = mimic_preprocess
y_var = mimic_y
N_trials = 1

# one_hot_map is used below as a lookup from an original variable name to the
# columns that stand in for it in preprocessed_ds; label_encoded is unused here.
preprocessed_ds, one_hot_map, label_encoded = preprocess(dataset)
independent_variables = list(dataset.columns.difference([y_var]))
independent_variables.append(None)  # sentinel: the run that suppresses nothing

# Seed the split so the partition, and therefore the CSV, is reproducible on
# Restart & Run All (the original call drew a fresh random split every run).
train, test = train_test_split(preprocessed_ds, random_state=42)
Y_train, Y_test = train[y_var].values.flatten(), test[y_var].values.flatten()

results = []
for model_type, model in models:
    for variable in independent_variables:
        if variable is None:
            # Baseline: every feature column except the target.
            label = "Control"
            inverse_index = train.columns.difference([y_var])
        elif variable in one_hot_map:
            # Drop all encoded columns of this variable, plus the target.
            label = variable
            inverse_index = train.columns.difference(one_hot_map[variable]).difference([y_var])
        else:
            # Variables without a one_hot_map entry are not suppressed.
            continue

        X_train, X_test = train[inverse_index].values, test[inverse_index].values

        trials = []
        for trial in range(N_trials):
            # Seed the oversampler per trial: reruns are reproducible, while
            # different trials still draw different resamples. The original
            # sampler was unseeded, so every run produced different scores.
            ros = RandomOverSampler(sampling_strategy='minority', random_state=42 + trial)
            X_balanced, Y_balanced = ros.fit_resample(X_train, Y_train)
            model.fit(X_balanced, Y_balanced)

            # Score on the original (not oversampled) test split.
            Y_pred = model.predict(X_test)
            trials.append(balanced_accuracy_score(Y_test, Y_pred))

        results.append([model_type, label, np.mean(trials)])

results = pd.DataFrame(results, columns=["model", "variable", "bal_acc"])
results.to_csv("mimic_predictivity.csv")
results.head()
      

Unnamed: 0,model,variable,bal_acc
0,Logistic Regression,admission_location,0.696273
1,Logistic Regression,disease,0.686104
2,Logistic Regression,insurance,0.694261
3,Logistic Regression,race,0.676996
4,Logistic Regression,Control,0.696311


- Optimal scaling vs dataset size
- David Lopez's book on bias as it relates to causal learning