In [1]:
import pandas as pd
import seaborn as sns
import sklearn

In [2]:
%pylab inline 
import seaborn as sns
sns.set()
from IPython.display import SVG

Populating the interactive namespace from numpy and matplotlib


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
import numpy as np
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [5]:
# Make some room to see stuff (i.e. drop display limits on Pandas rows & cols - be careful w/ big df's!)

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

# 1) Import the training data

In [6]:
X_train = np.load('data2/X_baby_train.dat')
y_train = np.load('data2/y_baby_train.dat')

X_val = np.load('data2/X_baby_val.dat')
y_val = np.load('data2/y_baby_val.dat')

print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(985, 224, 273, 1)
(985, 4)
(247, 224, 273, 1)
(247, 4)


### Standardize the X Data

In [11]:
print(X_train.shape)
print(X_val.shape)

(985, 224, 273, 1)
(247, 224, 273, 1)


In [12]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train.reshape((985, 224*273)))

X_val_scaled = scaler.transform(X_val.reshape((247, 224*273)))

print(X_train_scaled.shape)
print(X_val_scaled.shape)

(985, 61152)
(247, 61152)


# 2) Random Forest Classifier (multi-label)

In [48]:
model1 = RandomForestClassifier(n_estimators=500, random_state=42)
model1.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=500, random_state=42)

In [49]:
y_pred = model1.predict(X_val_scaled)

roc_auc_score(y_val, y_pred, average='micro')

0.5541559494027695

In [50]:
roc_auc_score(y_val, y_pred, average=None)

array([0.4939759 , 0.5       , 0.53794574, 0.63647084])

# 3) Logistic Regression on just the first target (diagnosis = 'Atelectasis')

In [79]:
model2 = LogisticRegression(penalty='l2', C=0.0005, max_iter=10000, random_state=42)
model2.fit(X_train_scaled, y_train[:, 0])

LogisticRegression(C=0.0005, max_iter=10000, random_state=42)

In [80]:
y_pred = model2.predict(X_val_scaled)

roc_auc_score(y_val[:, 0], y_pred)

0.5567455005206009

In [147]:
model2b = RandomForestClassifier(n_estimators=500, random_state=42)
model2b.fit(X_train_scaled, y_train[:, 0])

RandomForestClassifier(n_estimators=500, random_state=42)

In [148]:
y_pred = model2b.predict(X_val_scaled)

roc_auc_score(y_val[:, 0], y_pred)

0.5124944221329764

# 4) Logistic Regression on just the second target (diagnosis = 'Cardiomegaly')

In [92]:
model3 = LogisticRegression(penalty='l2', C=0.025, max_iter=10000, random_state=42)
model3.fit(X_train_scaled, y_train[:, 1])

LogisticRegression(C=0.025, max_iter=10000, random_state=42)

In [93]:
y_pred = model3.predict(X_val_scaled)

roc_auc_score(y_val[:, 1], y_pred)

0.5755469755469755

In [153]:
model3b = RandomForestClassifier(n_estimators=500, random_state=42)
model3b.fit(X_train_scaled, y_train[:, 1])

RandomForestClassifier(random_state=42)

In [150]:
y_pred = model3b.predict(X_val_scaled)

roc_auc_score(y_val[:, 1], y_pred)

0.4976190476190476

# 5) Logistic Regression on just the third target (diagnosis = 'Edema')

In [122]:
model4 = LogisticRegression(penalty='l2', C=0.000075, max_iter=10000, random_state=42)
model4.fit(X_train_scaled, y_train[:, 2])

LogisticRegression(C=7.5e-05, max_iter=10000, random_state=42)

In [123]:
y_pred = model4.predict(X_val_scaled)

roc_auc_score(y_val[:, 2], y_pred)

0.5734883720930233

In [158]:
model4b = RandomForestClassifier(n_estimators=100, random_state=42)
model4b.fit(X_train_scaled, y_train[:, 2])

RandomForestClassifier(random_state=42)

In [159]:
y_pred = model4b.predict(X_val_scaled)

roc_auc_score(y_val[:, 2], y_pred)

0.5463178294573643

# 6) Logistic Regression on just the fourth target (diagnosis = 'Pleural Effusion')

In [136]:
model5 = LogisticRegression(penalty='l2', C=0.00005, max_iter=10000, random_state=42)
model5.fit(X_train_scaled, y_train[:, 3])

LogisticRegression(C=5e-05, max_iter=10000, random_state=42)

In [137]:
y_pred = model5.predict(X_val_scaled)

roc_auc_score(y_val[:, 3], y_pred)

0.6038093196112064

In [169]:
model5b = RandomForestClassifier(n_estimators=100, random_state=42)
model5b.fit(X_train_scaled, y_train[:, 3])

RandomForestClassifier(random_state=42)

In [170]:
y_pred = model5b.predict(X_val_scaled)

roc_auc_score(y_val[:, 3], y_pred)

0.6168882218410521