In [1]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read in data
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]

In [85]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData
import sklearn.metrics as metrics

TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    
    #task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    #task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    #score = np.mean([task1, task2, task3])
    return task2

In [4]:

X_pid_train = pd.DataFrame(columns=df_training_original.columns)
for i, pid in enumerate(all_pids):
    uniqueData = df_training_original[df_training_original['pid']==pid]
    means = uniqueData.mean()
    means['pid'] = pid
    means['Age'] = uniqueData['Age'].iloc[0]
    means = pd.DataFrame(means).transpose()
    X_pid_train = pd.concat([X_pid_train, means])
    #counts.drop('Time', inplace=True)
    if i % 500 == 0:
        print(round(i/len(all_pids),2)*100, '%')
    del means
    

print(X_pid_train)

0.0 %
3.0 %
5.0 %
8.0 %
11.0 %
13.0 %
16.0 %
18.0 %
21.0 %
24.0 %
26.0 %
28.999999999999996 %
32.0 %
34.0 %
37.0 %
39.0 %
42.0 %
45.0 %
47.0 %
50.0 %
53.0 %
55.00000000000001 %
57.99999999999999 %
61.0 %
63.0 %
66.0 %
68.0 %
71.0 %
74.0 %
76.0 %
79.0 %
82.0 %
84.0 %
87.0 %
89.0 %
92.0 %
95.0 %
97.0 %
        pid  Time   Age      EtCO2    PTT    BUN  Lactate       Temp  \
0       1.0   8.5  34.0        NaN    NaN   12.0      NaN  36.750000   
0      10.0   6.5  71.0        NaN  27.80   12.0      NaN  36.000000   
0     100.0   7.5  68.0        NaN  20.90   21.0      NaN  36.250000   
0    1000.0   6.5  79.0  31.863636    NaN   22.0    3.855  36.818182   
0   10000.0   6.5  76.0        NaN  28.55   22.0      NaN  36.750000   
0   10002.0   6.5  73.0  19.000000  31.30   18.0    3.005  37.000000   
0   10006.0   6.5  51.0        NaN    NaN    NaN      NaN  37.500000   
0   10007.0   6.5  60.0        NaN    NaN    NaN      NaN  38.000000   
0   10009.0   6.5  69.0        NaN  86.05   15.0  

In [96]:
#x_train = X_pid_train[['pid', 'PTT', 'HCO3', 'BaseExcess', 'PaCO2', 'FiO2', 'SaO2','Chloride', 'Hct', 'pH']]
x_train = X_pid_train.copy().sort_values(['pid'])
x_train = x_train[['pid','ABPm','RRate','FiO2','Platelets','Creatinine']].fillna(0)
x_train = x_train
print(x_train.head())

   pid        ABPm      RRate      FiO2  Platelets  Creatinine
0  1.0   68.333333  17.000000  0.425000      143.0        0.50
0  2.0   94.636364  18.000000  0.000000      226.0        2.12
0  4.0   80.909091  14.636364  0.000000      269.0        0.53
0  6.0   65.750000  15.833333  0.566667      105.0        1.35
0  8.0  143.900000  17.181818  0.000000        0.0        6.46


In [97]:
df_training_label = df_training_label
print(df_training_label.iloc[:,[0,-5]].head())

       pid  LABEL_Sepsis
0        1           0.0
6622     2           0.0
15008    4           0.0
16335    6           0.0
17676    8           0.0


In [98]:
print(x_train.shape)

(18995, 6)


In [99]:
# Train MLPClassifier
#regr =   KNeighborsClassifier(3)
regr = MLPClassifier(alpha=1e-4, hidden_layer_sizes=(100,200,200), random_state=1, solver='adam', max_iter=100)
#regr = RidgeClassifierCV()
#regr = RandomForestClassifier()
trainNumbers = 16000
regr.fit(x_train.iloc[:trainNumbers,1:], df_training_label.iloc[:trainNumbers,-5].values)
print(regr.classes_)
print(regr.loss_)

[0. 1.]
0.20724038976472534




In [100]:
print(x_train.iloc[:20,1:])
print(df_training_label.iloc[:20,-5:-4])

         ABPm      RRate      FiO2   Platelets  Creatinine
0   68.333333  17.000000  0.425000  143.000000        0.50
0   94.636364  18.000000  0.000000  226.000000        2.12
0   80.909091  14.636364  0.000000  269.000000        0.53
0   65.750000  15.833333  0.566667  105.000000        1.35
0  143.900000  17.181818  0.000000    0.000000        6.46
0  101.727273  18.090909  0.000000  207.000000        0.82
0   79.916667  16.583333  0.516667  237.000000        0.80
0   82.833333  18.333333  0.000000  200.000000        0.99
0   87.750000  13.000000  0.000000  138.000000        6.80
0   91.181818  32.444444  0.000000   25.000000        0.80
0   85.500000  12.583333  0.500000  147.500000        0.70
0   66.250000  11.333333  0.516667  171.000000        3.40
0   79.200000  12.666667  0.000000    0.000000        0.00
0   94.916667  20.333333  0.540000  481.000000        0.60
0   74.727273  18.700000  0.000000    0.000000        0.00
0   79.111111  17.875000  0.400000    0.000000        0.

In [101]:
f = regr.predict_proba(x_train.iloc[trainNumbers:,1:])
f = pd.DataFrame(f)
#f.columns = list(df_training_label.columns[-5])
f.columns = ['False', 'LABEL_Sepsis']
f['pid'] = x_train.iloc[trainNumbers:,0].reset_index(drop=True)

print(f[f['pid'].isin([27001, 27003, 27007, 27010])])

print(get_score(df_training_label.iloc[trainNumbers:,:], f))

        False  LABEL_Sepsis      pid
194  0.935903      0.064097  27001.0
195  0.973876      0.026124  27003.0
198  0.956648      0.043352  27007.0
199  0.954395      0.045605  27010.0
0.7004478538488826
