In [1]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read in data
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]

In [3]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData
import sklearn.metrics as metrics

TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    
    #task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    #task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    #score = np.mean([task1, task2, task3])
    return task2

In [75]:

X_pid_train = pd.DataFrame()
for i, pid in enumerate(all_pids):
    uniqueData = df_training_original[df_training_original['pid']==pid]
    means = uniqueData.mean()
    means['pid'] = pid
    means['Age'] = uniqueData['Age'].iloc[0]
    means = pd.DataFrame(means).transpose()
    counts = uniqueData.isna().sum()
    counts = -1*counts + 12
   
    counts = pd.DataFrame(counts).transpose()
    #print(counts)
    countsCol = [x + '_count' for x in means.columns]
    countsCol[0] = 'pid'
    countsCol[1] = 'Time'
    countsCol[2] = 'Age'
    counts.columns  = countsCol
    counts['pid'] = pid
    counts.drop(['Time', 'Age'], inplace=True, axis=1)
    merged = pd.merge(means, counts)
    X_pid_train = pd.concat([X_pid_train, merged])
    #counts.drop('Time', inplace=True)
    if i % 500 == 0:
        print(round(i/len(all_pids),2)*100, '%')
    del means
    #print(means)


    
print(X_pid_train)

0.0 %
3.0 %
5.0 %
8.0 %
11.0 %
13.0 %
16.0 %
18.0 %
21.0 %
24.0 %
26.0 %
28.999999999999996 %
32.0 %
34.0 %
37.0 %
39.0 %
42.0 %
45.0 %
47.0 %
50.0 %
53.0 %
55.00000000000001 %
57.99999999999999 %
61.0 %
63.0 %
66.0 %
68.0 %
71.0 %
74.0 %
76.0 %
79.0 %
82.0 %
84.0 %
87.0 %
89.0 %
92.0 %
95.0 %
97.0 %
        pid  Time   Age      EtCO2    PTT   BUN  Lactate       Temp  \
0       1.0   8.5  34.0        NaN    NaN  12.0      NaN  36.750000   
0      10.0   6.5  71.0        NaN  27.80  12.0      NaN  36.000000   
0     100.0   7.5  68.0        NaN  20.90  21.0      NaN  36.250000   
0    1000.0   6.5  79.0  31.863636    NaN  22.0    3.855  36.818182   
0   10000.0   6.5  76.0        NaN  28.55  22.0      NaN  36.750000   
..      ...   ...   ...        ...    ...   ...      ...        ...   
0    9993.0   6.5  80.0        NaN    NaN  13.5      NaN  35.750000   
0    9995.0   6.5  73.0        NaN  55.50  50.0      NaN  36.000000   
0    9996.0  12.5  53.0        NaN    NaN   NaN      NaN  3

In [124]:
print(X_pid_train.columns)
x_train = X_pid_train[['pid', 'PTT', 'HCO3', 'BaseExcess', 'PaCO2', 'FiO2', 'SaO2','Chloride', 'Hct', 'pH'
                      , 'EtCO2_count',
       'PTT_count', 'BUN_count', 'Lactate_count', 'Temp_count', 'Hgb_count',
       'HCO3_count', 'BaseExcess_count', 'RRate_count', 'Fibrinogen_count',
       'Phosphate_count', 'WBC_count', 'Creatinine_count', 'PaCO2_count',
       'AST_count', 'FiO2_count', 'Platelets_count', 'SaO2_count',
       'Glucose_count', 'ABPm_count', 'Magnesium_count', 'Potassium_count',
       'ABPd_count', 'Calcium_count', 'Alkalinephos_count', 'SpO2_count',
       'Bilirubin_direct_count', 'Chloride_count', 'Hct_count',
       'Heartrate_count', 'Bilirubin_total_count', 'TroponinI_count',
       'ABPs_count', 'pH_count']].fillna(0)
#x_train = X_pid_train.copy().sort_values(['pid'])
print(x_train.columns)
#x_train = X_pid_train[['pid','ABPm','RRate','FiO2','Platelets','Creatinine', 'Lactate','ABPm_count', 'RRate_count','FiO2_count', 'Platelets_count','Creatinine_count','Lactate_count']].fillna(0)
#x_train = x_train[['pid','ABPm','RRate','FiO2','Platelets','Creatinine', 'Lactate']].fillna(0)
x_train = x_train
print(x_train.head())

Index(['pid', 'Time', 'Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb',
       'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC',
       'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose',
       'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos',
       'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate',
       'Bilirubin_total', 'TroponinI', 'ABPs', 'pH', 'EtCO2_count',
       'PTT_count', 'BUN_count', 'Lactate_count', 'Temp_count', 'Hgb_count',
       'HCO3_count', 'BaseExcess_count', 'RRate_count', 'Fibrinogen_count',
       'Phosphate_count', 'WBC_count', 'Creatinine_count', 'PaCO2_count',
       'AST_count', 'FiO2_count', 'Platelets_count', 'SaO2_count',
       'Glucose_count', 'ABPm_count', 'Magnesium_count', 'Potassium_count',
       'ABPd_count', 'Calcium_count', 'Alkalinephos_count', 'SpO2_count',
       'Bilirubin_direct_count', 'Chloride_count', 'Hct_count',
       'Heartrate_count', 'Bilirubin_total_count', 'Tr

In [125]:
df_training_label = df_training_label
print(df_training_label.iloc[:,[0,-5]].head())

     pid  LABEL_Sepsis
0      1           0.0
1     10           0.0
2    100           0.0
3   1000           0.0
4  10000           0.0


In [126]:
print(x_train.shape)

(18995, 44)


In [129]:
# Train MLPClassifier
#regr =   KNeighborsClassifier(3)
regr = MLPClassifier(alpha=1e-4, hidden_layer_sizes=(200,200,200), random_state=1, solver='sgd', max_iter=300)
#regr = RidgeClassifierCV()
#regr = RandomForestClassifier()
trainNumbers = 16000
regr.fit(x_train.iloc[:trainNumbers,1:], df_training_label.iloc[:trainNumbers,-5].values)
print(regr.classes_)
print(regr.loss_)

[0. 1.]
0.16903217384408292




In [130]:
f = regr.predict_proba(x_train.iloc[trainNumbers:,1:])
f = pd.DataFrame(f)
#f.columns = list(df_training_label.columns[-5])
f.columns = ['False', 'LABEL_Sepsis']
f['pid'] = x_train.iloc[trainNumbers:,0].reset_index(drop=True)

print(f[f['pid'].isin([27001, 27003, 27007, 27010])])

print(get_score(df_training_label.iloc[trainNumbers:,:], f))

Empty DataFrame
Columns: [False, LABEL_Sepsis, pid]
Index: []
0.640378086419753


In [131]:
0.48263668430335094
0.6736342592592591

0.6736342592592591