In [22]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Read in data
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]

In [5]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData
import sklearn.metrics as metrics

TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    
    #task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    #task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    #score = np.mean([task1, task2, task3])
    return task3

In [6]:

X_pid_train = pd.DataFrame(columns=df_training_original.columns)
for i, pid in enumerate(all_pids):
    uniqueData = df_training_original[df_training_original['pid']==pid]
    means = uniqueData.mean()
    means['pid'] = pid
    means['Age'] = uniqueData['Age'].iloc[0]
    means = pd.DataFrame(means).transpose()
    X_pid_train = pd.concat([X_pid_train, means])
    #counts.drop('Time', inplace=True)
    if i % round(0.1*len(all_pids)) == 0:
        print(round(i/len(all_pids),2)*100, '%')
    del means
    

print(X_pid_train)

0.0 %
3.0 %
5.0 %
8.0 %
11.0 %
13.0 %
16.0 %
18.0 %
21.0 %
24.0 %
26.0 %
28.999999999999996 %
32.0 %
34.0 %
37.0 %
39.0 %
42.0 %
45.0 %
47.0 %
50.0 %
53.0 %
55.00000000000001 %
57.99999999999999 %
61.0 %
63.0 %
66.0 %
68.0 %
71.0 %
74.0 %
76.0 %
79.0 %
82.0 %
84.0 %
87.0 %
89.0 %
92.0 %
95.0 %
97.0 %
        pid  Time   Age      EtCO2    PTT    BUN  Lactate       Temp  \
0       1.0   8.5  34.0        NaN    NaN   12.0      NaN  36.750000   
0      10.0   6.5  71.0        NaN  27.80   12.0      NaN  36.000000   
0     100.0   7.5  68.0        NaN  20.90   21.0      NaN  36.250000   
0    1000.0   6.5  79.0  31.863636    NaN   22.0    3.855  36.818182   
0   10000.0   6.5  76.0        NaN  28.55   22.0      NaN  36.750000   
0   10002.0   6.5  73.0  19.000000  31.30   18.0    3.005  37.000000   
0   10006.0   6.5  51.0        NaN    NaN    NaN      NaN  37.500000   
0   10007.0   6.5  60.0        NaN    NaN    NaN      NaN  38.000000   
0   10009.0   6.5  69.0        NaN  86.05   15.0  

In [7]:
#x_train = X_pid_train[['pid', 'PTT', 'HCO3', 'BaseExcess', 'PaCO2', 'FiO2', 'SaO2','Chloride', 'Hct', 'pH']]
x_train = X_pid_train.copy()
x_train = x_train[['pid','RRate','Heartrate','ABPm','SpO2']].fillna(0)
x_train = x_train.sort_values(['pid'])
print(x_train.head())

   pid      RRate  Heartrate        ABPm        SpO2
0  1.0  17.000000  77.083333   68.333333  100.000000
0  2.0  18.000000  59.000000   94.636364   96.000000
0  4.0  14.636364  72.545455   80.909091   99.272727
0  6.0  15.833333  87.333333   65.750000   99.333333
0  8.0  17.181818  81.181818  143.900000   97.800000


In [8]:
df_training_label = df_training_label.sort_values(['pid'])
print(df_training_label.iloc[:,[0,-4,-3,-2,-1]].head())

       pid  LABEL_RRate  LABEL_ABPm  LABEL_SpO2  LABEL_Heartrate
0        1         12.1        85.4       100.0             59.9
6622     2         20.4        99.1        95.4             65.8
15008    4         17.8        78.8        97.4             71.8
16335    6         17.9        75.1        97.3             80.7
17676    8         18.7       112.8        97.0             92.6


In [9]:
print(x_train.shape)

(18995, 5)


In [28]:
# Train MLPClassifier
#regr =   KNeighborsClassifier(3)
regr = MLPRegressor(alpha=1e-4, hidden_layer_sizes=(100,200,200), random_state=1, solver='adam', max_iter=100)
#regr = RidgeClassifierCV()
#regr = RandomForestClassifier()
trainNumbers = 16000
regr.fit(x_train.iloc[:trainNumbers,1:], df_training_label.iloc[:trainNumbers,-4:])
print(regr.loss_)

20.481147173124107


In [29]:
print(x_train.iloc[:20,1:])
print(df_training_label.iloc[:20,-4:])

       RRate   Heartrate        ABPm        SpO2
0  17.000000   77.083333   68.333333  100.000000
0  18.000000   59.000000   94.636364   96.000000
0  14.636364   72.545455   80.909091   99.272727
0  15.833333   87.333333   65.750000   99.333333
0  17.181818   81.181818  143.900000   97.800000
0  18.090909   78.818182  101.727273   98.000000
0  16.583333   82.833333   79.916667   99.833333
0  18.333333   85.285714   82.833333   94.666667
0  13.000000   59.166667   87.750000   99.583333
0  32.444444  115.454545   91.181818   98.000000
0  12.583333   82.583333   85.500000  100.000000
0  11.333333   80.833333   66.250000   99.181818
0  12.666667   81.100000   79.200000   97.500000
0  20.333333   98.333333   94.916667   95.416667
0  18.700000   82.545455   74.727273   94.727273
0  17.875000   66.222222   79.111111   99.666667
0  17.583333   79.500000   93.833333   95.000000
0  16.777778   52.300000   74.100000   96.900000
0  14.444444   65.222222   68.000000   99.555556
0  14.300000   76.27

In [30]:
f = regr.predict_proba(x_train.iloc[trainNumbers:,1:])
f = pd.DataFrame(f)
#f.columns = list(df_training_label.columns[-5])
f.columns = ['False', 'LABEL_Sepsis']
f['pid'] = x_train.iloc[trainNumbers:,0].reset_index(drop=True)


# pid 27001(T), 27002(F), 27007(F), 27010(T)
print(f[f['pid'].isin([27001, 27003, 27007, 27010])])

get_score(df_training_label.iloc[trainNumbers:,:], f)

AttributeError: 'MLPRegressor' object has no attribute 'predict_proba'