In [1]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read in data
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]

In [3]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData
import sklearn.metrics as metrics

TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    
    task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    #task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    #task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    #score = np.mean([task1, task2, task3])
    return task1

In [4]:

X_pid_train = pd.DataFrame(columns=df_training_original.columns)
for pid in all_pids:
    uniqueData = df_training_original[df_training_original['pid']==pid]
    counts = uniqueData.isna().sum()
    counts = -1*counts + 12
    counts['pid'] = pid
    counts['Age'] = uniqueData['Age'].iloc[0]
    counts = pd.DataFrame(counts).transpose()
    X_pid_train = pd.concat([X_pid_train, counts])
    #counts.drop('Time', inplace=True)
    
    
print(X_pid_train)

      pid Time Age EtCO2 PTT BUN Lactate Temp Hgb HCO3 ... Alkalinephos SpO2  \
0       1   12  34     0   0   3       0    8   3    3 ...            0   12   
0      10   12  71     0   1   1       0    2   1    0 ...            1   11   
0     100   12  68     0   1   2       0    4   2    2 ...            0   12   
0    1000   12  79    11   0   1       2   11   1    0 ...            0   11   
0   10000   12  76     0   2   1       0    4   2    2 ...            0   11   
0   10002   12  73     1   1   1       2   11   1    0 ...            1   11   
0   10006   12  51     0   0   0       0    2   0    0 ...            0    9   
0   10007   12  60     0   0   0       0    2   0    0 ...            0   11   
0   10009   12  69     0   2   1       0    4   1    1 ...            1   11   
0    1001   12  36     0   1   2       4    3   2    2 ...            0    9   
0   10010   12  85     0   0   1       0   10   3    0 ...            0   11   
0   10012   12  63     0   0   0       0

In [51]:
X_pid_train = X_pid_train.sort_values(['pid'])
print(X_pid_train)

      pid Time Age EtCO2 PTT BUN Lactate Temp Hgb HCO3 ... Alkalinephos SpO2  \
0       1   12  34     0   0   3       0    8   3    3 ...            0   12   
0       2   12  86     0   1   1       0    3   1    0 ...            0   11   
0       4   12  66     0   1   2       0    3   2    0 ...            1   11   
0       6   12  66     0   1   2       2   12   6    2 ...            0   12   
0       8   12  42     0   0   1       0    3   0    0 ...            0   10   
0      10   12  71     0   1   1       0    2   1    0 ...            1   11   
0      13   12  73     0   1   2       0    5   2    2 ...            0   12   
0      14   12  37     0   0   1       0    3   1    0 ...            1    3   
0      18   12  70     0   1   1       0    4   1    1 ...            0   12   
0      20   12  77     0   0   1       0    2   1    0 ...            1   11   
0      23   12  82     0   1   3       2    0   3    3 ...            0   12   
0      24   12  85     0   1   1       2

In [9]:
gg3 = pd.merge(X_pid_train, df_training_label )
gg3 = gg3.astype(float)
print(gg3[gg3.columns[:38]].head())
print(len(gg3.columns))

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [10]:
plt.figure(figsize=(20,8))
print(gg3.columns[10:38])
corr = gg3[gg3.columns[1:-5]].corr()
f = plt.figure(figsize=(45, 35))
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True, annot=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
plt.show()

NameError: name 'gg3' is not defined

<Figure size 1440x576 with 0 Axes>

In [37]:
#x_train = X_pid_train[['pid', 'PTT', 'HCO3', 'BaseExcess', 'PaCO2', 'FiO2', 'SaO2','Chloride', 'Hct', 'pH']]
x_train = X_pid_train.copy()
print(x_train.columns)
x_train = x_train[['pid','ABPm','RRate','FiO2','Platelets','Bilirubin_total','Creatinine']]


Index(['pid', 'Time', 'Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb',
       'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC',
       'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose',
       'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos',
       'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate',
       'Bilirubin_total', 'TroponinI', 'ABPs', 'pH'],
      dtype='object')


In [45]:
print(df_training_label.iloc[:,-5:-4].head())

   LABEL_Sepsis
0           0.0
1           0.0
2           0.0
3           0.0
4           0.0


In [49]:
print(x_train.shape)

(18995, 7)


In [46]:
# Train MLPClassifier
#regr =   KNeighborsClassifier(3)
regr = MLPClassifier(alpha=1e-5,hidden_layer_sizes=(100,100), random_state=1, solver='sgd', max_iter=200)
#regr = RidgeClassifierCV()
#regr = RandomForestClassifier()
trainNumbers = 16000
regr.fit(x_train.iloc[:trainNumbers,:], df_training_label.iloc[:trainNumbers,-5:-4])

  y = column_or_1d(y, warn=True)


MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100, 100), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=1, shuffle=True, solver='sgd',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [47]:
print(x_train.head())
print(df_training_label.iloc[:5,-5:-4])

     pid ABPm RRate FiO2 Platelets Bilirubin_total Creatinine
0      1   12    12    4         1               0          1
0     10   11    11    0         1               1          1
0    100   12    12    0         1               0          1
0   1000   11    10    1         1               0          1
0  10000   11    11    6         2               0          2
   LABEL_Sepsis
0           0.0
1           0.0
2           0.0
3           0.0
4           0.0


In [48]:
f = regr.predict_proba(x_train.iloc[trainNumbers:,:])
f = pd.DataFrame(f, index=None)
print(f)
f.columns = list(df_training_label.columns[-5:-4])
#f.columns = ['LABEL_BaseExcess1', 'LABEL_BaseExcess']
f['pid'] = x_train.iloc[trainNumbers:,0].reset_index(drop=True)
get_score(df_training_label.iloc[trainNumbers:,:], f)

             0         1
0     0.935409  0.064591
1     0.935409  0.064591
2     0.935409  0.064591
3     0.935409  0.064591
4     0.935409  0.064591
5     0.935409  0.064591
6     0.935409  0.064591
7     0.935409  0.064591
8     0.935409  0.064591
9     0.935409  0.064591
10    0.935409  0.064591
11    0.935409  0.064591
12    0.935409  0.064591
13    0.935409  0.064591
14    0.935409  0.064591
15    0.935409  0.064591
16    0.935409  0.064591
17    0.935409  0.064591
18    0.935409  0.064591
19    0.935409  0.064591
20    0.984129  0.015871
21    0.935409  0.064591
22    0.935409  0.064591
23    0.935409  0.064591
24    0.935409  0.064591
25    0.935409  0.064591
26    0.935409  0.064591
27    0.935409  0.064591
28    0.935409  0.064591
29    0.935409  0.064591
...        ...       ...
2965  0.935409  0.064591
2966  0.935409  0.064591
2967  0.935409  0.064591
2968  0.935409  0.064591
2969  0.935409  0.064591
2970  0.935409  0.064591
2971  0.935409  0.064591
2972  0.935409  0.064591


ValueError: Length mismatch: Expected axis has 2 elements, new values have 1 elements