In [1]:
import numpy as np
import pandas as pd
import os
# explicitly require this experimental feature
from sklearn.experimental import enable_iterative_imputer  # noqa
# now you can import normally from sklearn.impute
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.svm import LinearSVR, SVR, SVC,LinearSVC
from sklearn.datasets import make_regression
from sklearn.preprocessing import normalize 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read in data
df_training_original = pd.read_csv('train_features.csv')
df_training_label = pd.read_csv('train_labels.csv')
all_pids = [pid for pid in df_training_original['pid'].unique()]

In [3]:
def getPatientData(trainingData, pids, patients=0, mode='pid'):
    if mode == 'number':
        pids = all_pids[:patients]
    if len(pids) == 0:
        return trainingData
    #pids = np.array(pids).astype(np.float)
    patients = [trainingData.iloc[idx] for idx in range(0, len(trainingData)) if trainingData['pid'][idx] in pids]    
    #patientTrainingDataIndex = [trainingData.iloc[idx] for idx, col in enumerate(trainingData) if trainingData['pid'][idx] in pids]    
    return pd.DataFrame(patients)

def partitionData(trainingDataPids, trainingPartition=80):
    validationPartition = 100 - trainingPartition
    countTraining = int((trainingPartition/100)*len(trainingDataPids))
    training = trainingDataPids[:countTraining]
    validation = trainingDataPids[countTraining:]
    print('')
    print('Training size: ' + str(countTraining))
    print('Validation size: ' + str(len(validation)))
    return training, validation

def populateData(X,Y):
    Z = pd.merge(X, Y, on='pid')
    YData = Z[Y.columns].iloc[:,1:]
    XData = Z[X.columns].iloc[:,1:]
    return XData, YData
import sklearn.metrics as metrics

TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
def get_score(df_true, df_submission):
    df_submission = df_submission.sort_values('pid')
    df_true = df_true.sort_values('pid')
    
    task1 = np.mean([metrics.roc_auc_score(df_true[entry], df_submission[entry]) for entry in TESTS])
    #task2 = metrics.roc_auc_score(df_true['LABEL_Sepsis'], df_submission['LABEL_Sepsis'])
    #task3 = np.mean([0.5 + 0.5 * np.maximum(0, metrics.r2_score(df_true[entry], df_submission[entry])) for entry in VITALS])
    #score = np.mean([task1, task2, task3])
    return task1

In [4]:

X_pid_train = pd.DataFrame(columns=df_training_original.columns)
for pid in all_pids:
    uniqueData = df_training_original[df_training_original['pid']==pid]
    counts = uniqueData.isna().sum()
    counts = -1*counts + 12
    counts['pid'] = pid
    counts['Age'] = uniqueData['Age'].iloc[0]
    counts = pd.DataFrame(counts).transpose()
    X_pid_train = pd.concat([X_pid_train, counts])
    #counts.drop('Time', inplace=True)
    
    
print(X_pid_train)

      pid Time Age EtCO2 PTT BUN Lactate Temp Hgb HCO3 ... Alkalinephos SpO2  \
0       1   12  34     0   0   3       0    8   3    3 ...            0   12   
0      10   12  71     0   1   1       0    2   1    0 ...            1   11   
0     100   12  68     0   1   2       0    4   2    2 ...            0   12   
0    1000   12  79    11   0   1       2   11   1    0 ...            0   11   
0   10000   12  76     0   2   1       0    4   2    2 ...            0   11   
0   10002   12  73     1   1   1       2   11   1    0 ...            1   11   
0   10006   12  51     0   0   0       0    2   0    0 ...            0    9   
0   10007   12  60     0   0   0       0    2   0    0 ...            0   11   
0   10009   12  69     0   2   1       0    4   1    1 ...            1   11   
0    1001   12  36     0   1   2       4    3   2    2 ...            0    9   
0   10010   12  85     0   0   1       0   10   3    0 ...            0   11   
0   10012   12  63     0   0   0       0

In [9]:
gg3 = pd.merge(X_pid_train, df_training_label )
gg3 = gg3.astype(float)
print(gg3[gg3.columns[:38]].head())
print(len(gg3.columns))

ValueError: You are trying to merge on object and int64 columns. If you wish to proceed you should use pd.concat

In [10]:
plt.figure(figsize=(20,8))
print(gg3.columns[10:38])
corr = gg3[gg3.columns[1:-5]].corr()
f = plt.figure(figsize=(45, 35))
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True, annot=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
plt.show()

NameError: name 'gg3' is not defined

<Figure size 1440x576 with 0 Axes>

In [11]:
#x_train = X_pid_train[['pid', 'PTT', 'HCO3', 'BaseExcess', 'PaCO2', 'FiO2', 'SaO2','Chloride', 'Hct', 'pH']]
x_train = X_pid_train.copy()
print(x_train.columns)
x_train.drop(['ABPs','Heartrate','SpO2','ABPd','ABPm','Age'],axis=1, inplace=True)


Index(['pid', 'Time', 'Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb',
       'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC',
       'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose',
       'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos',
       'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate',
       'Bilirubin_total', 'TroponinI', 'ABPs', 'pH'],
      dtype='object')


In [12]:
print(df_training_label.iloc[:,:-5].head())

     pid  LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  LABEL_Alkalinephos  \
0      1               1.0               0.0        0.0                 0.0   
1     10               0.0               0.0        0.0                 0.0   
2    100               1.0               0.0        0.0                 0.0   
3   1000               0.0               0.0        0.0                 0.0   
4  10000               0.0               0.0        0.0                 0.0   

   LABEL_Bilirubin_total  LABEL_Lactate  LABEL_TroponinI  LABEL_SaO2  \
0                    0.0            1.0              0.0         0.0   
1                    0.0            0.0              0.0         0.0   
2                    0.0            1.0              0.0         0.0   
3                    0.0            1.0              0.0         1.0   
4                    0.0            0.0              0.0         0.0   

   LABEL_Bilirubin_direct  LABEL_EtCO2  
0                     0.0          0.0  
1         

In [13]:
print(x_train.shape)

(18995, 31)


In [None]:
# Train MLPClassifier
#regr =   KNeighborsClassifier(3)
regr = MLPClassifier(alpha=1e-5,hidden_layer_sizes=(100,100), random_state=1, solver='sgd', max_iter=200)
#regr = RidgeClassifierCV()
#regr = RandomForestClassifier()
trainNumbers = 16000
regr.fit(x_train.iloc[:trainNumbers,2:], df_training_label.iloc[:trainNumbers,1:-5])

In [23]:
print(x_train.head())
print(df_training_label.iloc[:5,1:-5])

     pid Time EtCO2 PTT BUN Lactate Temp Hgb HCO3 BaseExcess ... Magnesium  \
0      1   12     0   0   3       0    8   3    3          6 ...         5   
0     10   12     0   1   1       0    2   1    0          0 ...         1   
0    100   12     0   1   2       0    4   2    2          0 ...         2   
0   1000   12    11   0   1       2   11   1    0          0 ...         1   
0  10000   12     0   2   1       0    4   2    2          4 ...         1   

  Potassium Calcium Alkalinephos Bilirubin_direct Chloride Hct  \
0         3       1            0                0        3   6   
0         1       1            1                0        0   1   
0         2       1            0                0        2   2   
0         3       3            0                0        0   1   
0         2       0            0                0        2   2   

  Bilirubin_total TroponinI pH  
0               0         0  7  
0               1         1  0  
0               0         0  0  
0 

In [25]:
f = regr.predict_proba(x_train.iloc[trainNumbers:,2:])
f = pd.DataFrame(f)
f.columns = df_training_label.columns[1:-5]
#f.columns = ['LABEL_BaseExcess1', 'LABEL_BaseExcess']
f['pid'] = x_train.iloc[trainNumbers:,0].reset_index(drop=True)

print(get_score(df_training_label.iloc[trainNumbers:,:], f))
print(f)
f.to_csv('task1_prediction.csv')

0.8086830016803719
      LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  LABEL_Alkalinephos  \
0             0.403712          0.016098   0.113581            0.102396   
1             0.070310          0.055763   0.298484            0.274558   
2             0.755917          0.047127   0.283607            0.345120   
3             0.782757          0.020449   0.122511            0.123943   
4             0.008163          0.014319   0.103829            0.102712   
5             0.322285          0.132939   0.111519            0.126718   
6             0.258863          0.778515   0.800410            0.773003   
7             0.661174          0.018215   0.130791            0.125976   
8             0.059437          0.028559   0.170467            0.170876   
9             0.155965          0.049617   0.225120            0.198213   
10            0.072047          0.019097   0.106332            0.095937   
11            0.007999          0.019309   0.275510            0.301016   
12    

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv_csv'