In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split

In [2]:
## Read training data, labels, test data
train_data = pd.read_csv('dataset/train_features.csv', delimiter=',')
train_labels = pd.read_csv('dataset/train_labels.csv', delimiter=',')
test_data = pd.read_csv('dataset/test_features.csv', delimiter=',')

In [3]:
train_data

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7,24.0,...,,100.0,,114.0,24.6,94.0,,,142.0,7.33
1,1,4,34.0,,,,,36.0,,,...,,100.0,,,,99.0,,,125.0,7.33
2,1,5,34.0,,,,,36.0,,,...,,100.0,,,,92.0,,,110.0,7.37
3,1,6,34.0,,,,,37.0,,,...,,100.0,,,,88.0,,,104.0,7.37
4,1,7,34.0,,,,,,,,...,,100.0,,,22.4,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227935,9999,8,85.0,,,,,,,,...,,,,,,80.0,,,110.0,
227936,9999,9,85.0,,,,,,,,...,,,,,,83.0,,,123.0,
227937,9999,10,85.0,,,,,36.0,,,...,,98.0,,,,80.0,,,138.0,
227938,9999,11,85.0,,,,,,10.2,,...,,98.0,,,31.0,75.0,,,125.0,


In [4]:
train_labels

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,100.6,95.5,85.5
2,100,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16.5,88.3,96.5,108.1
3,1000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,19.4,77.2,98.3,80.9
4,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.6,76.8,97.7,95.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18990,9993,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17.1,69.8,100.0,110.7
18991,9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.6,97.3,97.8,59.2
18992,9996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.3,66.3,96.9,100.3
18993,9998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,18.8,81.5,96.9,99.4


# Sub Task 1

In [5]:
# labels needed for this subtask only up to LABEL_EtCO2
train_labels_1 = train_labels.iloc[:,:11]

In [6]:
train_labels_1

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2
0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
18990,9993,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
18991,9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18992,9996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18993,9998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## How to replace Nan
### Replace with number -9999

In [7]:
# replace nan in training data with -9999
train_data_1 = train_data.iloc[:,:]
train_data_1 = train_data_1.replace(np.nan,-9999)

In [8]:
test_data_1 = test_data.iloc[:,:]
test_data_1 = test_data_1.replace(np.nan,-9999)

In [9]:
# Time Stamp between 1 and 12 (I simply replace it with numbers from 1 to 12)
# To check :
    # Are time stamps for a single patient consecutive?
    # Are the times for each patient in order?
    
train_data_1['Time']= np.array([[1,2,3,4,5,6,7,8,9,10,11,12] for i in range(int(len(train_data_1['Time'])/12))]).flatten()

test_data_1['Time']= np.array([[1,2,3,4,5,6,7,8,9,10,11,12] for i in range(int(len(test_data_1['Time'])/12))]).flatten()

In [10]:
# now we flatten the dataframe for each patient using the pivot function on the column Time
columns = train_data_1.columns
print(columns[2:])

train_data_1 = train_data_1.pivot(index='pid', columns='Time', values=columns[2:])
test_data_1 = test_data_1.pivot(index='pid', columns='Time', values=columns[2:])

Index(['Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3',
       'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine',
       'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm',
       'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2',
       'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total',
       'TroponinI', 'ABPs', 'pH'],
      dtype='object')


In [16]:
# split train data into train and validation
X_train, X_test, y_train, y_test = train_test_split(train_data_1,train_labels_1, train_size=0.8)

In [10]:
train_data_1

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Age,Age,...,pH,pH,pH,pH,pH,pH,pH,pH,pH,pH
Time,1,2,3,4,5,6,7,8,9,10,...,3,4,5,6,7,8,9,10,11,12
pid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,...,7.37,7.37,7.41,-9999.00,-9999.00,-9999.00,-9999.00,7.39,7.39,-9999.0
2,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,...,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0
4,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,...,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0
6,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,...,-9999.00,7.33,7.35,7.34,7.39,7.37,7.34,-9999.00,-9999.00,-9999.0
8,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,42.0,...,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31653,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,52.0,...,-9999.00,7.33,7.33,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0
31654,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,66.0,...,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0
31656,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,44.0,...,-9999.00,7.28,-9999.00,7.34,-9999.00,7.33,-9999.00,-9999.00,-9999.00,-9999.0
31657,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,...,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.00,-9999.0


In [12]:
test_data_1

Unnamed: 0_level_0,Age,Age,Age,Age,Age,Age,Age,Age,Age,Age,...,pH,pH,pH,pH,pH,pH,pH,pH,pH,pH
Time,1,2,3,4,5,6,7,8,9,10,...,3,4,5,6,7,8,9,10,11,12
pid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,39.0,...,7.34,-9999.0,-9999.0,-9999.0,-9999.00,7.4,-9999.00,-9999.0,-9999.00,-9999.0
3,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0
5,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,62.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0
7,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,...,-9999.00,-9999.0,7.4,-9999.0,7.36,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0
9,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,51.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31647,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0
31649,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,...,7.26,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,7.3
31651,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,7.33,-9999.0,7.38,-9999.0
31652,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,...,-9999.00,-9999.0,-9999.0,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0,-9999.00,-9999.0


## Which model to use?
### SVM

In [30]:
# Try with only one binary classification problem for now, after wee need to do all of them

# kernel = sigmoid ; but the result is not a value between [0,1] but it is 0 or 1. Why? See y_pred later
classifier = svm.SVC(kernel = 'sigmoid')
#fit to the trainin data
classifier.fit(X_train,y_train['LABEL_BaseExcess'])
y_pred = classifier.predict(X_test)
print("ROC AUC: %.2e" %(np.sqrt(metrics.roc_auc_score(y_test['LABEL_BaseExcess'], y_pred))))

ROC AUC: 7.06e-01


In [29]:
# I want this to be between 0 and 1, why it is not?
y_pred

array([0., 1., 0., ..., 0., 0., 0.])

In [None]:
#####################################################################

In [None]:
# We actually need to have one model for each binary classification problem, so we have to loop (but for)

for item in train_labels_1:
    MODELS