In [199]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, svm, neighbors, metrics
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import datetime
import pickle

In [2]:
## Read training data, labels, test data
train_features_original = pd.read_csv('dataset/train_features.csv', delimiter=',')
train_labels_original   = pd.read_csv('dataset/train_labels.csv'  , delimiter=',')
test_features_original  = pd.read_csv('dataset/test_features.csv' , delimiter=',')

In [3]:
# sort train data and labels values by pid, so they are the same (because after with the manipulation of data things get nasty)
train_features_original = train_features_original.sort_values(['pid','Time'])
train_labels_original   = train_labels_original.sort_values(['pid'])
test_features_original  = test_features_original.sort_values(['pid','Time'])

# Use pid as index
train_labels_original.set_index("pid", inplace=True)

In [4]:
train_features_original

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7,24.0,...,,100.0,,114.0,24.6,94.0,,,142.0,7.33
1,1,4,34.0,,,,,36.0,,,...,,100.0,,,,99.0,,,125.0,7.33
2,1,5,34.0,,,,,36.0,,,...,,100.0,,,,92.0,,,110.0,7.37
3,1,6,34.0,,,,,37.0,,,...,,100.0,,,,88.0,,,104.0,7.37
4,1,7,34.0,,,,,,,,...,,100.0,,,22.4,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172879,31658,8,60.0,,,,,37.0,,,...,,96.0,,,,71.0,,,127.0,
172880,31658,9,60.0,,,,,,,,...,,,,,,,,,,
172881,31658,10,60.0,,,,,,,,...,,,,,,,,,,
172882,31658,11,60.0,,,,,,,,...,,96.0,,,,71.0,,,135.0,


In [5]:
train_labels_original

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,20.4,99.1,95.4,65.8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.8,78.8,97.4,71.8
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.9,75.1,97.3,80.7
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.7,112.8,97.0,92.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.5,97.0,95.3,101.4
31654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.2,119.2,97.6,91.8
31656,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,21.0,93.8,99.2,92.2
31657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,72.5,98.7,64.0


# Some preprocessing

If the series of 12 measurements has at least 2 non NaN:
* replace the series of 12 values by their average and the slope of a 1D fit

If the series of 12 measurements has exactly 1 non NaN:
* replace the series of 12 values by their average
* a value that should be imputed, see (1)

If the series of 12 measurements has only NaNs:
* a value that should be imputed, see (2)
* a value that should be imputed, see (1)

(1) Replace the value to be imputed by:
* either the average of the slopes that can be computed
* or 0.
* or -9999.

(2) Replace the value to be imputed by:
* either the average of the average that can be computed
* or 0.
* or -9999.

In [6]:
# Make list of PIDs
pids = train_features_original["pid"].drop_duplicates().to_list()
Npatients_original = len(pids)
print("Number of patients: %d" %Npatients_original)
print("Dataset length: %d" %(len(train_features_original)))

Number of patients: 18995
Dataset length: 227940


In [7]:
# Reducing training for speeding up tests
Npatients_lite = Npatients_original//10
pids_lite = pids[:Npatients_lite]

train_features_lite = train_features_original[train_features_original["pid"].isin(pids_lite)]
print("Lite number of patients: %d" %Npatients_lite)
print("Lite dataset length: %d" %(len(train_features_lite)))

train_labels_lite = train_labels_original[train_labels_original.index.isin(pids_lite)]

Lite number of patients: 1899
Lite dataset length: 22788


In [8]:
# Decide here if to use the lite dataset or the whole dataset
train_features = train_features_lite.copy()
train_labels   = train_labels_lite.copy()
pids = pids_lite.copy()

In [9]:
# Make list of feature names
feature_names = [ x for x in train_features.columns  if x not in ("pid", "Time") ]
print("Features:")
print(feature_names)

Features:
['Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH']


In [10]:
# Replace the series of 12 measurements by their average
# This takes a lot of time...
train_dict = {}
for feature_name in feature_names:
#    train_dict[feature_name] = { pid: train_features[feature_name].where(train_features["pid"] == pid).dropna() for pid in pids }
#    train_dict[feature_name + "_n"] = { pid: len(train_dict[feature_name][pid]) for pid in pids }
#    train_dict[feature_name + "_avg"] = { pid: np.mean(train_dict[feature_name][pid]) for pid in pids }
    train_dict[feature_name] = [ train_features[feature_name][train_features["pid"] == pid] for pid in pids ]
    train_dict[feature_name + "_n"] = [ len(x.dropna()) for x in train_dict[feature_name] ]
    train_dict[feature_name + "_avg"] = [ np.mean(x) for x in train_dict[feature_name] ]

In [11]:
train_dict["EtCO2_avg"]

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 45.125,
 nan,
 31.85,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 28.333333333333332,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 33.4444444444

In [12]:
# Replace NaNs of a column by the average of the column
feature_averages = {}
for feature_name in feature_names:
    train_dict[feature_name + "_avg_noNaN"] = [ x if not np.isnan(x) else 0. for x in train_dict[feature_name + "_avg"] ]
    feature_averages[feature_name] = np.average(train_dict[feature_name + "_avg_noNaN"], weights=train_dict[feature_name + "_n"])

In [13]:
feature_averages

{'Age': 62.60084254870985,
 'EtCO2': 32.419786096256686,
 'PTT': 40.182739861523245,
 'BUN': 23.120751802265705,
 'Lactate': 2.835894632206759,
 'Temp': 36.84988536261615,
 'Hgb': 10.567268112598061,
 'HCO3': 23.53391959798995,
 'BaseExcess': -1.0051269288203084,
 'RRate': 18.13236468146256,
 'Fibrinogen': 278.352422907489,
 'Phosphate': 3.5896883593033917,
 'WBC': 12.155412995594716,
 'Creatinine': 1.4935263462651995,
 'PaCO2': 40.925117150890344,
 'AST': 235.66516245487364,
 'FiO2': 0.5489114114114115,
 'Platelets': 208.8068181818182,
 'SaO2': 92.52402402402403,
 'Glucose': 143.01821122938844,
 'ABPm': 81.93786891919397,
 'Magnesium': 2.0312162971839425,
 'Potassium': 4.139645542427497,
 'ABPd': 63.80840467428498,
 'Calcium': 7.104068965517242,
 'Alkalinephos': 109.55045871559633,
 'SpO2': 97.68725079235251,
 'Bilirubin_direct': 1.499625,
 'Chloride': 106.22289156626506,
 'Hct': 31.16950894556528,
 'Heartrate': 84.79644870068333,
 'Bilirubin_total': 1.7437377690802347,
 'TroponinI': 

In [62]:
# Make features to use in training
train_features_preprocessed = pd.DataFrame()

# Add pids
train_features_preprocessed["pid"] = pids
train_features_preprocessed.set_index("pid", inplace=True)

def std_scaler(array):
    mean = np.mean(array)
    std = np.std(array, ddof=1)
    return (array-mean)/std

# Add features
for feature_name in feature_names:
    train_features_preprocessed[feature_name + "_avg"] = train_dict[feature_name + "_avg"]
    train_features_preprocessed[feature_name + "_avg"].replace(np.nan, feature_averages[feature_name], inplace=True)
    # Std scaling
    train_features_preprocessed[feature_name + "_avg"] = std_scaler(train_features_preprocessed[feature_name + "_avg"])
    
    train_features_preprocessed[feature_name + "_n"] = train_dict[feature_name + "_n"]
    
print(train_features_preprocessed.head())
print(len(train_features_preprocessed))

      Age_avg  Age_n  EtCO2_avg  EtCO2_n   PTT_avg  PTT_n   BUN_avg  BUN_n  \
pid                                                                          
1   -1.708968     12  -0.012238        0  0.041580      0 -0.675449      3   
2    1.398155     12  -0.012238        0 -0.494195      1  0.581357      1   
4    0.203108     12  -0.012238        0 -0.315235      1 -0.926810      2   
6    0.203108     12  -0.012238        0  0.911914      1  0.581357      2   
8   -1.230949     12  -0.012238        0  0.041580      0 -0.298407      1   

     Lactate_avg  Lactate_n  ...  Heartrate_avg  Heartrate_n  \
pid                          ...                               
1       0.145337          0  ...      -0.466378           12   
2       0.145337          0  ...      -1.580753           11   
4       0.145337          0  ...      -0.746022           11   
6      -1.004387          2  ...       0.165272           12   
8       0.145337          0  ...      -0.213811           11   

    

# Subtask 3

## Training

In [63]:
# Make target variables dataframe
col_numbers = [0, 11, 12, 13, 14]
train_labels_3 = train_labels.iloc[:, col_numbers]

# Restrict to pid existing in features dataset
train_labels_3 = train_labels_3[train_labels_3.index.isin(pids)]

print(train_labels_3.head())
print(len(train_labels_3))

     LABEL_BaseExcess  LABEL_RRate  LABEL_ABPm  LABEL_SpO2  LABEL_Heartrate
pid                                                                        
1                 1.0         12.1        85.4       100.0             59.9
2                 0.0         20.4        99.1        95.4             65.8
4                 0.0         17.8        78.8        97.4             71.8
6                 1.0         17.9        75.1        97.3             80.7
8                 0.0         18.7       112.8        97.0             92.6
1899


In [96]:
# Select a subset of training features
features_names_used = [ x for x in train_features_preprocessed.columns if x.endswith("_avg") ]
#features_names_used = train_features_preprocessed.columns

print("Used features:")
print(features_names_used)

Used features:
['Age_avg', 'EtCO2_avg', 'PTT_avg', 'BUN_avg', 'Lactate_avg', 'Temp_avg', 'Hgb_avg', 'HCO3_avg', 'BaseExcess_avg', 'RRate_avg', 'Fibrinogen_avg', 'Phosphate_avg', 'WBC_avg', 'Creatinine_avg', 'PaCO2_avg', 'AST_avg', 'FiO2_avg', 'Platelets_avg', 'SaO2_avg', 'Glucose_avg', 'ABPm_avg', 'Magnesium_avg', 'Potassium_avg', 'ABPd_avg', 'Calcium_avg', 'Alkalinephos_avg', 'SpO2_avg', 'Bilirubin_direct_avg', 'Chloride_avg', 'Hct_avg', 'Heartrate_avg', 'Bilirubin_total_avg', 'TroponinI_avg', 'ABPs_avg', 'pH_avg']


In [97]:
# split train data into train and validation
X_train, X_test, y_train, y_test = train_test_split(train_features_preprocessed[features_names_used], train_labels_3, train_size=0.8)

In [98]:
X_train

Unnamed: 0_level_0,Age_avg,EtCO2_avg,PTT_avg,BUN_avg,Lactate_avg,Temp_avg,Hgb_avg,HCO3_avg,BaseExcess_avg,RRate_avg,...,Alkalinephos_avg,SpO2_avg,Bilirubin_direct_avg,Chloride_avg,Hct_avg,Heartrate_avg,Bilirubin_total_avg,TroponinI_avg,ABPs_avg,pH_avg
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
151,0.920136,-0.012238,-0.615631,2.466566,0.145337,-0.436287,1.453595,2.333977,-0.062104,0.018002,...,0.009347,-0.838191,0.045787,-4.652502,1.537544,-0.505127,0.027601,0.007644,-1.055913,-0.072687
1515,-1.111444,-0.012238,0.041580,0.581357,0.549452,0.328697,2.750922,-2.817096,-2.751716,0.849134,...,-0.897340,0.092310,0.045787,2.121661,3.166211,2.512134,-1.001769,0.007644,-0.495975,-1.814396
3125,-0.812682,-0.012238,0.041580,-0.738289,0.145337,-0.436287,-0.766276,1.230175,-0.062104,-1.655647,...,0.009347,0.092310,0.045787,-0.858971,-0.493263,0.800190,0.027601,0.007644,-0.244477,-0.072687
1988,-2.485748,-0.012238,0.041580,0.078635,0.145337,-0.784007,1.924477,0.494308,-0.062104,3.934568,...,-0.997210,-0.553871,0.045787,-1.671870,1.808988,1.108312,-0.535006,-0.702220,-0.595625,-0.072687
2881,-0.155406,-0.012238,-0.487803,-0.518348,0.031505,0.583692,-0.016709,-0.609493,-1.181434,-0.278018,...,0.009347,1.123614,0.045787,1.173278,-0.540180,0.345010,0.027601,0.007644,-0.692111,-0.389887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,1.517659,-0.012238,3.666608,-0.424087,0.145337,-1.711261,0.098609,-0.977427,-0.957108,-1.430217,...,1.157137,0.287715,0.045787,-0.994454,0.029519,0.822599,-0.468326,0.007644,-1.790001,-0.715489
266,0.322612,-0.012238,-0.705111,-0.424087,-2.225260,1.552672,-0.881594,1.966043,1.061827,0.086314,...,0.009347,1.055377,0.045787,-0.317038,-0.714440,-0.017734,0.027601,0.007644,0.690335,0.139216
1678,0.023851,-0.012238,0.041580,0.023383,0.145337,0.328697,-0.035582,-0.045113,-0.062104,-1.001621,...,0.009347,-0.698616,0.045787,0.014325,-0.097254,0.391228,0.027601,0.007644,-0.771199,-0.072687
1731,1.577412,-0.012238,0.041580,-0.486928,0.145337,-0.181292,-0.074368,-0.045113,-0.062104,0.598656,...,0.672052,0.492425,-3.761337,0.014325,0.391445,0.927360,-0.935088,1.896201,1.153944,-0.072687


In [99]:
y_train

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
151,0.0,16.2,63.7,96.6,72.2
1515,1.0,15.1,97.9,98.0,130.7
3125,0.0,13.7,65.1,97.4,99.9
1988,0.0,26.1,68.0,95.7,97.3
2881,1.0,21.7,67.0,97.4,87.0
...,...,...,...,...,...
138,1.0,13.1,69.8,98.4,94.3
266,1.0,14.7,85.2,99.0,88.0
1678,0.0,14.9,73.4,93.4,89.9
1731,0.0,19.6,88.4,98.6,88.1


In [294]:
## Regressor
#reg = svm.SVR()
#reg = svm.SVR(kernel='rbf')
#reg = svm.SVR(kernel='poly')
#reg = svm.SVR(kernel='sigmoid')

#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="distance")
#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="uniform")
# whether using distance or uniform weight does not matter much

#reg = Ridge(alpha = 100, fit_intercept=True)
reg = Lasso(alpha = 0.5, fit_intercept=True)

In [295]:
# Fit to the training data
# For 1 column for now
#col = 'LABEL_RRate'
col = 'LABEL_ABPm'
#col = 'LABEL_Heartrate'

print(datetime.datetime.now())
reg.fit(X_train, y_train[col])
print(datetime.datetime.now())

2021-04-21 00:06:12.687353
2021-04-21 00:06:12.705506


In [296]:
# Prediction to evaluate the model
y_pred = reg.predict(X_test)
print(y_pred[:10])
print("R2 score: %.2f" %(metrics.r2_score(y_test[col], y_pred)))

[78.06376494 97.30715793 95.07806886 78.26698449 94.53774283 91.95624651
 74.07081679 95.66516942 76.81231436 83.1000878 ]
R2 score: 0.60


### Summary of tests

#### Avg variables only

**LABEL_RRate**     
SVR        : R2 = 0.31     
SVR rbf    : R2 = 0.31     
SVR poly   : R2 = -0.32    
SVR sigmoid: R2 = -0.44     
knn 25     : R2 = 0.20     
Ridge 0.1  : R2 = 0.38      
Ridge 100  : R2 = 0.39    
Lasso 0.1  : R2 = 0.41

**LABEL_ABPm**     
SVR        : R2 = 0.46     
SVR rbf    : R2 = 0.46     
SVR poly   : R2 = 0.13     
SVR sigmoid: R2 = 0.56     
knn 25     : R2 = 0.45          
Ridge 0.1  : R2 = 0.59     
Ridge 100  : R2 = 0.59    
Lasso 0.1  : R2 = 0.60

**LABEL_Hearteate**     
SVR        : R2 = 0.44     
SVR rbf    : R2 = 0.44    
SVR poly   : R2 = 0.19     
SVR sigmoid: R2 = 0.52     
knn 25     : R2 = 0.36     
Ridge 0.1  : R2 = 0.55    
Ridge 100  : R2 = 0.55    
Lasso 0.1  : R2 = 0.56

**Conclusions**
* poly looks bad
* sigmoid sometimes does not work well
* no difference by linear and gaussian kernel
* Ridge works quite well for a wide range of alpha values
* Lasso works slightly better than Ridge for small alpha (<1)
  => Some variables do not bring much information
  => More complex model with L1 regularisation?



In [94]:
#save model into file
filename = '3_'+col
pickle.dump(reg, open(filename, 'wb'))