In [606]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, metrics
from sklearn import svm, neighbors, neural_network
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import datetime
import pickle

In [607]:
## Read training data, labels, test data
train_features_original = pd.read_csv('dataset/train_features.csv', delimiter=',')
train_labels_original   = pd.read_csv('dataset/train_labels.csv'  , delimiter=',')
test_features_original  = pd.read_csv('dataset/test_features.csv' , delimiter=',')

In [608]:
# sort train data and labels values by pid, so they are the same (because after with the manipulation of data things get nasty)
train_features_original = train_features_original.sort_values(['pid','Time'])
train_labels_original   = train_labels_original.sort_values(['pid'])
test_features_original  = test_features_original.sort_values(['pid','Time'])

# Use pid as index
train_labels_original.set_index("pid", inplace=True)

In [609]:
train_features_original

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7,24.0,...,,100.0,,114.0,24.6,94.0,,,142.0,7.33
1,1,4,34.0,,,,,36.0,,,...,,100.0,,,,99.0,,,125.0,7.33
2,1,5,34.0,,,,,36.0,,,...,,100.0,,,,92.0,,,110.0,7.37
3,1,6,34.0,,,,,37.0,,,...,,100.0,,,,88.0,,,104.0,7.37
4,1,7,34.0,,,,,,,,...,,100.0,,,22.4,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172879,31658,8,60.0,,,,,37.0,,,...,,96.0,,,,71.0,,,127.0,
172880,31658,9,60.0,,,,,,,,...,,,,,,,,,,
172881,31658,10,60.0,,,,,,,,...,,,,,,,,,,
172882,31658,11,60.0,,,,,,,,...,,96.0,,,,71.0,,,135.0,


In [610]:
train_labels_original

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,20.4,99.1,95.4,65.8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.8,78.8,97.4,71.8
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.9,75.1,97.3,80.7
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.7,112.8,97.0,92.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.5,97.0,95.3,101.4
31654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.2,119.2,97.6,91.8
31656,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,21.0,93.8,99.2,92.2
31657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,72.5,98.7,64.0


# Some preprocessing

If the series of 12 measurements has at least 2 non NaN:
* replace the series of 12 values by their average and the slope of a 1D fit

If the series of 12 measurements has exactly 1 non NaN:
* replace the series of 12 values by their average
* a value that should be imputed, see (1)

If the series of 12 measurements has only NaNs:
* a value that should be imputed, see (2)
* a value that should be imputed, see (1)

(1) Replace the value to be imputed by:
* either the average of the slopes that can be computed
* or 0.
* or -9999.

(2) Replace the value to be imputed by:
* either the average of the average that can be computed
* or 0.
* or -9999.

In [611]:
# Make list of PIDs
pids = train_features_original["pid"].drop_duplicates().to_list()
Npatients_original = len(pids)
print("Number of patients: %d" %Npatients_original)
print("Dataset length: %d" %(len(train_features_original)))

Number of patients: 18995
Dataset length: 227940


In [612]:
# Reducing training for speeding up tests
Npatients_lite = Npatients_original//10
pids_lite = pids[:Npatients_lite]

train_features_lite = train_features_original[train_features_original["pid"].isin(pids_lite)]
print("Lite number of patients: %d" %Npatients_lite)
print("Lite dataset length: %d" %(len(train_features_lite)))

train_labels_lite = train_labels_original[train_labels_original.index.isin(pids_lite)]

Lite number of patients: 189
Lite dataset length: 2268


In [619]:
# Decide here if to use the lite dataset or the whole dataset
train_features = train_features_lite.copy()
train_labels   = train_labels_lite.copy()
pids = pids_lite.copy()
Npatients = Npatients_lite

In [617]:
# Make list of feature names
feature_names = [ x for x in train_features.columns  if x not in ("pid", "Time") ]
print("Features:")
print(feature_names)

Features:
['Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH']


In [620]:
# Replace the series of 12 measurements by their average
# This takes a lot of time...
print(datetime.datetime.now())

def make_linear_fit(x, y):
    if len(x) < 2:
        return np.nan
    else:
        return np.polyfit(x, y, 1)[0]


train_dict = {}
times = list(range(12))
for feature_name in feature_names:
    if feature_name == "Age":
        train_dict[feature_name + "_n"] = [12] * Npatients
        train_dict[feature_name + "_avg"] = [ np.mean(train_features["Age"][train_features["pid"] == pid]) for pid in pids ]
        train_dict[feature_name + "_slope"] = [0.] * Npatients
    else:
        train_dict[feature_name] = [ train_features[feature_name][train_features["pid"] == pid] for pid in pids ]
        train_dict[feature_name + "_n"] = [ len(x.dropna()) for x in train_dict[feature_name] ]
        train_dict[feature_name + "_avg"] = [ np.mean(x) for x in train_dict[feature_name] ]
        train_dict[feature_name + "_slope"] = [ make_linear_fit(train_features["Time"].where((train_features["pid"] == pid) & (train_features[feature_name].notna())).dropna(), x.dropna()) for x, pid in zip(train_dict[feature_name], pids) ]
    print("%s finished" %feature_name)
    
print(datetime.datetime.now())

2021-04-21 11:09:27.707631
Age finished
EtCO2 finished
PTT finished
BUN finished
Lactate finished
Temp finished
Hgb finished
HCO3 finished
BaseExcess finished
RRate finished
Fibrinogen finished
Phosphate finished
WBC finished
Creatinine finished
PaCO2 finished
AST finished
FiO2 finished
Platelets finished
SaO2 finished
Glucose finished
ABPm finished
Magnesium finished
Potassium finished
ABPd finished
Calcium finished
Alkalinephos finished
SpO2 finished
Bilirubin_direct finished
Chloride finished
Hct finished
Heartrate finished
Bilirubin_total finished
TroponinI finished
ABPs finished
pH finished
2021-04-21 11:10:37.161090


In [622]:
train_dict["EtCO2_slope"]

[nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 -3.2428571428571438,
 nan,
 -0.9655647382920121,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 4.999999999999998,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,


In [623]:
# Replace NaNs of a column by the average of the column
feature_averages = {}
for feature_name in feature_names:
    train_dict[feature_name + "_avg_noNaN"]   = [ x if not np.isnan(x) else 0. for x in train_dict[feature_name + "_avg"] ]
    train_dict[feature_name + "_slope_noNaN"] = [ x if not np.isnan(x) else 0. for x in train_dict[feature_name + "_slope"] ]
    feature_averages[feature_name + "_avg"] = np.average(train_dict[feature_name + "_avg_noNaN"], weights=train_dict[feature_name + "_n"])
    feature_averages[feature_name + "_slope"] = np.average(train_dict[feature_name + "_slope_noNaN"], weights=train_dict[feature_name + "_n"])

In [624]:
feature_averages

{'Age_avg': 62.44973544973545,
 'Age_slope': 0.0,
 'EtCO2_avg': 35.294871794871796,
 'EtCO2_slope': -0.16594994926635306,
 'PTT_avg': 36.05441176470588,
 'PTT_slope': -0.8501271041480015,
 'BUN_avg': 21.67910447761194,
 'BUN_slope': -0.09898648467565091,
 'Lactate_avg': 2.5327380952380953,
 'Lactate_slope': -0.17932941404181996,
 'Temp_avg': 36.824644549763036,
 'Temp_slope': 0.07659833511530002,
 'Hgb_avg': 10.378577981651373,
 'Hgb_slope': 0.012051875139421214,
 'HCO3_avg': 23.819444444444443,
 'HCO3_slope': 0.05147447051297791,
 'BaseExcess_avg': -0.6544502617801047,
 'BaseExcess_slope': -0.050788017103961317,
 'RRate_avg': 18.24170616113744,
 'RRate_slope': 0.061502881753095526,
 'Fibrinogen_avg': 263.08,
 'Fibrinogen_slope': 8.162688842688835,
 'Phosphate_avg': 3.463084112149532,
 'Phosphate_slope': -0.008184043370959265,
 'WBC_avg': 11.964054054054056,
 'WBC_slope': -0.03881272596509771,
 'Creatinine_avg': 1.5739053254437871,
 'Creatinine_slope': -0.009570741966141856,
 'PaCO2_av

In [625]:
# Make features to use in training
train_features_preprocessed = pd.DataFrame()

# Add pids
train_features_preprocessed["pid"] = pids
train_features_preprocessed.set_index("pid", inplace=True)

def std_scaler(array):
    mean = np.mean(array)
    std = np.std(array, ddof=1)
    if std != 0:
        return (array-mean)/std
    else:
        return array

# Add features
for feature_name in feature_names:
    train_features_preprocessed[feature_name + "_avg"] = train_dict[feature_name + "_avg"]
    train_features_preprocessed[feature_name + "_avg"].replace(np.nan, feature_averages[feature_name + "_avg"], inplace=True)
    # Std scaling
    train_features_preprocessed[feature_name + "_avg"] = std_scaler(train_features_preprocessed[feature_name + "_avg"])

    if feature_name != "Age":
        train_features_preprocessed[feature_name + "_n"] = train_dict[feature_name + "_n"]
        train_features_preprocessed[feature_name + "_slope"] = train_dict[feature_name + "_slope"]
        train_features_preprocessed[feature_name + "_slope"].replace(np.nan, feature_averages[feature_name + "_slope"], inplace=True)
        # Std scaling
        train_features_preprocessed[feature_name + "_slope"] = std_scaler(train_features_preprocessed[feature_name + "_slope"])
    
print(train_features_preprocessed.head())
print(len(train_features_preprocessed))

      Age_avg  EtCO2_avg  EtCO2_n  EtCO2_slope   PTT_avg  PTT_n  PTT_slope  \
pid                                                                          
1   -1.674982  -0.011287        0    -0.013764  0.012721      0   0.078615   
2    1.386525  -0.011287        0    -0.013764 -0.275932      1   0.078615   
4    0.209022  -0.011287        0    -0.013764 -0.085958      1   0.078615   
6    0.209022  -0.011287        0    -0.013764  1.216726      1   0.078615   
8   -1.203981  -0.011287        0    -0.013764  0.012721      0   0.078615   

      BUN_avg  BUN_n  BUN_slope  ...  Bilirubin_total_slope  TroponinI_avg  \
pid                              ...                                         
1   -0.656081      3   0.353838  ...               0.072162       0.023973   
2    0.599157      1   0.068591  ...               0.072162      -0.937412   
4   -0.907128      2   0.353838  ...               0.072162      -0.953739   
6    0.599157      2   0.353838  ...               0.072162    

# Subtask 3

## Training

In [626]:
# Make target variables dataframe
col_numbers = [0, 11, 12, 13, 14]
train_labels_3 = train_labels.iloc[:, col_numbers]

# Restrict to pid existing in features dataset
train_labels_3 = train_labels_3[train_labels_3.index.isin(pids)]

print(train_labels_3.head())
print(len(train_labels_3))

     LABEL_BaseExcess  LABEL_RRate  LABEL_ABPm  LABEL_SpO2  LABEL_Heartrate
pid                                                                        
1                 1.0         12.1        85.4       100.0             59.9
2                 0.0         20.4        99.1        95.4             65.8
4                 0.0         17.8        78.8        97.4             71.8
6                 1.0         17.9        75.1        97.3             80.7
8                 0.0         18.7       112.8        97.0             92.6
189


In [627]:
# Select a subset of training features
#features_names_used = [ x for x in train_features_preprocessed.columns if x.endswith("_avg") ]
features_names_used = [ x for x in train_features_preprocessed.columns if (x.endswith("_avg") or x.endswith("_slope")) ]
#features_names_used = train_features_preprocessed.columns

print("Used features:")
print(features_names_used)
print("Number of used features: %d" %(len(features_names_used)))

Used features:
['Age_avg', 'EtCO2_avg', 'EtCO2_slope', 'PTT_avg', 'PTT_slope', 'BUN_avg', 'BUN_slope', 'Lactate_avg', 'Lactate_slope', 'Temp_avg', 'Temp_slope', 'Hgb_avg', 'Hgb_slope', 'HCO3_avg', 'HCO3_slope', 'BaseExcess_avg', 'BaseExcess_slope', 'RRate_avg', 'RRate_slope', 'Fibrinogen_avg', 'Fibrinogen_slope', 'Phosphate_avg', 'Phosphate_slope', 'WBC_avg', 'WBC_slope', 'Creatinine_avg', 'Creatinine_slope', 'PaCO2_avg', 'PaCO2_slope', 'AST_avg', 'AST_slope', 'FiO2_avg', 'FiO2_slope', 'Platelets_avg', 'Platelets_slope', 'SaO2_avg', 'SaO2_slope', 'Glucose_avg', 'Glucose_slope', 'ABPm_avg', 'ABPm_slope', 'Magnesium_avg', 'Magnesium_slope', 'Potassium_avg', 'Potassium_slope', 'ABPd_avg', 'ABPd_slope', 'Calcium_avg', 'Calcium_slope', 'Alkalinephos_avg', 'Alkalinephos_slope', 'SpO2_avg', 'SpO2_slope', 'Bilirubin_direct_avg', 'Bilirubin_direct_slope', 'Chloride_avg', 'Chloride_slope', 'Hct_avg', 'Hct_slope', 'Heartrate_avg', 'Heartrate_slope', 'Bilirubin_total_avg', 'Bilirubin_total_slope',

In [628]:
# split train data into train and validation
X_train, X_test, y_train, y_test = train_test_split(train_features_preprocessed[features_names_used], train_labels_3, train_size=0.8)

In [629]:
X_train

Unnamed: 0_level_0,Age_avg,EtCO2_avg,EtCO2_slope,PTT_avg,PTT_slope,BUN_avg,BUN_slope,Lactate_avg,Lactate_slope,Temp_avg,...,Heartrate_avg,Heartrate_slope,Bilirubin_total_avg,Bilirubin_total_slope,TroponinI_avg,TroponinI_slope,ABPs_avg,ABPs_slope,pH_avg,pH_slope
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
184,1.386525,-0.011287,-0.013764,-0.567679,0.078615,1.666110,0.068591,0.142325,0.037148,0.384797,...,1.611988,-0.028360,-2.049602,0.072162,0.023973,0.026896,-1.148028,1.109984,-0.120917,-0.001424
195,0.385648,-0.011287,-0.013764,-0.214869,0.078615,-0.530557,0.068591,-2.191159,0.037148,0.384797,...,-0.225920,-1.048501,0.006084,0.072162,0.023973,0.026896,-0.806546,-0.230425,3.014715,-1.274435
204,-0.968481,-0.011287,-0.013764,-0.201299,0.286713,-0.718843,0.068591,5.688931,-3.622008,0.384797,...,0.370666,1.081305,0.006084,0.072162,0.023973,0.026896,0.241755,0.577711,-3.278398,3.068827
101,-0.556355,-0.011287,-0.013764,0.012721,0.078615,-0.048602,0.068591,-2.008468,0.037148,0.233412,...,-0.496071,0.851383,0.006084,0.072162,0.023973,0.026896,-0.486583,0.156807,-0.332685,-0.001424
94,-0.556355,-0.011287,-0.013764,0.823207,0.078615,-0.048602,0.068591,0.142325,0.037148,-1.280439,...,0.325640,-0.570505,0.006084,0.072162,0.023973,0.026896,-1.075989,-0.087318,-0.120917,-0.001424
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23,1.151025,-0.011287,-0.013764,3.360725,0.078615,-0.405033,0.353838,-0.897095,-0.878377,0.092789,...,-0.133524,0.381764,0.006084,0.072162,0.023973,0.026896,0.027744,0.251025,0.126387,0.301202
230,-0.615230,-0.011287,-0.013764,0.012721,0.078615,-0.185366,-0.881166,-1.993243,0.037148,-0.170282,...,-0.066455,0.802978,0.006084,0.072162,0.023973,0.026896,1.309001,0.386541,-1.671646,-0.001424
114,-1.557232,-0.011287,-0.013764,0.012721,0.078615,-0.048602,0.068591,0.142325,0.037148,-0.447821,...,0.820918,-0.213520,0.006084,0.072162,0.023973,0.026896,0.683809,0.038954,-0.120917,-0.001424
161,0.267897,-0.011287,-0.013764,0.012721,0.078615,-0.048602,0.068591,0.142325,0.037148,-1.280439,...,-1.392356,-0.379963,0.006084,0.072162,-0.954127,0.662574,0.429102,-0.619694,-0.120917,-0.001424


In [630]:
y_train

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
184,0.0,20.6,62.6,96.0,76.6
195,1.0,19.7,60.6,98.8,88.5
204,1.0,17.1,89.7,95.4,86.5
101,0.0,17.8,73.9,97.6,92.7
94,0.0,13.8,75.5,95.9,96.4
...,...,...,...,...,...
23,1.0,15.0,62.7,97.9,67.8
230,1.0,28.1,101.1,93.2,89.2
114,0.0,18.6,93.3,99.1,92.1
161,0.0,18.0,81.5,92.2,60.0


In [631]:
## Regressor
#reg = svm.SVR()
#reg = svm.SVR(kernel='rbf')
#reg = svm.SVR(kernel='poly')
#reg = svm.SVR(kernel='sigmoid')

#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="distance")
#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="uniform")
# whether using distance or uniform weight does not matter much

#reg = Ridge(alpha = 1, fit_intercept=True)
reg = Lasso(alpha = 0.1, fit_intercept=True)

# Multi Layer Perceptron
# Use L2 penalty (alpha)
# reg = neural_network.MLPRegressor(hidden_layer_sizes=(100),
#                                   alpha=100,        # L2 regularization
#                                   activation="relu",
#                                   solver="adam",
#                                   learning_rate_init=0.01,
#                                   learning_rate="constant",
#                                   max_iter=500)

In [632]:
# Fit to the training data
# For 1 column for now
col = 'LABEL_RRate'
#col = 'LABEL_ABPm'
#col = 'LABEL_Heartrate'

print(datetime.datetime.now())
reg.fit(X_train, y_train[col])
print(datetime.datetime.now())

2021-04-21 11:12:14.603879
2021-04-21 11:12:14.622787


In [633]:
# Prediction to evaluate the model
y_pred = reg.predict(X_test)
print("R2 score: %.2f" %(metrics.r2_score(y_test[col], y_pred)))

R2 score: 0.30


### Summary of tests

#### Avg variables only

**LABEL_RRate**     
SVR        : R2 = 0.31     
SVR rbf    : R2 = 0.31     
SVR poly   : R2 = -0.32    
SVR sigmoid: R2 = -0.44     
knn 25     : R2 = 0.20     
Ridge 0.1  : R2 = 0.38      
Ridge 100  : R2 = 0.39    
Lasso 0.1  : R2 = 0.41
MLP   hidden_layer_sizes=(40, 40),   R2 = 0.41  (but fluctuates!)      
      alpha=10,      
      activation="relu",     
      solver="adam",    
      learning_rate_init=0.02,    
      learning_rate="constant",    
      max_iter=500    

MPL   hidden_layer_sizes=(100),    R2 = 0.39        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**     
SVR        : R2 = 0.46     
SVR rbf    : R2 = 0.46     
SVR poly   : R2 = 0.13     
SVR sigmoid: R2 = 0.56     
knn 25     : R2 = 0.45          
Ridge 0.1  : R2 = 0.59     
Ridge 100  : R2 = 0.59    
Lasso 0.1  : R2 = 0.60

MPL   hidden_layer_sizes=(100),    R2 = 0.62        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    
      
MPL   hidden_layer_sizes=(50, 50, 50, 50),   R2 = 0.60    
      alpha=300,    
      
MPL   hidden_layer_sizes=(200),   R2 = 0.60    
      alpha=200,    

MPL   hidden_layer_sizes=(100),    R2 = 0.62 (relu), 0.54 (tanh), 0.55 (logistic)    
      alpha=100,    
      activation=varied,    
      solver="adam",    
      learning_rate_init=0.01,    
      learning_rate="constant",    
      max_iter=500)   

   => Relu is the best


**LABEL_Heartrate**     
SVR        : R2 = 0.44     
SVR rbf    : R2 = 0.44    
SVR poly   : R2 = 0.19     
SVR sigmoid: R2 = 0.52     
knn 25     : R2 = 0.36     
Ridge 0.1  : R2 = 0.55    
Ridge 100  : R2 = 0.55    
Lasso 0.1  : R2 = 0.56

MPL   hidden_layer_sizes=(100),    R2 = 0.67     
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**Conclusions**
* SVR poly looks bad
* SVR sigmoid sometimes does not work well
* SVR: no difference by linear and gaussian kernel
* Ridge works quite well for a wide range of alpha values
* Lasso works slightly better than Ridge for small alpha (<1)
  => Some variables do not bring much information
  => More complex model with L1 regularisation?
* MLP hidden_leayers=(100), alpha=100, activation="relu", learning_rate=0.01
  gives promising results


#### Avg and n variables
n variable are not std scaled

**LABEL_RRate**          
Ridge 0.1  : R2 = 0.45      
Ridge 100  : R2 = 0.46   
Lasso 0.1  : R2 = 0.45    
MPL   hidden_layer_sizes=(100),    R2 = 0.40        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**              
Ridge 0.1  : R2 = 0.62    
Ridge 100  : R2 = 0.62     
Lasso 0.1  : R2 = 0.63

MPL   hidden_layer_sizes=(100),    R2 = 0.60         
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_Heartrate**        
Ridge 0.1  : R2 = 0.53    
Ridge 100  : R2 = 0.54    
Lasso 0.1  : R2 = 0.54

MPL   hidden_layer_sizes=(100),    R2 = 0.50     
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**Conclusions**
* Adding the number of measurements does not help for heartrate but seems to help a bit the others
* Will not use them 


#### Avg and slope variables

**LABEL_RRate**          
Ridge 0.1  : R2 =      
Ridge 100  : R2 =    
Lasso 0.1  : R2 =     
MPL   hidden_layer_sizes=(100),    R2 =         
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**              
Ridge 0.1  : R2 =     
Ridge 100  : R2 =      
Lasso 0.1  : R2 = 

MPL   hidden_layer_sizes=(100),    R2 =          
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_Heartrate**        
Ridge 0.1  : R2 =     
Ridge 100  : R2 =     
Lasso 0.1  : R2 = 

MPL   hidden_layer_sizes=(100),    R2 =      
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

In [94]:
#save model into file
filename = '3_'+col
pickle.dump(reg, open(filename, 'wb'))