In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, metrics
from sklearn import svm, neighbors, neural_network
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import datetime
import pickle

In [607]:
## Read training data, labels, test data
train_features_original = pd.read_csv('dataset/train_features.csv', delimiter=',')
train_labels_original   = pd.read_csv('dataset/train_labels.csv'  , delimiter=',')
test_features_original  = pd.read_csv('dataset/test_features.csv' , delimiter=',')

In [608]:
# sort train data and labels values by pid, so they are the same (because after with the manipulation of data things get nasty)
train_features_original = train_features_original.sort_values(['pid','Time'])
train_labels_original   = train_labels_original.sort_values(['pid'])
test_features_original  = test_features_original.sort_values(['pid','Time'])

# Use pid as index
train_labels_original.set_index("pid", inplace=True)

In [609]:
train_features_original

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1,3,34.0,,,12.0,,36.0,8.7,24.0,...,,100.0,,114.0,24.6,94.0,,,142.0,7.33
1,1,4,34.0,,,,,36.0,,,...,,100.0,,,,99.0,,,125.0,7.33
2,1,5,34.0,,,,,36.0,,,...,,100.0,,,,92.0,,,110.0,7.37
3,1,6,34.0,,,,,37.0,,,...,,100.0,,,,88.0,,,104.0,7.37
4,1,7,34.0,,,,,,,,...,,100.0,,,22.4,81.0,,,100.0,7.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172879,31658,8,60.0,,,,,37.0,,,...,,96.0,,,,71.0,,,127.0,
172880,31658,9,60.0,,,,,,,,...,,,,,,,,,,
172881,31658,10,60.0,,,,,,,,...,,,,,,,,,,
172882,31658,11,60.0,,,,,,,,...,,96.0,,,,71.0,,,135.0,


In [610]:
train_labels_original

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,20.4,99.1,95.4,65.8
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.8,78.8,97.4,71.8
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.9,75.1,97.3,80.7
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.7,112.8,97.0,92.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.5,97.0,95.3,101.4
31654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.2,119.2,97.6,91.8
31656,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,21.0,93.8,99.2,92.2
31657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,72.5,98.7,64.0


# Some preprocessing

If the series of 12 measurements has at least 2 non NaN:
* replace the series of 12 values by their average and the slope of a 1D fit

If the series of 12 measurements has exactly 1 non NaN:
* replace the series of 12 values by their average
* a value that should be imputed, see (1)

If the series of 12 measurements has only NaNs:
* a value that should be imputed, see (2)
* a value that should be imputed, see (1)

(1) Replace the value to be imputed by:
* either the average of the slopes that can be computed
* or 0.
* or -9999.

(2) Replace the value to be imputed by:
* either the average of the average that can be computed
* or 0.
* or -9999.

In [611]:
# Make list of PIDs
pids_original = train_features_original["pid"].drop_duplicates().to_list()
Npatients_original = len(pids_original)
print("Number of patients: %d" %Npatients_original)
print("Dataset length: %d" %(len(train_features_original)))

Number of patients: 18995
Dataset length: 227940


In [7]:
# Reducing training for speeding up tests
Npatients_lite = Npatients_original//100
pids_lite = pids_original[:Npatients_lite]

train_features_lite = train_features_original[train_features_original["pid"].isin(pids_lite)]
print("Lite number of patients: %d" %Npatients_lite)
print("Lite dataset length: %d" %(len(train_features_lite)))

train_labels_lite = train_labels_original[train_labels_original.index.isin(pids_lite)]

Lite number of patients: 189
Lite dataset length: 2268


In [8]:
# Decide here if to use the lite dataset or the whole dataset

train_features = train_features_lite.copy()
train_labels   = train_labels_lite.copy()
pids = pids_lite.copy()
Npatients = Npatients_lite

# train_features = train_features_original.copy()
# train_labels   = train_labels_original.copy()
# pids = pids_original.copy()
# Npatients = Npatients_original

In [9]:
# Make list of feature names
feature_names = [ x for x in train_features.columns  if x not in ("pid", "Time") ]
print("Features:")
print(feature_names)

Features:
['Age', 'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess', 'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST', 'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium', 'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride', 'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH']


In [22]:
# Replace the series of 12 measurements by their average and slope
print(datetime.datetime.now())

def make_linear_fit(x, y):
    x = x.where(y.notna()).dropna()
    y = y.dropna()
    if len(x) < 2:
        return np.nan
    else:
        return np.polyfit(x, y, 1)[0]
    
    
counts = train_features[["pid"]+feature_names].groupby(["pid"]).count().add_suffix("_n")
avgs = train_features[["pid"]+feature_names].groupby(["pid"]).mean().add_suffix("_avg")
#train_features_preprocessed = counts.copy()
train_features_preprocessed = pd.concat([counts, avgs], axis=1)
train_features_preprocessed["pid"] = train_features["pid"].copy()

train_features_preprocessed["time_list"] = train_features.groupby(["pid"]).Time.apply(list)
for feature_name in feature_names:
    if feature_name == "Age":
        train_features_preprocessed[feature_name + "_slope"] = Npatients * [0.]
    else:
        train_features_preprocessed[feature_name + "_list"] = train_features.groupby(["pid"])[feature_name].apply(list)
        train_features_preprocessed[feature_name + "_slope"] = train_features_preprocessed.apply(lambda row: make_linear_fit(pd.Series(row["time_list"]), pd.Series(row[feature_name + "_list"])), axis=1)
        # Delete the _list columns
        train_features_preprocessed.drop(feature_name + "_list", axis=1, inplace=True)

    print("%s finished at %s" %(feature_name, datetime.datetime.now()))
    
# Delete the time_list columns
train_features_preprocessed.drop("time_list", axis=1, inplace=True)
    
print(datetime.datetime.now())

Age finished at 2021-04-21 17:00:57.972960
EtCO2 finished at 2021-04-21 17:00:59.346779
PTT finished at 2021-04-21 17:01:00.819938
BUN finished at 2021-04-21 17:01:02.368881
Lactate finished at 2021-04-21 17:01:03.654477
Temp finished at 2021-04-21 17:01:05.026413
Hgb finished at 2021-04-21 17:01:06.343960
HCO3 finished at 2021-04-21 17:01:07.712692
BaseExcess finished at 2021-04-21 17:01:09.073317
RRate finished at 2021-04-21 17:01:10.480220
Fibrinogen finished at 2021-04-21 17:01:11.796867
Phosphate finished at 2021-04-21 17:01:13.195730
WBC finished at 2021-04-21 17:01:14.788776
Creatinine finished at 2021-04-21 17:01:16.382355
PaCO2 finished at 2021-04-21 17:01:17.864812
AST finished at 2021-04-21 17:01:19.262022
FiO2 finished at 2021-04-21 17:01:20.672075
Platelets finished at 2021-04-21 17:01:22.082065
SaO2 finished at 2021-04-21 17:01:23.557362
Glucose finished at 2021-04-21 17:01:24.903068
ABPm finished at 2021-04-21 17:01:26.217420
Magnesium finished at 2021-04-21 17:01:27.497

In [23]:
train_features_preprocessed

Unnamed: 0_level_0,Age_n,EtCO2_n,PTT_n,BUN_n,Lactate_n,Temp_n,Hgb_n,HCO3_n,BaseExcess_n,RRate_n,...,Alkalinephos_slope,SpO2_slope,Bilirubin_direct_slope,Chloride_slope,Hct_slope,Heartrate_slope,Bilirubin_total_slope,TroponinI_slope,ABPs_slope,pH_slope
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,12,0,0,3,0,8,3,3,6,12,...,,-8.943129e-16,,-3.493151e-01,-0.101480,-3.381119e+00,,,-1.083916,0.005503
2,12,0,1,1,0,3,1,0,0,11,...,,9.090909e-03,,,,5.578248e-16,,,-0.700000,
4,12,0,1,2,0,3,2,0,0,11,...,,-1.545455e-01,,,-0.442857,-3.272727e-01,,,-8.345455,
6,12,0,1,2,2,12,6,2,7,12,...,,-2.377622e-01,,-1.000000e+00,-0.168780,8.111888e-01,,,0.230769,0.002664
8,12,0,0,1,0,3,0,0,0,11,...,,1.458333e-01,,,,1.700000e+00,,,-1.527273,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,12,0,1,1,1,3,1,1,2,12,...,,3.496503e-02,,,0.900000,-2.846154e+00,,,0.118881,0.010000
290,12,0,0,1,0,10,1,0,0,10,...,,-1.264749e-15,,,,7.333333e-01,,,1.854545,
292,12,0,1,1,1,3,0,1,1,11,...,,1.785252e-01,,,,-2.328590e-01,,,-0.858344,
294,12,0,1,2,0,2,1,2,0,11,...,,3.636364e-01,,-4.000000e-01,0.120000,6.272727e-01,,,0.636364,


In [24]:
# Replace NaNs of a column by the average of the column
feature_averages = {}
for feature_name in feature_names:
    avgs = train_features_preprocessed[feature_name + "_avg"].replace(np.nan, 0)
    slopes = train_features_preprocessed[feature_name + "_slope"].replace(np.nan, 0)
    
    avg_avg = np.average(avgs, weights=train_features_preprocessed[feature_name + "_n"])
    slope_avg = np.average(slopes, weights=train_features_preprocessed[feature_name + "_n"])

    train_features_preprocessed[feature_name + "_avg"].replace(np.nan, avg_avg, inplace=True)
    train_features_preprocessed[feature_name + "_slope"].replace(np.nan, slope_avg, inplace=True)

In [25]:
# Make features to use in training
def std_scaler(array):
    mean = np.mean(array)
    std = np.std(array, ddof=1)
    if std != 0:
        return (array-mean)/std
    else:
        return array

# Add features
for feature_name in feature_names:
    # Std scaling
    train_features_preprocessed[feature_name + "_avg"] = std_scaler(train_features_preprocessed[feature_name + "_avg"])
    train_features_preprocessed[feature_name + "_slope"] = std_scaler(train_features_preprocessed[feature_name + "_slope"])
    
print(train_features_preprocessed.head())
print(len(train_features_preprocessed))

     Age_n  EtCO2_n  PTT_n  BUN_n  Lactate_n  Temp_n  Hgb_n  HCO3_n  \
pid                                                                   
1       12        0      0      3          0       8      3       3   
2       12        0      1      1          0       3      1       0   
4       12        0      1      2          0       3      2       0   
6       12        0      1      2          2      12      6       2   
8       12        0      0      1          0       3      0       0   

     BaseExcess_n  RRate_n  ...  Alkalinephos_slope  SpO2_slope  \
pid                         ...                                   
1               6       12  ...            0.100128    0.265253   
2               0       11  ...            0.100128    0.302165   
4               0       11  ...            0.100128   -0.362259   
6               7       12  ...            0.100128   -0.700151   
8               0       11  ...            0.100128    0.857391   

     Bilirubin_direct_slope  Chl

# Subtask 3

## Training

In [31]:
# Loading reprocessed dataset if starting from here
with open("subtask3_fulldata.pkl", "rb") as f:
    train_features_preprocessed = pickle.load(f)

train_features_preprocessed

KeyboardInterrupt: 

In [16]:
# Make target variables dataframe
col_numbers = [0, 11, 12, 13, 14]
train_labels_3 = train_labels.iloc[:, col_numbers]

# Restrict to pid existing in features dataset
#train_labels_3 = train_labels_3[train_labels_3.index.isin(pids)]

print(train_labels_3.head())
print(len(train_labels_3))

     LABEL_BaseExcess  LABEL_RRate  LABEL_ABPm  LABEL_SpO2  LABEL_Heartrate
pid                                                                        
1                 1.0         12.1        85.4       100.0             59.9
2                 0.0         20.4        99.1        95.4             65.8
4                 0.0         17.8        78.8        97.4             71.8
6                 1.0         17.9        75.1        97.3             80.7
8                 0.0         18.7       112.8        97.0             92.6
189


In [245]:
# Select a subset of training features
#features_names_used = [ x for x in train_features_preprocessed.columns if x.endswith("_avg") ]
features_names_used = [ x for x in train_features_preprocessed.columns if (x.endswith("_avg") or x.endswith("_slope")) ]
#features_names_used = train_features_preprocessed.columns

print("Used features:")
print(features_names_used)
print("Number of used features: %d" %(len(features_names_used)))

Used features:
['Age_avg', 'EtCO2_avg', 'EtCO2_slope', 'PTT_avg', 'PTT_slope', 'BUN_avg', 'BUN_slope', 'Lactate_avg', 'Lactate_slope', 'Temp_avg', 'Temp_slope', 'Hgb_avg', 'Hgb_slope', 'HCO3_avg', 'HCO3_slope', 'BaseExcess_avg', 'BaseExcess_slope', 'RRate_avg', 'RRate_slope', 'Fibrinogen_avg', 'Fibrinogen_slope', 'Phosphate_avg', 'Phosphate_slope', 'WBC_avg', 'WBC_slope', 'Creatinine_avg', 'Creatinine_slope', 'PaCO2_avg', 'PaCO2_slope', 'AST_avg', 'AST_slope', 'FiO2_avg', 'FiO2_slope', 'Platelets_avg', 'Platelets_slope', 'SaO2_avg', 'SaO2_slope', 'Glucose_avg', 'Glucose_slope', 'ABPm_avg', 'ABPm_slope', 'Magnesium_avg', 'Magnesium_slope', 'Potassium_avg', 'Potassium_slope', 'ABPd_avg', 'ABPd_slope', 'Calcium_avg', 'Calcium_slope', 'Alkalinephos_avg', 'Alkalinephos_slope', 'SpO2_avg', 'SpO2_slope', 'Bilirubin_direct_avg', 'Bilirubin_direct_slope', 'Chloride_avg', 'Chloride_slope', 'Hct_avg', 'Hct_slope', 'Heartrate_avg', 'Heartrate_slope', 'Bilirubin_total_avg', 'Bilirubin_total_slope',

In [246]:
# split train data into train and validation
random_state = 2
X_train, X_test, y_train, y_test = train_test_split(train_features_preprocessed[features_names_used], train_labels_3, train_size=0.8, random_state=random_state)

In [247]:
X_train

Unnamed: 0_level_0,Age_avg,EtCO2_avg,EtCO2_slope,PTT_avg,PTT_slope,BUN_avg,BUN_slope,Lactate_avg,Lactate_slope,Temp_avg,...,Heartrate_avg,Heartrate_slope,Bilirubin_total_avg,Bilirubin_total_slope,TroponinI_avg,TroponinI_slope,ABPs_avg,ABPs_slope,pH_avg,pH_slope
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3068,0.979888,-0.012238,0.008637,0.041580,0.039039,0.023383,-0.000523,-1.015486,-0.884216,0.099026,...,0.122322,2.494821,0.027601,0.00382,0.007644,-0.003337,-0.102120,0.991473,0.352893,-2.643192
1607,2.234688,-0.012238,0.008637,-0.494195,0.039039,-0.361247,-0.000523,0.145337,0.017309,0.328697,...,-0.667591,-0.381626,-0.401646,0.00382,0.764991,-0.003337,1.311961,-0.105834,-0.072687,-0.014839
2874,0.023851,-0.012238,0.008637,-0.698719,0.039039,-0.518348,-0.601292,0.145337,0.017309,0.328697,...,-0.329777,-0.149186,0.027601,0.00382,0.007644,-0.003337,0.433617,1.072792,-0.072687,-0.014839
2551,0.860384,-0.012238,0.008637,-0.430280,0.039039,-0.549768,-0.473077,0.145337,0.017309,0.711189,...,-0.325856,1.350053,0.027601,0.00382,-0.609655,-0.271253,1.013011,-0.316388,-0.072687,-0.014839
1570,-0.394416,-0.012238,0.008637,0.041580,0.039039,-0.486928,0.039784,0.145337,0.017309,-0.895278,...,0.088242,-0.450911,-0.268285,0.00382,0.007644,-0.003337,0.839019,-0.223258,-0.593388,-0.416077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,-2.485748,-0.012238,0.008637,0.041580,0.039039,-0.088939,-3.165597,0.145337,0.017309,0.099026,...,0.175543,-1.410585,0.027601,0.00382,0.007644,-0.003337,0.137251,-0.021619,-0.072687,-0.014839
2684,-0.155406,-0.012238,0.008637,0.041580,0.039039,0.392836,-0.000523,0.145337,0.017309,-1.201271,...,0.284786,-1.311387,-0.401646,0.00382,0.007644,-0.003337,-1.407061,-0.328535,-0.072687,-0.014839
779,0.322612,-0.012238,0.008637,-0.328018,0.039039,0.023383,-0.000523,0.145337,0.017309,-0.088567,...,-0.697470,-0.113426,0.027601,0.00382,0.007644,-0.003337,-0.522469,-0.388649,-0.398027,0.313014
840,-0.573673,-0.012238,0.008637,0.041580,0.039039,-0.591662,-0.415174,0.145337,0.017309,-0.436287,...,-0.332859,0.894560,-0.468326,0.00382,0.007644,-0.003337,-0.681039,0.059018,-0.349187,-0.416077


In [248]:
y_train

Unnamed: 0_level_0,LABEL_BaseExcess,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3068,1.0,19.1,65.5,97.6,80.8
1607,0.0,18.2,86.9,98.4,80.7
2874,0.0,18.5,85.8,96.7,78.8
2551,0.0,20.5,85.8,98.1,76.8
1570,1.0,20.2,97.8,93.9,89.5
...,...,...,...,...,...
2610,0.0,18.3,77.7,99.5,76.7
2684,0.0,17.1,70.6,94.5,79.9
779,1.0,19.6,68.1,96.2,68.9
840,1.0,14.8,101.1,100.0,88.1


In [252]:
## Regressor
#reg = svm.SVR()
#reg = svm.SVR(kernel='rbf')
#reg = svm.SVR(kernel='poly')
#reg = svm.SVR(kernel='sigmoid')

#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="distance")
#reg = neighbors.KNeighborsRegressor(n_neighbors=25, weights="uniform")
# whether using distance or uniform weight does not matter much

#reg = Ridge(alpha = 100, fit_intercept=True)
#reg = Lasso(alpha = 0.1, fit_intercept=True)

# Multi Layer Perceptron
# Use L2 penalty (alpha)
reg = neural_network.MLPRegressor(hidden_layer_sizes=(100),
                                  alpha=100,        # L2 regularization
                                  activation="relu",
                                  solver="adam",
                                  learning_rate_init=0.01,
                                  learning_rate="constant",
                                  max_iter=500)

In [253]:
# Fit to the training data
# For 1 column for now
#col = 'LABEL_RRate'
#col = 'LABEL_ABPm'
col = 'LABEL_Heartrate'

print(datetime.datetime.now())
reg.fit(X_train, y_train[col])
print(datetime.datetime.now())

2021-04-21 12:22:31.126472
2021-04-21 12:22:34.576167


In [254]:
# Prediction to evaluate the model
y_pred = reg.predict(X_test)
print("R2 score: %.2f" %(metrics.r2_score(y_test[col], y_pred)))

R2 score: 0.56


### Summary of tests

#### Avg variables only

**LABEL_RRate**     
SVR        : R2 = 0.31     
SVR rbf    : R2 = 0.31     
SVR poly   : R2 = -0.32    
SVR sigmoid: R2 = -0.44     
knn 25     : R2 = 0.20     
Ridge 0.1  : R2 = 0.38      
Ridge 100  : R2 = 0.39    
Lasso 0.1  : R2 = 0.41
MLP   hidden_layer_sizes=(40, 40),   R2 = 0.41  (but fluctuates!)      
      alpha=10,      
      activation="relu",     
      solver="adam",    
      learning_rate_init=0.02,    
      learning_rate="constant",    
      max_iter=500    

MPL   hidden_layer_sizes=(100),    R2 = 0.39        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**     
SVR        : R2 = 0.46     
SVR rbf    : R2 = 0.46     
SVR poly   : R2 = 0.13     
SVR sigmoid: R2 = 0.56     
knn 25     : R2 = 0.45          
Ridge 0.1  : R2 = 0.59     
Ridge 100  : R2 = 0.59    
Lasso 0.1  : R2 = 0.60

MPL   hidden_layer_sizes=(100),    R2 = 0.62        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    
      
MPL   hidden_layer_sizes=(50, 50, 50, 50),   R2 = 0.60    
      alpha=300,    
      
MPL   hidden_layer_sizes=(200),   R2 = 0.60    
      alpha=200,    

MPL   hidden_layer_sizes=(100),    R2 = 0.62 (relu), 0.54 (tanh), 0.55 (logistic)    
      alpha=100,    
      activation=varied,    
      solver="adam",    
      learning_rate_init=0.01,    
      learning_rate="constant",    
      max_iter=500)   

   => Relu is the best


**LABEL_Heartrate**     
SVR        : R2 = 0.44     
SVR rbf    : R2 = 0.44    
SVR poly   : R2 = 0.19     
SVR sigmoid: R2 = 0.52     
knn 25     : R2 = 0.36     
Ridge 0.1  : R2 = 0.55    
Ridge 100  : R2 = 0.55    
Lasso 0.1  : R2 = 0.56

MPL   hidden_layer_sizes=(100),    R2 = 0.67     
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**Conclusions**
* SVR poly looks bad
* SVR sigmoid sometimes does not work well
* SVR: no difference by linear and gaussian kernel
* Ridge works quite well for a wide range of alpha values
* Lasso works slightly better than Ridge for small alpha (<1)
  => Some variables do not bring much information
  => More complex model with L1 regularisation?
* MLP hidden_leayers=(100), alpha=100, activation="relu", learning_rate=0.01
  gives promising results


#### Avg and n variables
n variable are not std scaled

**LABEL_RRate**     
Ridge   1  : R2 = 0.45      
Ridge 100  : R2 = 0.46   
Lasso 0.1  : R2 = 0.45    
MPL   hidden_layer_sizes=(100),    R2 = 0.40        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**              
Ridge   1  : R2 = 0.62    
Ridge 100  : R2 = 0.62     
Lasso 0.1  : R2 = 0.63

MPL   hidden_layer_sizes=(100),    R2 = 0.60         
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_Heartrate**        
Ridge   1  : R2 = 0.53    
Ridge 100  : R2 = 0.54    
Lasso 0.1  : R2 = 0.54

MPL   hidden_layer_sizes=(100),    R2 = 0.50     
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**Conclusions**
* Adding the number of measurements does not help for heartrate but seems to help a bit the others
* Will not use them 


#### Avg and slope variables

**LABEL_RRate**     
SVR rbf    : R2 = 0.36
knn 25     : R2 = 0.17 
Ridge   1  : R2 = 0.36     
Ridge 100  : R2 = 0.38   
Lasso 0.1  : R2 = 0.38    
MPL   hidden_layer_sizes=(100),    R2 = 0.39        
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_ABPm**     
SVR rbf    : R2 = 0.47    
knn 25     : R2 = 0.48
Ridge   1  : R2 = 0.58    
Ridge 100  : R2 = 0.59     
Lasso 0.1  : R2 = 0.60

MPL   hidden_layer_sizes=(100),    R2 = 0.55         
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

**LABEL_Heartrate**        
SVR rbf    : R2 = 0.41
knn 25     : R2 = 0.30
Ridge 1    : R2 = 0.61    
Ridge 100  : R2 = 0.62    
Lasso 0.1  : R2 = 0.63

MPL   hidden_layer_sizes=(100),    R2 = 0.60     
      alpha=100,    
      activation="relu",    
      solver="adam",     
      learning_rate_init=0.01,    
      learning_rate="constant",     
      max_iter=500)    

In [24]:
#save model into file
filename = '3_'+col
pickle.dump(reg, open(filename, 'wb'))