# Intro ML - Project 2

## Load data

In [1]:
import numpy as np
import pandas as pd
train_data = pd.read_csv('train_features.csv')
labels = pd.read_csv('train_labels.csv')
test_data = pd.read_csv('test_features.csv')

## Data preprocessing

In [35]:
def calculate_time_features(data, n_samples):
    x = []
    features = [np.nanmedian, np.nanmean, np.nanvar, np.nanmin,
           np.nanmax]
    for index in range(int(data.shape[0] / n_samples)):
        assert data[n_samples * index, 0] == data[n_samples * (index + 1) - 1, 0], \
        'Ids are {}, {}'.format(data[n_samples * index, 0], data[n_samples * (index + 1) - 1, 0])
        patient_data = data[n_samples * index:n_samples * (index + 1), 2:]
        feature_values = np.empty((len(features), data[:, 2:].shape[1]))
        for i, feature in enumerate(features):
            feature_values[i] = feature(patient_data, axis=0)
        x.append(feature_values.ravel())
    return np.array(x)

In [36]:
x_train = calculate_time_features(train_data.to_numpy(), 12)
x_test = calculate_time_features(test_data.to_numpy(), 12)

  r = func(a, **kwargs)
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


## Learning Pipeline - Subtask 1

In [4]:
subtask1_labels_ids = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST',
         'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
y_train = labels[subtask1_labels_ids].to_numpy()

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# TODO (yarden):
# feature selection.
# parameters tuning (subsample, learning rate).
pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler(),
                    OneVsRestClassifier(
                        GradientBoostingClassifier(subsample=0.5)))
scores = cross_val_score(pipeline, x_train, y_train,
                            cv=5,
                            scoring='roc_auc',
                            verbose=True)
print("Cross-validation score is {score:.3f},"
      " standard deviation is {err:.3f}"
      .format(score = scores.mean(), err = scores.std()))

In [7]:
classifier = pipeline.fit(x_train, y_train)
predictions = pipeline.predict_proba(x_test)
df = pd.DataFrame({'pid': test_data.iloc[0::12, 0].values})
print("Training score:", metrics.roc_auc_score(y_train, classifier.predict_proba(x_train)))
for i, label in enumerate(subtask1_labels_ids):
    df[label] = predictions[:, i]

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 19.8min finished


## Learning Pipeline - Subtask 2

In [50]:
subtask2_labels_ids = ['LABEL_Sepsis']
y_train = labels[subtask2_labels_ids].to_numpy().ravel()

In [53]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# TODO (yarden):
# feature selection.
# parameters tuning (subsample, learning rate).
pipeline = make_pipeline(
                    SimpleImputer(strategy='median'),
                    StandardScaler(),
                    GradientBoostingClassifier(subsample=0.5))

scores = cross_val_score(pipeline, x_train, y_train,
                            cv=5,
                            scoring='roc_auc',
                            verbose=True)
print("Cross-validation score is {score:.3f},"
      " standard deviation is {err:.3f}"
      .format(score = scores.mean(), err = scores.std()))

In [75]:
pipeline = pipeline.fit(x_train, y_train)
predictions = pipeline.predict_proba(x_test)[:, 1]
print("Training score:", metrics.roc_auc_score(y_train, classifier.predict_proba(x_train)[:, 1]))
df[subtask2_labels_ids] = predictions

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.7min finished


Training score: 0.8053847041413316


## Learning Pipeline - Subtask 3

In [125]:
subtask3_labels_ids = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2',
                      'LABEL_Heartrate']
y_train = labels[subtask3_labels_ids].to_numpy()

In [185]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

for i, label in enumerate(subtask3_labels_ids):
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        HistGradientBoostingRegressor(max_depth=3))
    scores = cross_val_score(pipeline, x_train, y_train[:, i],
                            cv=5,
                            scoring='r2',
                            verbose=True)
    print("Cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(score = scores.mean(), err = scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.416, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.616, standard deviation is 0.015


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.380, standard deviation is 0.022
Cross-validation score is 0.637, standard deviation is 0.012


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.9s finished


In [156]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
for i, label in enumerate(subtask3_labels_ids):
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        StandardScaler(),
                        HistGradientBoostingRegressor(max_depth=3))
    pipeline = pipeline.fit(x_train, y_train[:, i])
    predictions = pipeline.predict(x_test)
    print("Training score:", metrics.r2_score(y_train[:, i], pipeline.predict(x_train)))
    df[label] = predictions

Training score: 0.46657876158710565
Training score: 0.6432809148552587
Training score: 0.4709852855526181
Training score: 0.6631141320848979


## Save predictions

In [179]:
df.drop('LABEL_Heartrate', axis=1)
df

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate,L,A,B,E
0,0,0.962973,0.333533,0.782626,0.801037,0.760807,0.445815,0.012547,0.400925,0.042028,0.009267,0.283366,14.009516,83.216702,98.915619,83.816023,26.372804,66.882162,102.193637,84.840032
12,10001,0.045314,0.032996,0.317925,0.318369,0.312155,0.081545,0.065000,0.082886,0.022394,0.021282,0.025460,17.535887,87.348222,95.079255,99.668985,15.636119,89.755064,95.830908,98.017504
24,10003,0.022639,0.026904,0.148735,0.152050,0.141291,0.161416,0.039760,0.325588,0.022446,0.015157,0.025350,18.546690,79.945667,97.843944,90.139501,21.447622,78.951863,98.414723,92.734244
36,10004,0.037327,0.027652,0.308094,0.285601,0.328222,0.074586,0.044575,0.095203,0.019849,0.028308,0.022223,16.278451,75.681581,95.913104,87.659049,17.288829,79.939742,97.241468,99.032436
48,10005,0.117239,0.048982,0.146678,0.125906,0.145645,0.110497,0.011876,0.107843,0.015932,0.008657,0.030735,19.289418,74.972701,95.888367,63.016676,18.286085,70.991699,96.546231,54.542910
60,10008,0.781321,0.036748,0.440223,0.556089,0.461320,0.522722,0.009247,0.120767,0.017674,0.021439,0.222734,18.883949,93.896684,97.148402,74.347134,16.716963,96.929410,99.285885,71.825842
72,10011,0.026880,0.033585,0.159286,0.163747,0.158038,0.056038,0.043510,0.069884,0.022275,0.041177,0.019769,16.506569,98.192200,98.201490,70.227373,19.422448,107.461514,99.109750,76.424986
84,10017,0.048762,0.029055,0.250034,0.240725,0.234834,0.101754,0.054999,0.079704,0.012970,0.045423,0.034797,20.595653,109.072113,97.910278,102.273785,21.453114,115.342562,99.122722,103.478408
96,10018,0.086858,0.036231,0.353449,0.324453,0.339203,0.129509,0.057995,0.452446,0.023406,0.278732,0.067823,18.473747,80.290088,95.076993,100.020172,18.215028,96.974646,94.900658,92.581513
108,10019,0.633577,0.034502,0.101202,0.107160,0.109782,0.150168,0.015772,0.753782,0.014117,0.006868,0.033474,17.526669,81.302114,95.926918,73.343664,18.010039,75.718121,96.588995,80.216187


In [171]:
df.to_csv('prediction.csv', index=False, float_format='%.4f')

          pid  LABEL_BaseExcess  LABEL_Fibrinogen  LABEL_AST  \
0           0          0.962973          0.333533   0.782626   
12      10001          0.045314          0.032996   0.317925   
24      10003          0.022639          0.026904   0.148735   
36      10004          0.037327          0.027652   0.308094   
48      10005          0.117239          0.048982   0.146678   
60      10008          0.781321          0.036748   0.440223   
72      10011          0.026880          0.033585   0.159286   
84      10017          0.048762          0.029055   0.250034   
96      10018          0.086858          0.036231   0.353449   
108     10019          0.633577          0.034502   0.101202   
120     10025          0.046448          0.033112   0.324334   
132     10028          0.063397          0.038571   0.656429   
144      1003          0.953728          0.112379   0.341064   
156     10032          0.068427          0.066252   0.730127   
168     10033          0.046114         

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate,L,A,B,E
0,0,0.962973,0.333533,0.782626,0.801037,0.760807,0.445815,0.012547,0.400925,0.042028,0.009267,0.283366,14.009516,83.216702,98.915619,83.816023,26.372804,66.882162,102.193637,84.840032
12,10001,0.045314,0.032996,0.317925,0.318369,0.312155,0.081545,0.065000,0.082886,0.022394,0.021282,0.025460,17.535887,87.348222,95.079255,99.668985,15.636119,89.755064,95.830908,98.017504
24,10003,0.022639,0.026904,0.148735,0.152050,0.141291,0.161416,0.039760,0.325588,0.022446,0.015157,0.025350,18.546690,79.945667,97.843944,90.139501,21.447622,78.951863,98.414723,92.734244
36,10004,0.037327,0.027652,0.308094,0.285601,0.328222,0.074586,0.044575,0.095203,0.019849,0.028308,0.022223,16.278451,75.681581,95.913104,87.659049,17.288829,79.939742,97.241468,99.032436
48,10005,0.117239,0.048982,0.146678,0.125906,0.145645,0.110497,0.011876,0.107843,0.015932,0.008657,0.030735,19.289418,74.972701,95.888367,63.016676,18.286085,70.991699,96.546231,54.542910
60,10008,0.781321,0.036748,0.440223,0.556089,0.461320,0.522722,0.009247,0.120767,0.017674,0.021439,0.222734,18.883949,93.896684,97.148402,74.347134,16.716963,96.929410,99.285885,71.825842
72,10011,0.026880,0.033585,0.159286,0.163747,0.158038,0.056038,0.043510,0.069884,0.022275,0.041177,0.019769,16.506569,98.192200,98.201490,70.227373,19.422448,107.461514,99.109750,76.424986
84,10017,0.048762,0.029055,0.250034,0.240725,0.234834,0.101754,0.054999,0.079704,0.012970,0.045423,0.034797,20.595653,109.072113,97.910278,102.273785,21.453114,115.342562,99.122722,103.478408
96,10018,0.086858,0.036231,0.353449,0.324453,0.339203,0.129509,0.057995,0.452446,0.023406,0.278732,0.067823,18.473747,80.290088,95.076993,100.020172,18.215028,96.974646,94.900658,92.581513
108,10019,0.633577,0.034502,0.101202,0.107160,0.109782,0.150168,0.015772,0.753782,0.014117,0.006868,0.033474,17.526669,81.302114,95.926918,73.343664,18.010039,75.718121,96.588995,80.216187
