# Intro ML - Project 2

## Load data

In [1]:
import numpy as np
import pandas as pd
# Read the three project CSV files from the working directory.
# train_data / test_data hold the raw feature rows, labels holds the targets.
train_data, labels, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv',
                     'train_labels.csv',
                     'test_features.csv')
)

## Data preprocessing

In [2]:
def calculate_time_features(data, n_samples):
    """Collapse each patient's block of rows into one vector of summary statistics.

    ``data`` is read as consecutive blocks of ``n_samples`` rows, one block
    per patient. Column 0 is the patient id, column 1 is ignored (presumably
    the time stamp -- confirm against the CSV schema) and every remaining
    column is treated as a measurement. For each measurement column the
    NaN-aware median, mean, variance, minimum and maximum over the block are
    computed.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with ``n_patients * n_samples`` rows.
    n_samples : int
        Number of consecutive rows that belong to one patient.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_patients, 5 * n_measurements)``. Within a row the
        layout is statistic-major: all medians, then all means, variances,
        minima and maxima. An entry is NaN when the patient has no observed
        value for that measurement.

    Raises
    ------
    ValueError
        If ``n_samples`` is not positive or the number of rows is not a
        multiple of ``n_samples`` (previously the trailing rows were silently
        dropped).
    AssertionError
        If the first and last row of a block carry different patient ids.
    """
    import warnings

    features = (np.nanmedian, np.nanmean, np.nanvar, np.nanmin, np.nanmax)

    n_rows = data.shape[0]
    if n_samples < 1 or n_rows % n_samples != 0:
        raise ValueError(
            'Expected a positive n_samples dividing the number of rows, '
            'got n_samples={} for {} rows'.format(n_samples, n_rows))

    n_patients = n_rows // n_samples
    n_measurements = data[:, 2:].shape[1]
    # Pre-allocating keeps the result 2-D even when there are no patients.
    x = np.empty((n_patients, len(features) * n_measurements))

    with warnings.catch_warnings():
        # A measurement that was never taken for a patient is an all-NaN
        # column; the nan* reducers then return NaN and emit a RuntimeWarning
        # per call. NaN is the intended result here, so only the noise is
        # silenced.
        warnings.simplefilter('ignore', category=RuntimeWarning)
        for index in range(n_patients):
            start = n_samples * index
            stop = start + n_samples
            first_id, last_id = data[start, 0], data[stop - 1, 0]
            assert first_id == last_id, \
                'Ids are {}, {}'.format(first_id, last_id)
            patient_data = data[start:stop, 2:]
            x[index] = np.concatenate(
                [feature(patient_data, axis=0) for feature in features])
    return x

In [3]:
# Number of consecutive rows each patient contributes in the feature CSVs.
SAMPLES_PER_PATIENT = 12

x_train = calculate_time_features(train_data.to_numpy(), SAMPLES_PER_PATIENT)
x_test = calculate_time_features(test_data.to_numpy(), SAMPLES_PER_PATIENT)

# The models below are fitted on x_train together with columns of `labels`,
# so both must have the same number of rows.
assert x_train.shape[0] == len(labels), \
    'Got {} feature rows for {} label rows'.format(x_train.shape[0], len(labels))

  r = func(a, **kwargs)
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


## Learning Pipeline - Subtask 1

In [4]:
# Label columns for subtask 1; each one is fitted as its own classification
# target further down.
subtask1_labels_ids = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]
# 2-D target matrix: one column per label, in the order listed above.
y_train = labels.loc[:, subtask1_labels_ids].to_numpy()

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# TODO (yarden):
# feature selection.
# parameter tuning (subsample, learning rate).

# cross_val_score clones the estimator for every fold, so one unfitted
# pipeline can be shared by all labels instead of being rebuilt each time.
pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    HistGradientBoostingClassifier())

# 5-fold cross-validated ROC AUC, one independent binary problem per label.
for i, label in enumerate(subtask1_labels_ids):
    scores = cross_val_score(pipeline, x_train, y_train[:, i],
                             cv=5,
                             scoring='roc_auc',
                             verbose=True)
    # Name the label in the report: the progress messages are interleaved
    # with these lines, so anonymous scores cannot be attributed reliably.
    print("{label}: cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(label=label, score=scores.mean(), err=scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   18.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.928, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   16.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.797, standard deviation is 0.013


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.741, standard deviation is 0.003


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.746, standard deviation is 0.006


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.742, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.805, standard deviation is 0.007


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.893, standard deviation is 0.002


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.830, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.738, standard deviation is 0.013
Cross-validation score is 0.931, standard deviation is 0.005


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.6s finished


In [6]:
# Output frame: one row per test patient. Column 0 of test_data is taken as
# the patient id, sampled once per block of 12 rows.
df = pd.DataFrame({'pid': test_data.iloc[0::12, 0].values})

for label in subtask1_labels_ids:
    # Read the target straight from `labels`: `y_train` is rebound to other
    # targets by later cells, which would break this cell when it is re-run.
    target = labels[label].to_numpy()
    # Build the estimator here instead of reusing whatever `pipeline` object
    # an earlier cell happened to leave in the kernel.
    pipeline = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        HistGradientBoostingClassifier())
    pipeline = pipeline.fit(x_train, target)
    train_auc = metrics.roc_auc_score(target, pipeline.predict_proba(x_train)[:, 1])
    print("Training score ({}):".format(label), train_auc)
    # Probability of the positive class for every test patient.
    predictions = pipeline.predict_proba(x_test)[:, 1]
    df[label] = predictions

Training score: 0.9774876511563632
Training score: 0.9877754029147892
Training score: 0.913489343171739
Training score: 0.9143622734156679
Training score: 0.9144603377463604
Training score: 0.9393476707020331
Training score: 0.9885212856239103
Training score: 0.9409339146555109
Training score: 0.9993110535042855
Training score: 0.9988747804556561


## Learning Pipeline - Subtask 2

In [7]:
# Subtask 2 has a single label column; flatten it to a 1-D target vector.
subtask2_labels_ids = ['LABEL_Sepsis']
y_train = labels.loc[:, subtask2_labels_ids].to_numpy().reshape(-1)

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# TODO (yarden):
# feature selection.
# parameter tuning (subsample, learning rate).

# Median imputation -> standardisation -> gradient-boosted classifier.
pipeline = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    HistGradientBoostingClassifier(),
)

# 5-fold cross-validated ROC AUC for the single subtask-2 target.
scores = cross_val_score(
    pipeline, x_train, y_train,
    scoring='roc_auc', cv=5, verbose=True,
)
print("Cross-validation score is {score:.3f},"
      " standard deviation is {err:.3f}"
      .format(score=np.mean(scores), err=np.std(scores)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.699, standard deviation is 0.025


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   17.5s finished


In [11]:
# Refit the subtask-2 pipeline on the full training set (fit works in place).
pipeline.fit(x_train, y_train)

# Positive-class probabilities: on the training set for the sanity score,
# on the test set for the submission column.
train_auc = metrics.roc_auc_score(y_train, pipeline.predict_proba(x_train)[:, 1])
predictions = pipeline.predict_proba(x_test)[:, 1]
print("Training score:", train_auc)
df[subtask2_labels_ids[0]] = predictions

Training score: 0.988209866581915


## Learning Pipeline - Subtask 3

In [12]:
# Label columns for subtask 3; each one is fitted as its own regression
# target further down.
subtask3_labels_ids = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]
# 2-D target matrix: one column per label, in the order listed above.
y_train = labels.loc[:, subtask3_labels_ids].to_numpy()

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (GridSearchCV,
    cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

# 5-fold cross-validated R^2, one independent regression problem per label.
for i, label in enumerate(subtask3_labels_ids):
    # Same steps as the pipeline that is fitted for the final predictions
    # (imputer -> scaler -> regressor), so the score describes the model
    # that is actually used.
    pipeline = make_pipeline(
                        SimpleImputer(strategy='median'),
                        StandardScaler(),
                        HistGradientBoostingRegressor(max_depth=3))
    scores = cross_val_score(pipeline, x_train, y_train[:, i],
                            cv=5,
                            scoring='r2',
                            verbose=True)
    # Name the label in the report: the progress messages are interleaved
    # with these lines, so anonymous scores cannot be attributed reliably.
    print("{label}: cross-validation score is {score:.3f},"
          " standard deviation is {err:.3f}"
          .format(label=label, score=scores.mean(), err=scores.std()))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.416, standard deviation is 0.008


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.616, standard deviation is 0.015


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Cross-validation score is 0.380, standard deviation is 0.022
Cross-validation score is 0.637, standard deviation is 0.012


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.6s finished


In [14]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
# Fit one regressor per subtask-3 label on the full training set and store
# its test-set predictions in the output frame.
for i, label in enumerate(subtask3_labels_ids):
    target = y_train[:, i]
    pipeline = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        HistGradientBoostingRegressor(max_depth=3),
    ).fit(x_train, target)
    # In-sample R^2, printed only as a sanity check against the CV score.
    train_r2 = metrics.r2_score(target, pipeline.predict(x_train))
    predictions = pipeline.predict(x_test)
    print("Training score:", train_r2)
    df[label] = predictions

Training score: 0.46657876158710565
Training score: 0.6432809148552587
Training score: 0.4709852855526181
Training score: 0.6631141320848979


## Save predictions

In [15]:
df.to_csv('prediction.csv', index=False, float_format='%.4f')