In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline

# from sklearn.model_selection import (GridSearchCV, cross_val_score, KFold)
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics

In [2]:
# Read the three competition CSV files from the working directory.
train_data, labels, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'train_labels.csv', 'test_features.csv')
)

# Solution frame: take the first column of every 12th test row (starting at
# row 0) as the 'pid' column.
# NOTE(review): assumes each patient occupies exactly 12 consecutive rows and
# that column 0 holds the patient id - confirm against the data.
df = pd.DataFrame({'pid': test_data.iloc[::12, 0].to_numpy()})

#### Old method (disabled): per-patient summary statistics as features

In [3]:
# def calculate_time_features(data, n_samples):
#     x = []
#     features = [np.nanmedian, np.nanmean, np.nanvar, np.nanmin,
#            np.nanmax]
#     for index in range(int(data.shape[0] / n_samples)):
#         assert data[n_samples * index, 0] == data[n_samples * (index + 1) - 1, 0], \
#         'Ids are {}, {}'.format(data[n_samples * index, 0], data[n_samples * (index + 1) - 1, 0])
#         patient_data = data[n_samples * index:n_samples * (index + 1), 2:]
#         feature_values = np.empty((len(features), data[:, 2:].shape[1]))
#         for i, feature in enumerate(features):
#             feature_values[i] = feature(patient_data, axis=0)
#         x.append(feature_values.ravel())
#     return np.array(x)

In [4]:
# x_train = calculate_time_features(train_data.to_numpy(), 12)
# x_test = calculate_time_features(test_data.to_numpy(), 12)

#### New method

In [5]:
def feature_formatting(data, n_samples):
    """Turn every group of `n_samples` consecutive rows into one flat vector.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array. The first two columns are skipped; all remaining columns
        are used as features.
    n_samples : int
        Number of consecutive rows that form one group.

    Returns
    -------
    numpy.ndarray
        One row per complete group, holding that group's values flattened in
        row-major order. Trailing rows that do not fill a complete group are
        ignored.
    """
    n_groups = int(data.shape[0] / n_samples)
    flattened_groups = [
        data[start:start + n_samples, 2:].flatten()
        for start in range(0, n_groups * n_samples, n_samples)
    ]
    return np.array(flattened_groups)

In [6]:
# Build the model inputs: every block of 12 consecutive rows is flattened
# into a single feature vector, for the training and the test set alike.
x_train, x_test = (
    feature_formatting(frame.to_numpy(), 12)
    for frame in (train_data, test_data)
)

#### Regression

In [7]:
# Target columns of the regression task; the columns of y_train follow this
# order.
t3_labels = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]
y_train = labels.loc[:, t3_labels].to_numpy()

In [8]:
# Fit one gradient-boosted regressor per target column, store its test-set
# predictions in the solution frame, and report the R2 obtained on the
# training data.
# NOTE: these are *training* scores (the model is evaluated on the data it
# was fitted on), so they are optimistic estimates of test performance.
mean_r2 = 0
score = 0
for i, label in enumerate(t3_labels):
    method = HistGradientBoostingRegressor(max_depth=15)
    method.fit(x_train, y_train[:, i])
    predictions = method.predict(x_test)
    # Predict on the training set and compute R2 once, then reuse the value
    # for the printout and for both running totals.
    train_r2 = metrics.r2_score(y_train[:, i], method.predict(x_train))
    print("Training score:", train_r2)
    df[label] = predictions
    mean_r2 += train_r2
    # Per-target score: 0.5 + 0.5 * max(0, R2), i.e. negative R2 is clipped.
    score += 0.5 + 0.5*np.maximum(0, train_r2)
# Average over however many targets were fitted instead of a hard-coded 4.
n_targets = len(t3_labels)
print("\n----------------------------------------------------------------------------------")
print("Mean R2:\t", mean_r2/n_targets)
print("Score:\t\t", score/n_targets)
print("----------------------------------------------------------------------------------")


Training score: 0.6068055282408376
Training score: 0.7413608170738564
Training score: 0.531351947234971
Training score: 0.734737589026784

----------------------------------------------------------------------------------
Mean R2:	 0.6535639703941123
Score:		 0.8267819851970561
----------------------------------------------------------------------------------


#### Regression with pipeline (disabled; the output shown below is stale and comes from an earlier version of this cell)

In [9]:
# mean_r2 = 0
# score = 0

# for i, label in enumerate(t3_labels):
#     pipeline = make_pipeline(
#                         #SimpleImputer(strategy='median'),
#                         #StandardScaler(),
#                         HistGradientBoostingRegressor(max_depth=15))
#     pipeline = pipeline.fit(x_train, y_train[:, i])
#     predictions = pipeline.predict(x_test)
#     print("Training score:", metrics.r2_score(y_train[:, i], pipeline.predict(x_train)))
#     df[label] = predictions
#     mean_r2 += metrics.r2_score(y_train[:, i], pipeline.predict(x_train))
#     score += 0.5 + 0.5*np.maximum(0, metrics.r2_score(y_train[:, i], pipeline.predict(x_train)))

# print("\n----------------------------------------------------------------------------------")
# print("Mean R2:\t", mean_r2/4)
# print("Score:\t\t", score/4)
# print("----------------------------------------------------------------------------------")

Training score: 0.5751978246041309
Training score: 0.7227957753930083
Training score: 0.5221852247854422
Training score: 0.7601115333363111

----------------------------------------------------------------------------------
Mean R2:	 0.6450725895297231
Score:		 1.6493182799619175
----------------------------------------------------------------------------------


#### Save the results

In [10]:
# Write the solution frame to 'prediction.csv' (no stray space in the file
# name), without the index and with floats rounded to 4 decimal places.
df.to_csv('prediction.csv', index=False, float_format='%.4f')