In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, r2_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import KFold

# let pandas render up to 500 columns before it starts eliding them
pd.options.display.max_columns = 500

In [2]:
# load the three CSV inputs: training features, training labels, test features
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in ("train_features.csv", "train_labels.csv", "test_features.csv")
)

In [3]:
# label names: column 0 of train_labels is the pid, the label columns follow
labels_all = train_labels.columns.values.tolist()
labels1 = labels_all[1:11]   # subtask 1: columns 1..10
labels2 = labels_all[11]     # subtask 2: a single column name (plain string, not a list)
labels3 = labels_all[12:]    # subtask 3: the remaining columns

# label values per subtask, taken positionally and converted to numpy
subtask1_cols = slice(1, 11)
subtask2_col = 11
subtask3_cols = list(range(12, 16))
train_labels1 = train_labels.iloc[:, subtask1_cols].to_numpy()
train_labels2 = train_labels.iloc[:, subtask2_col].to_numpy()
train_labels3 = train_labels.iloc[:, subtask3_cols].to_numpy()



In [5]:
# show the label table as loaded (pandas elides the middle rows in the rendered output)
train_labels

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,12.1,85.4,100.0,59.9
1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.8,100.6,95.5,85.5
2,100,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,16.5,88.3,96.5,108.1
3,1000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,19.4,77.2,98.3,80.9
4,10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.6,76.8,97.7,95.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18990,9993,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17.1,69.8,100.0,110.7
18991,9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.6,97.3,97.8,59.2
18992,9996,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,17.3,66.3,96.9,100.3
18993,9998,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,18.8,81.5,96.9,99.4


In [6]:
# inspect the subtask-1 label matrix (columns 1..10 of train_labels as a numpy array)
train_labels1

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.]])

In [None]:
#FROM LONG TO WIDE!

# unique patient IDs
patient_ids = set(train_features['pid'])

# Renumber "Time" for train and test features to the running position of each
# row within its patient (1, 2, ..., 12 when a patient has 12 rows). Counting
# per pid replaces tiling 1..12 a hard-coded number of times, so it no longer
# depends on how many patients the file contains.
train_features['Time'] = train_features.groupby('pid').cumcount() + 1
test_features['Time'] = test_features.groupby('pid').cumcount() + 1

# names of the measured variables, i.e. every column except the two pivot keys
col_names = [col for col in train_features.columns if col not in ('pid', 'Time')]

# to get only 1 row per pid, transform the data from long to wide format;
# the result has (variable, time) MultiIndex columns
train_features = train_features.pivot(index='pid', columns='Time')
test_features = test_features.pivot(index='pid', columns='Time')

# Flatten (variable, time) into "<variable><time>", e.g. ("Age", 1) -> "Age1".
# The names are read off the pivoted columns themselves so that each name is
# guaranteed to describe the column it is attached to.
new_colnames = [f"{var}{t}" for var, t in train_features.columns]
train_features.columns = new_colnames
test_features.columns = [f"{var}{t}" for var, t in test_features.columns]
assert list(test_features.columns) == new_colnames, \
    "train and test features must have identical wide columns"

# pivot() returns the rows sorted by pid, whereas train_labels1/2/3 were taken
# from train_labels in its file order. Put the feature rows back into the
# order of train_labels so that row i of the features belongs to row i of the
# label arrays.
train_features = train_features.reindex(index=train_labels['pid'])

# needed for the submit dataframe (same row order as test_features)
pid = test_features.index.to_numpy().astype(int)