# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Raise the pandas display limits to 500 columns and 100 rows.
for option_name, limit in (('display.max_columns', 500), ('display.max_rows', 100)):
    pd.set_option(option_name, limit)

# Read in and preprocess Data

In [None]:
# Load the three input tables from the working directory, in this order:
# train features, train labels, test features.
train_features, train_labels, test_features = [
    pd.read_csv(csv_path)
    for csv_path in ("train_features.csv", "train_labels.csv", "test_features.csv")
]



In [None]:
# Split the label table into three frames by column position. Column 0 is kept
# in every frame (presumably the pid column -- confirm against train_labels.csv).
#   train_labels1: columns 0..10
#   train_labels2: columns 0 and 11
#   train_labels3: columns 0 and 12..15
train_labels1 = train_labels.iloc[:, :11].copy()
train_labels2 = train_labels.iloc[:, [0, 11]].copy()
train_labels3 = train_labels.iloc[:, [0, *range(12, 16)]].copy()

# Show the first label frame as the cell output.
train_labels1

In [None]:
# Names of every column after the first two (presumably pid and Time -- confirm).
col_names = list(train_features.columns[2:])

# Keep only the rows with Time <= 12, then reshape from long to wide so that
# each pid occupies exactly one row. The pivoted frames carry a two-level
# (variable, Time) column index.
train_features = train_features[train_features["Time"] <= 12].pivot(index='pid', columns="Time")
test_features = test_features[test_features["Time"] <= 12].pivot(index='pid', columns="Time")

# Flat "<variable><k>" names for k = 1..12. They are only computed here, not
# assigned to the frames.
# NOTE(review): this assumes exactly 12 time points per variable -- confirm.
new_colnames = [name + str(step) for name in col_names for step in range(1, 13)]

In [None]:
# (rows, columns) of the pivoted train feature table.
train_features.shape

In [None]:
# Number of generated flat column names (12 per entry of col_names).
len(new_colnames)

In [None]:
# Display the pivoted train feature table.
train_features

In [None]:
# Rebuild the flat column names: every variable name followed by 1..12.
hour_suffixes = [str(hour) for hour in range(1, 13)]
new_colnames = [name + suffix for name in col_names for suffix in hour_suffixes]

In [None]:
# Flatten the two-level (variable, Time) column index produced by pivot into
# single "<variable><Time>" labels. Dropping level 0 instead would discard the
# variable names and leave only the Time values, so every variable would share
# the same duplicated labels.
# The test frame is flattened the same way so both frames expose matching columns.
train_features.columns = [f"{variable}{time}" for variable, time in train_features.columns]
test_features.columns = [f"{variable}{time}" for variable, time in test_features.columns]
train_features

In [None]:
# Display the generated list of flat column names.
new_colnames

# EDA

The following heatmaps and counts show that many of the feature values are missing.

In [None]:
# Count the missing entries in the train feature table and report them next to
# the table dimensions.
nan_mask = train_features.isnull()
n_rows, n_cols = train_features.shape
missing = nan_mask.sum().sum()
total = train_features.size
print(13*"-", "train features", 13*"-")
print(n_rows, "observations,", n_cols, "variables")
print("Nb. of values:  ", total)
print("Nb. of NaN:     ", missing)
print("Portion of NaN: ", round(missing/total, 4))

# Draw the NaN mask as a heatmap (no row tick labels, no colour bar).
ax = sns.heatmap(nan_mask, yticklabels=False, cbar=False, cmap="viridis", ax=plt.axes())
ax.set_title('Heatmap of the NaN for train features')
plt.show()

In [None]:
# Count the missing entries in the train label table and report them next to
# the table dimensions.
nan_mask = train_labels.isnull()
n_rows, n_cols = train_labels.shape
missing = nan_mask.sum().sum()
total = train_labels.size
print(13*"-", "train labels", 13*"-")
print(n_rows, "observations,", n_cols, "variables")
print("Nb. of values:  ", total)
print("Nb. of NaN:     ", missing)
print("Portion of NaN: ", round(missing/total, 4))

# Draw the NaN mask as a heatmap (no row tick labels, no colour bar).
ax = sns.heatmap(nan_mask, yticklabels=False, cbar=False, cmap="viridis", ax=plt.axes())
ax.set_title('Heatmap of the NaN for train labels')
plt.show()

In [None]:
# Count the missing entries in the test feature table and report them next to
# the table dimensions.
nan_mask = test_features.isnull()
n_rows, n_cols = test_features.shape
missing = nan_mask.sum().sum()
total = test_features.size
print(13*"-", "test features", 13*"-")
print(n_rows, "observations,", n_cols, "variables")
print("Nb. of values:  ", total)
print("Nb. of NaN:     ", missing)
print("Portion of NaN: ", round(missing/total, 4))

# Draw the NaN mask as a heatmap (no row tick labels, no colour bar).
ax = sns.heatmap(nan_mask, yticklabels=False, cbar=False, cmap="viridis", ax=plt.axes())
ax.set_title('Heatmap of the NaN for test features')
plt.show()

# Data Imputation

In [None]:
# Mean-impute the missing feature values.
#
# A single imputer is fitted on the training features and applied to both
# tables. Fitting a second imputer on the test set would fill train and test
# with different means, and could drop a different set of all-NaN columns,
# leaving the two tables with mismatched features.
imp_train = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_train = imp_train.fit(train_features)
# Kept as an alias so any code that still refers to imp_test keeps working.
imp_test = imp_train

# transform() discards columns whose fitted mean is NaN (no observed value in
# the training data). Mask those out so the surviving column labels line up
# with the transformed array.
kept_columns = ~np.isnan(imp_train.statistics_)

# Rebuild the frames with their original row index and the surviving column
# labels instead of falling back to anonymous 0..n-1 labels.
train_features = pd.DataFrame(
    imp_train.transform(train_features),
    index=train_features.index,
    columns=train_features.columns[kept_columns],
)
test_features = pd.DataFrame(
    imp_train.transform(test_features),
    index=test_features.index,
    columns=test_features.columns[kept_columns],
)

# Sanity check: both counts should be 0 after imputation.
print("Nb. of NaN train features: ", train_features.isnull().sum().sum())
print("Nb. of NaN test features:  ", test_features.isnull().sum().sum())

In [None]:
# Display the imputed train feature table.
train_features

# Subtask 1

In [None]:
# Distinct pids present in the label table. `unique` has to be called;
# without the parentheses the cell only shows the bound method object.
train_labels["pid"].unique()

In [None]:
# Number of patients in the feature table. pivot(index='pid') moved pid into
# the row index, so there is no "pid" column to select and every row stands
# for one pid. Counting the distinct row labels therefore gives the count.
train_features.index.unique().shape

# Subtask 2

In [None]:
# Standardise the features, then fit an SVC on the sepsis label.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# The feature rows come out of pivot(index='pid') ordered by pid, while the
# label table is still in file order. Put the labels in ascending pid order
# so that row i of the features and label i belong to the same patient.
pid_order = train_labels["pid"].sort_values(kind="mergesort").index
sepsis_labels = train_labels2.loc[pid_order, "LABEL_Sepsis"]

clf.fit(train_features, sepsis_labels)

In [None]:
# Predict with the fitted pipeline on the test features; the result is only
# displayed as the cell output, not stored.
clf.predict(test_features)

In [None]:
# Print the (rows, columns) of the label table, then of the train features.
for frame in (train_labels, train_features):
    print(frame.shape)

