# Training

## Imports

In [1]:
import json
from pathlib import Path
import pickle


import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier


# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were produced by a trusted source.

# The raw training frame is stored under the ``'in'`` key of ``train_data``.
train_data = {}
train_data_path = Path('../data/train_data.pkl')
train_data['in'] = pickle.loads(train_data_path.read_bytes())

test_data_path = Path('../data/test_data.pkl')
test_data = pickle.loads(test_data_path.read_bytes())

# ``groups`` is indexed by group name (e.g. ``'labs'``) further below to get
# the list of column names belonging to that group.
groups_path = Path('../data/groups.json')
groups = json.loads(groups_path.read_text())

## Functions definitions

### Pre-processing functions

As discussed during the exploratory data analysis, there are several pre-processing steps that could be taken. This section defines those steps as functions.

In [2]:
def drop_redundant_columns(df):
    '''
    Drop redundant columns from the ``DataFrame``.

    Returns a new ``DataFrame`` with the ``_diff`` and ``_diff_rel`` columns
    (optionally followed by a ``__<suffix>``) removed.  The input ``df`` is
    left unmodified.
    '''
    # The pattern is a raw string because ``\w`` is an invalid escape
    # sequence in a plain string literal.
    #
    # ``expand=False`` makes ``extract`` return an ``Index`` regardless of how
    # many columns ``df`` has.  Squeezing the expanded ``DataFrame`` instead
    # collapses a single-column input into a bare string, which has no
    # ``dropna`` method.
    redundant_features = (
        df.columns
        .str.extract(r'(\w+_diff(?:_rel)?(?:__.+)?)', expand=False)
        .dropna()
    )

    out_df = df.drop(redundant_features, axis=1)

    return out_df


def drop_duplicate_columns(df, groups=groups):
    '''
    Drop ``DataFrame``'s repeated columns.

    Since the ``DataFrame`` only has duplicated columns in the ``labs``
    category, they can be dropped directly.  Among the columns listed in
    ``groups['labs']``, those whose names match ``<name>_mean``,
    ``<name>_median``, ``<name>_min``, ``<name>_diff`` or ``<name>_diff_rel``
    (optionally followed by a ``__<suffix>``) are removed.

    Returns a new ``DataFrame``; the input ``df`` is left unmodified.
    '''
    # Since some of the columns may have been dropped in a previous operation,
    # it is necessary to get the intersection of ``groups['labs']`` with the
    # columns present in ``df`` to avoid a ``KeyError``.
    columns = df.columns.intersection(groups['labs'])

    # Raw string: ``\w`` is an invalid escape sequence in a plain literal.
    # ``diff(?:_rel)?`` is used instead of ``diff|diff_rel`` because, with the
    # latter, the alternation stops at ``diff`` and ``x_diff_rel`` is extracted
    # as ``x_diff``.
    # ``expand=False`` keeps the result an ``Index`` even when a single column
    # is present (squeezing a 1x1 ``DataFrame`` yields a bare string).
    cols_to_drop = (
        columns
        .str.extract(r'(\w+_(?:mean|median|min|diff(?:_rel)?)(?:__.+)?)', expand=False)
        .dropna()
    )

    out_df = df.drop(cols_to_drop, axis=1)

    return out_df


def impute_data(df):
    '''
    Fill missing values within each ``id`` group.

    For every column, gaps are first filled backwards and then forwards,
    without crossing the boundaries between ``id`` groups.  Values that are
    missing for a whole group remain missing.
    '''
    def fill_both_ways(column):
        # Backward fill first; forward fill covers what is left at the end.
        return column.bfill().ffill()

    per_id = df.groupby('id')

    return per_id.transform(fill_both_ways)

def one_hot_encode(df):
    '''
    Replace ``age_percentil`` by its one-hot encoding.

    Returns a new ``DataFrame`` without the ``age_percentil`` column and with
    one indicator column per distinct ``age_percentil`` value.  Each indicator
    column is inserted at position 1, so they end up in reverse order.
    '''
    indicators = pd.get_dummies(df['age_percentil'])

    encoded = df.drop(columns=['age_percentil'])
    for name, values in indicators.items():
        encoded.insert(1, name, values)

    return encoded

def reencode_icu(df):
    '''
    Return a copy of ``df`` with the ``icu`` column reencoded.

    In the returned ``DataFrame`` every row of a given ``id`` carries the
    maximum ``icu`` value observed for that ``id``: 1 if the patient was
    admitted at any point in time and 0 otherwise.
    '''
    # Per-``id`` maximum, broadcast back to the original rows.
    ever_admitted = df['icu'].groupby('id').transform('max')

    result = df.copy()
    result.loc[:, 'icu'] = ever_admitted

    return result


def process_rows(df, how='every_row', groups=groups):
    '''
    Process ``DataFrame`` according to discussion in the EDA.

    The ``icu`` column is always reencoded first (see ``reencode_icu``).  The
    rows are then handled according to ``how``:

    ``'every_row'``
        Rows are returned as they are.
    ``'first_window'``
        Only the rows whose second index level equals ``'0-2'`` are kept.
    ``'aggregate'``
        One row per ``id``.  Features are aggregated according to the group
        they belong to in ``groups``: ``demographics`` with ``first``,
        ``comorbidities`` with ``max``, ``labs`` and ``vitals`` with ``mean``.
        ``icu`` is aggregated with ``max``.  Columns that are neither in
        ``groups`` nor ``icu`` are not part of the result.
    ``'pivot'``
        One row per ``id``.  ``labs`` and ``vitals`` features are unstacked
        over the ``window`` index level into ``<feature>__<window>`` columns
        (with ``-`` replaced by ``_``), and concatenated with the per-``id``
        maximum of the ``demographics`` and ``comorbidities`` features and
        with ``icu``.

    Raises ``ValueError`` if ``how`` is not one of the options above.  The
    input ``df`` is left unmodified.
    '''
    valid_options = ('every_row', 'first_window', 'aggregate', 'pivot')
    if how not in valid_options:
        # Fail early and explicitly; merely printing a message would let the
        # function run into an unbound ``out_df`` at the ``return``.
        raise ValueError(
            f'Invalid option passed to "how" parameter: {how!r}. '
            f'Expected one of {valid_options}.'
        )

    # ``reencode_icu`` works on a copy, so ``df`` itself is not modified.
    in_df = reencode_icu(df)

    if how == 'every_row':
        # Nothing has to be done in this case.
        out_df = in_df

    elif how == 'first_window':
        out_df = in_df.loc[(slice(None), '0-2'), :]

    elif how == 'aggregate':
        agg_funcs_dict = {
            'demographics': 'first',
            'comorbidities': 'max',
            'labs': 'mean',
            'vitals': 'mean',
        }

        agg_funcs = {
            feature: agg_funcs_dict[group]
            for group, feature_list in groups.items()
            for feature in feature_list
            # Check if given feature is among ``DataFrame``'s columns in case
            # it has been dropped.
            if feature in in_df.columns
        }

        # ``icu`` is not in the groups dictionary.
        agg_funcs['icu'] = 'max'

        out_df = (
            in_df.groupby('id')
            .agg(agg_funcs)
        )

    else:  # how == 'pivot'
        # Only ``labs`` and ``vitals`` have different values for the distinct
        # time windows.  The intersection prevents a ``KeyError`` due to
        # dropped columns.
        features_to_pivot = in_df.columns.intersection(groups['labs']+groups['vitals'])

        pivoted_df = in_df[features_to_pivot].unstack('window')

        # Flattens the ``MultiIndex`` to a single level.
        pivoted_df.columns = pivoted_df.columns.map(
            lambda multiindex_tuple: '__'.join(multiindex_tuple).replace('-', '_')
        )

        # Same guard as above for the features that do not vary over time:
        # keep only those still present in the ``DataFrame``, in the order
        # given by ``groups``.
        static_features = [
            feature
            for feature in groups['demographics']+groups['comorbidities']
            if feature in in_df.columns
        ]

        # Reconstructs the whole ``DataFrame`` by concatenating with the
        # remaining features aggregated.
        out_df = pd.concat(
            [
                in_df[static_features].groupby('id').max(),
                pivoted_df,
                in_df['icu'].groupby('id').first(),
            ],
            axis=1,
        )

    return out_df

## Training

### Pre-processing

Before passing the *train_data* to ``scikit-learn``, it has to go through the pre-processing steps that were discussed during the EDA.

In [3]:
# Run the raw training frame through the pre-processing functions, in order;
# each one takes a ``DataFrame`` and returns the processed ``DataFrame``.
preprocessed = train_data['in']
for step in (
    drop_redundant_columns,
    drop_duplicate_columns,
    impute_data,
    one_hot_encode,
    reencode_icu,
):
    preprocessed = step(preprocessed)

train_data['preprocessed'] = preprocessed

train_data['preprocessed'].head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,age_above65,above_90th,90th,80th,70th,60th,50th,40th,30th,20th,...,respiratory_rate_min,temperature_min,oxygen_saturation_min,bloodpressure_diastolic_max,bloodpressure_sistolic_max,heart_rate_max,respiratory_rate_max,temperature_max,oxygen_saturation_max,icu
id,window,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0-2,1,0,1,0,0,0,0,0,0,0,...,1.0,0.318681,1.0,-0.504274,-0.32973,-0.059701,0.636364,-0.275362,1.0,1
1,2-4,1,0,1,0,0,0,0,0,0,0,...,-0.285714,0.318681,0.959596,-0.589744,0.32973,-0.38806,-0.454545,-0.275362,0.894737,1
1,4-6,1,0,1,0,0,0,0,0,0,0,...,-0.357143,0.362637,1.0,-0.589744,0.124324,-0.492537,-0.515152,-0.217391,1.0,1
1,6-12,1,0,1,0,0,0,0,0,0,0,...,-0.142857,0.406593,1.0,-0.57265,0.167568,-0.477612,-0.090909,-0.014493,1.0,1
1,above_12,1,0,1,0,0,0,0,0,0,0,...,-1.0,0.010989,0.79798,0.555556,0.556757,0.298507,0.757576,0.710145,1.0,1
2,0-2,0,0,0,0,0,0,0,0,0,0,...,-0.714286,0.604396,0.959596,-0.435897,-0.491892,0.0,-0.575758,0.101449,1.0,1
2,2-4,0,0,0,0,0,0,0,0,0,0,...,-0.714286,0.604396,0.959596,-0.435897,-0.491892,0.0,-0.575758,0.101449,1.0,1
2,4-6,0,0,0,0,0,0,0,0,0,0,...,-0.714286,0.604396,0.959596,-0.435897,-0.491892,0.0,-0.575758,0.101449,1.0,1
2,6-12,0,0,0,0,0,0,0,0,0,0,...,-0.642857,0.604396,0.79798,-0.57265,-0.762162,0.0,-0.69697,0.101449,1.0,1
2,above_12,0,0,0,0,0,0,0,0,0,0,...,-0.642857,0.142857,0.878788,-0.247863,-0.351351,-0.149254,-0.454545,0.101449,0.947368,1


The *test_data* goes through the same pre-processing steps as the *train_data*.

In [4]:
# Apply the pre-processing functions to the test frame, in order.
# NOTE: ``test_data`` is overwritten with the processed frame.
for step in (
    drop_redundant_columns,
    drop_duplicate_columns,
    impute_data,
    one_hot_encode,
    reencode_icu,
):
    test_data = step(test_data)

test_data.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,age_above65,above_90th,90th,80th,70th,60th,50th,40th,30th,20th,...,respiratory_rate_min,temperature_min,oxygen_saturation_min,bloodpressure_diastolic_max,bloodpressure_sistolic_max,heart_rate_max,respiratory_rate_max,temperature_max,oxygen_saturation_max,icu
id,window,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0-2,1,0,0,0,0,1,0,0,0,0,...,-0.5,0.208791,0.89899,-0.247863,-0.459459,-0.432836,-0.636364,-0.42029,0.736842,1
0,2-4,1,0,0,0,0,1,0,0,0,0,...,-0.5,0.714286,0.838384,-0.076923,-0.459459,-0.313433,-0.636364,0.246377,0.578947,1
0,4-6,1,0,0,0,0,1,0,0,0,0,...,-0.857143,0.318681,0.89899,-0.076923,0.286486,0.298507,0.272727,-0.275362,0.736842,1
0,6-12,1,0,0,0,0,1,0,0,0,0,...,-0.857143,0.318681,0.89899,-0.076923,0.286486,0.298507,0.272727,-0.275362,0.736842,1
0,above_12,1,0,0,0,0,1,0,0,0,0,...,-0.857143,0.098901,0.79798,-0.076923,0.286486,0.298507,0.272727,0.362319,0.947368,1
4,0-2,0,0,0,0,0,0,0,0,0,0,...,-0.571429,0.538462,0.939394,-0.076923,-0.351351,-0.044776,-0.575758,0.072464,0.894737,0
4,2-4,0,0,0,0,0,0,0,0,0,0,...,-0.571429,0.538462,0.939394,-0.076923,-0.351351,-0.044776,-0.575758,0.072464,0.894737,0
4,4-6,0,0,0,0,0,0,0,0,0,0,...,-0.571429,0.538462,0.939394,-0.076923,-0.351351,-0.044776,-0.575758,0.072464,0.894737,0
4,6-12,0,0,0,0,0,0,0,0,0,0,...,-0.428571,0.340659,0.939394,-0.247863,-0.351351,0.0,-0.575758,0.130435,0.947368,0
4,above_12,0,0,0,0,0,0,0,0,0,0,...,-0.5,0.208791,0.89899,0.094017,-0.178378,0.104478,-0.454545,0.014493,0.894737,0


### Training

In [5]:
from tempfile import mkdtemp
from shutil import rmtree

# Features are every column but the target ``icu``.
train_X = train_data['preprocessed'].drop(['icu'], axis=1)
train_y = train_data['preprocessed']['icu']

# Temporary directory handed to the pipeline through its ``memory`` parameter.
cachedir = mkdtemp()

try:
    pipeline = make_pipeline(
        SimpleImputer(),
        LogisticRegression(max_iter=1000),
        memory=cachedir,
    )

    pipeline.fit(train_X, train_y)
finally:
    # Remove the directory even when building or fitting the pipeline fails,
    # so that no temporary directory is left behind.
    rmtree(cachedir)

In [6]:
test_X = test_data.drop(columns=['icu'])
test_y = test_data['icu']

# True labels and predictions side by side, plus a flag telling whether each
# prediction matches the label.
results = pd.DataFrame(
    {
        'icu': test_y,
        'prediction': pipeline.predict(test_X),
    }
)
results = results.assign(correct=results['icu'] == results['prediction'])

# Fraction of correctly predicted rows.
results['correct'].sum() / len(results)

0.766

In [7]:
from sklearn.metrics import accuracy_score, recall_score

predictions = pd.Series(pipeline.predict(test_X), index=test_y.index, name='prediction')

# Report accuracy and recall separately for each time window.
time_windows = ['0-2', '2-4', '4-6', '6-12', 'above_12']
for time_window in time_windows:
    # Any value on the first index level, this window on the second one.
    in_window = (slice(None), time_window)
    window_true = test_y[in_window]
    window_pred = predictions[in_window]

    accuracy = accuracy_score(window_true, window_pred)
    recall = recall_score(window_true, window_pred)

    print(
        f'Time window: {time_window}',
        f'    accuracy: {accuracy}',
        f'    recall: {recall}',
        '\n',
        sep='\n',
    )

Time window: 0-2
    accuracy: 0.7
    recall: 0.7058823529411765


Time window: 2-4
    accuracy: 0.67
    recall: 0.6470588235294118


Time window: 4-6
    accuracy: 0.81
    recall: 0.8235294117647058


Time window: 6-12
    accuracy: 0.81
    recall: 0.8627450980392157


Time window: above_12
    accuracy: 0.84
    recall: 0.8627450980392157


