<a href="https://colab.research.google.com/github/elvisbui/Predicting-Length-of-Stay/blob/master/Neural_Network_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Neural Networks
In this notebook, I will create a deep neural network to predict how long a patient will stay in a hospital. 

### Load Libraries 
The first step is to load the libraries we will be using. 

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import math
import pickle
import optuna

from time import time
from tqdm.notebook import tqdm


# import data processing and linear algebra libraries 
import pandas as pd
import numpy as np


# import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler


import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from hyperopt import hp, fmin, atpe, tpe, Trials
from hyperopt.pyll.base import scope


# Single seed value shared by every randomised component of the notebook.
SEED = 24
RANDOM_STATE = SEED

# Seed the NumPy and TensorFlow random number generators.
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load Data
Next, we load the data. 

In [None]:
# Relative paths of the input CSV files: training data, test data,
# sample submission and the data dictionary describing the training columns.
TRAIN_DIR = '../input/av-healthcare-analytics-ii/healthcare/train_data.csv'
TEST_DIR = '../input/av-healthcare-analytics-ii/healthcare/test_data.csv'
SAMPLE_SUBM = '../input/av-healthcare-analytics-ii/healthcare/sample_sub.csv'
TRAIN_DICT_DIR = '../input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv'

def read_csv(*paths: str) -> tuple:
    '''
    Reads every CSV file in ``paths`` and returns the dataframes as a tuple

            Parameters:
                    *paths (str): paths of the CSV files to read

            Returns:
                    dataframes (tuple of DataFrame): one dataframe per path,
                    in the same order as ``paths`` (empty tuple if no path
                    is given)
    '''
    # `path` rather than `dir`, so the builtin dir() is not shadowed.
    return tuple(pd.read_csv(path) for path in paths)

# Load the four CSV files; the unpacking order matches the argument order.
train, test, sample_subm, train_dict = read_csv(TRAIN_DIR, TEST_DIR, SAMPLE_SUBM, TRAIN_DICT_DIR)

# Preprocessing
Preprocessing data for neural networks is different compared to preprocessing data for a gradient boosting tree.
Let's first deal with missing data. 

In [None]:
# Count the missing values in each column of the training data.
train.isnull().sum()

We see that 'Bed Grade' and 'City_Code_Patient' both have missing values. We can replace the missing 'Bed Grade' values with the median. 

In [None]:
# Median of the non-missing 'Bed Grade' values.
train['Bed Grade'].median()

In [None]:
# Impute missing bed grades with the column median. The result is assigned
# back instead of calling fillna(inplace=True) on the selected column: the
# in-place form operates on an intermediate object (chained assignment) and
# is not guaranteed to update `train`.
train['Bed Grade'] = train['Bed Grade'].fillna(train['Bed Grade'].median())

In [None]:
# Count the missing 'Bed Grade' values.
train['Bed Grade'].isnull().sum()

For 'City_Code_Patient', we can impute the missing values by looking at entries with similar data. 

In [None]:
# Number of distinct patient city codes.
train['City_Code_Patient'].nunique()

In [None]:
# Number of distinct hospital type codes.
train['Hospital_type_code'].nunique()

In [None]:
# Number of distinct departments.
train['Department'].nunique()

In [None]:
# Most frequent patient city code(s) within each
# (hospital city, hospital type, department) group.
train.groupby(['City_Code_Hospital', 'Hospital_type_code','Department'])['City_Code_Patient'].agg(pd.Series.mode)

In [None]:
# Fill each missing patient city code with the most frequent code among rows
# that share the same hospital city, hospital type and department.
# transform() keeps the original row index, so the result aligns with `train`
# (groupby.apply can return a group-keyed MultiIndex that does not).
# A group with no observed code at all is left untouched instead of raising.
train['City_Code_Patient'] = (
    train.groupby(['City_Code_Hospital', 'Hospital_type_code', 'Department'], sort=False)['City_Code_Patient']
    .transform(lambda codes: codes.fillna(codes.value_counts().index[0]) if codes.notna().any() else codes)
)

In [None]:
# Distinct patient city codes present after the imputation step.
train['City_Code_Patient'].unique()

In [None]:
# Re-count the missing values per column.
train.isnull().sum()

In [None]:
# Columns that are one-hot encoded with pd.get_dummies.
cat_columns = ['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
               'Department', 'Ward_Type', 'Ward_Facility_Code', 'City_Code_Patient']

In [None]:
# Cast every column listed in cat_columns to the pandas 'category' dtype.
train = train.astype(dict.fromkeys(cat_columns, 'category'))

In [None]:
# Show the dtype of every column.
train.dtypes

In [None]:
# One-hot encode the columns listed in cat_columns; the other columns are kept as they are.
train_one_hot = pd.get_dummies(train, columns = cat_columns)

In [None]:
# Preview the first rows of the one-hot encoded frame.
train_one_hot.head()

In [None]:
# Ordinal codes for the length-of-stay buckets: ten-day ranges in ascending
# order, followed by the open-ended final bucket.
stay_encode = {
    label: code
    for code, label in enumerate([
        '0-10', '11-20', '21-30', '31-40', '41-50', '51-60',
        '61-70', '71-80', '81-90', '91-100', 'More than 100 Days',
    ])
}

# Integer codes for the admission type.
admission_type_encode = dict(Trauma=0, Emergency=1, Urgent=2)

# Integer codes for illness severity, from least to most severe.
illness_encode = dict(Minor=0, Moderate=1, Extreme=2)

# Age uses the same ten-year ranges as the stay buckets, without the
# open-ended final bucket.
age_encode = {label: code for label, code in stay_encode.items() if code < 10}

In [None]:
# Replace the string labels of each ordinal column with its integer code.
for column, mapping in (
    ('Stay', stay_encode),
    ('Type of Admission', admission_type_encode),
    ('Severity of Illness', illness_encode),
    ('Age', age_encode),
):
    train_one_hot[column] = train_one_hot[column].map(mapping)

We will be removing the patientid column since it is a categorical column with many unique values and would not help improve the model. However, we will first count the number of times each patientid appears in the data. 

In [None]:
# Per-row count of non-null Admission_Deposit values for that row's patientid.
# NOTE(review): despite the column name 'patient_deposit_mean', this stores a
# count, not a mean.
train_one_hot['patient_deposit_mean'] = train.groupby(['patientid'])['Admission_Deposit'].transform('count')

In [None]:
# Preview the frame with the encoded columns and the new per-patient feature.
train_one_hot.head()

# Feature Engineering

We do not need to do any feature engineering because the neural net model will do the feature engineering for us. 

# Baseline Model
We will now make a baseline neural net model that we can iterate from. 

In [None]:
# Target: the label-encoded stay bucket, kept as a one-column frame.
y = train_one_hot[['Stay']]

# Features: every column except the target and the two identifier columns.
X = train_one_hot.drop(columns=['Stay', 'case_id', 'patientid'])

# Standardise the features. fit_transform returns an array, so it is wrapped
# back into a DataFrame (with default integer column labels).
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split, so validation rows contribute to the scaling statistics.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))

# One-hot encode the target.
y = pd.get_dummies(y, columns=['Stay'])

# Hold out 20% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

In [None]:
def create_nn(num_cols):
    '''
    Builds and compiles the baseline network: one 64-unit ReLU hidden layer
    followed by an 11-way softmax output

            Parameters:
                    num_cols (int): number of input features

            Returns:
                    model (tf.keras.Sequential): model compiled with
                    categorical cross-entropy loss, the Adam optimizer and
                    the accuracy metric
    '''
    model = tf.keras.Sequential([
        # Pass the shape as a tuple, the documented form of `shape`
        # (also how create_nn2 declares its input).
        tf.keras.layers.Input(shape=(num_cols,)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(11, activation="softmax"),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Let's now check if our model is working. 

In [None]:
# Build the baseline for the number of training features and fit it for a
# few epochs, reporting the validation metrics after each epoch.
model = create_nn(X_train.shape[1])
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=64,
    verbose=2,
)

Now that we know our model is working, we need to create the validation schema. We will use the same schema as the one we created with CatBoost. 

# Improving the model
Now that we have our baseline model working, it is now time to improve it. 
I will test the model with the following architecture to see if performance improves. 

1. Dropout
2. Batch Normalization
3. Weight Normalization
4. Lookahead
5. Different number of layers
6. Different number of units for each layer
7. Early Stopping
8. Checkpoints
9. Reducing learning rate using ReduceLROnPlateau



## Architecture Tuning

In [None]:
# Label-encoded target (one column) used to stratify the folds:
# StratifiedKFold cannot split on one-hot encoded labels.
skf_y = train_one_hot[['Stay']]

In [None]:
def create_nn2(num_columns, hidden_units, dropout_rate):
    '''
    Builds and compiles a configurable network: the input and every hidden
    layer are followed by batch normalization and dropout, and the dense
    layers are wrapped in weight normalization

            Parameters:
                    num_columns (int): number of input features
                    hidden_units (iterable of int): width of each hidden layer
                    dropout_rate (float): rate used by every dropout layer

            Returns:
                    model (tf.keras.Model): model with an 11-way softmax
                    output, compiled with categorical cross-entropy loss,
                    Lookahead-wrapped Adam and the accuracy metric
    '''
    layers = tf.keras.layers

    def _norm_then_drop(tensor):
        # Batch normalization followed by dropout, as applied after every stage.
        normed = layers.BatchNormalization()(tensor)
        return layers.Dropout(dropout_rate)(normed)

    inp = layers.Input(shape=(num_columns,))
    hidden = _norm_then_drop(inp)

    for width in hidden_units:
        dense = tfa.layers.WeightNormalization(layers.Dense(width, activation='relu'))
        hidden = _norm_then_drop(dense(hidden))

    head = tfa.layers.WeightNormalization(layers.Dense(11, activation='softmax'))
    out = head(hidden)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam()),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
def optimise(params):
    '''
    Hyperopt objective: trains ``create_nn2`` with the sampled
    hyperparameters under stratified 2-fold cross-validation and returns the
    log loss of the out-of-fold predictions

    Reads the module-level ``X`` (features), ``y`` (one-hot targets) and
    ``skf_y`` (label-encoded targets used for stratification). Writes one
    checkpoint file per fold to the working directory.

            Parameters:
                    params (dict): sampled point of the search space; uses
                    'hidden_unit_1', 'hidden_unit_2', 'hidden_unit_3'
                    (a missing or zero width drops that layer) and
                    'dropout_rate'

            Returns:
                    score (float): log loss of the out-of-fold predictions
                    against ``y`` (lower is better)
    '''
    print(params)

    N_SPLITS = 2

    # One hidden layer per sampled width. The search space offers 0 for the
    # third layer, which means "no such layer".
    hidden_units = [
        params[key]
        for key in ('hidden_unit_1', 'hidden_unit_2', 'hidden_unit_3')
        if params.get(key)
    ]

    # Out-of-fold predictions in a float array. Every row belongs to exactly
    # one validation fold, so each row is written once and no averaging is
    # needed afterwards.
    oof_pred = np.zeros(y.shape, dtype=float)

    skf = StratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

    for n, (train_ind, val_ind) in enumerate(skf.split(skf_y, skf_y)):

        print('Fold', n)

        x_tr, x_val = X.values[train_ind], X.values[val_ind]
        y_tr, y_val = y.values[train_ind], y.values[val_ind]

        model = create_nn2(X.shape[1], hidden_units, params['dropout_rate'])

        checkpoint_path = f'Fold:{n}.hdf5'

        # Cut the learning rate by 10x after 3 epochs without improvement.
        reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5,
                                           patience=3, verbose=0, mode='min')

        # Keep only the weights of the best epoch (lowest validation loss).
        cb_checkpt = ModelCheckpoint(checkpoint_path, monitor='val_loss',
                                     verbose=0, save_best_only=True,
                                     save_weights_only=True, mode='min')

        # Stop after 5 epochs without improvement.
        early = EarlyStopping(monitor="val_loss", mode="min",
                              restore_best_weights=True,
                              patience=5, verbose=0)

        history = model.fit(x_tr, y_tr,
                            validation_data=(x_val, y_val),
                            epochs=50,
                            batch_size=128,
                            callbacks=[reduce_lr_loss, cb_checkpt, early],
                            verbose=0)

        hist = pd.DataFrame(history.history)

        print('history saved')
        # Predict with the best checkpoint of this fold.
        model.load_weights(checkpoint_path)

        oof_pred[val_ind] = model.predict(x_val)
        print(f'Fold {n} Best Loss:\t', hist['val_loss'].min())

        # Free the graph and model memory before the next fold.
        K.clear_session()
        del model, history, hist
        gc.collect()

    score = log_loss(y.values, oof_pred)
    print('Total Score', score)
    return score

In [None]:
# Search space: a width for each of three hidden layers (0 is offered for the
# third one) and a dropout rate drawn uniformly from [0.3, 0.5].
param_space = {'hidden_unit_1': hp.choice('hidden_unit_1', [1024, 512, 256, 128]), 
               'hidden_unit_2': hp.choice('hidden_unit_2', [1024, 512, 256, 128]), 
               'hidden_unit_3': hp.choice('hidden_unit_3', [512, 256, 128, 0]), 
               'dropout_rate': hp.uniform('dropout_rate', 0.3, 0.5), 
              }

# Collects the evaluated points and their losses.
trials = Trials()

# Minimise `optimise` with the TPE algorithm, capped at 15 evaluations and a
# timeout of 1800.
# NOTE(review): for hp.choice dimensions fmin is documented to report the
# index of the chosen option rather than its value — confirm before reading
# layer widths out of `hopt`.
hopt = fmin(fn = optimise, 
            space = param_space, 
            algo = tpe.suggest, 
            max_evals = 15, 
            timeout = 1800, 
            trials = trials, 
           )

# Improved Model
Let's create a new improved model from the architecture tuning results.

In [None]:
def improved_nn(num_cols):
    '''
    Builds and compiles the tuned network: batch normalization and dropout
    (rate 0.4) on the input and after each of the two weight-normalized ReLU
    hidden layers (256 and 128 units), followed by a weight-normalized
    11-way softmax output

            Parameters:
                    num_cols (int): number of input features

            Returns:
                    model (tf.keras.Sequential): model compiled with
                    categorical cross-entropy loss, Lookahead-wrapped Adam
                    and the accuracy metric
    '''
    model = tf.keras.Sequential([
        # Pass the shape as a tuple, the documented form of `shape`
        # (also how create_nn2 declares its input).
        tf.keras.layers.Input(shape=(num_cols,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(256, activation='relu')),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(128, activation='relu')),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tfa.layers.WeightNormalization(tf.keras.layers.Dense(11, activation='softmax')),
    ])
    model.compile(loss='categorical_crossentropy',
                  optimizer=tfa.optimizers.Lookahead(tf.optimizers.Adam()),
                  metrics=['accuracy'])
    return model

In [None]:
def run_improved_nn(X, y):
    '''
    Trains ``improved_nn`` with stratified 5-fold cross-validation repeated
    for 3 differently seeded splits and returns the out-of-fold predictions
    averaged over the repetitions

    Reads the module-level ``skf_y`` (label-encoded targets) to stratify the
    folds; its rows must be in the same order as ``X`` and ``y``. Writes one
    checkpoint file per seed and fold to the working directory.

            Parameters:
                    X (DataFrame): feature matrix
                    y (DataFrame): one-hot encoded targets, row-aligned with X

            Returns:
                    cv_result (DataFrame): averaged out-of-fold class
                    probabilities, with the index and columns of ``y``
    '''
    N_SPLITS = 5
    N_STARTS = 3

    # Sum of the out-of-fold predictions over all seeds, kept in a float
    # array so probabilities are never written into the dummy-typed columns
    # of `y`.
    oof_sum = np.zeros(y.shape, dtype=float)

    for seed in range(N_STARTS):
        skf = StratifiedKFold(n_splits=N_SPLITS, random_state=seed, shuffle=True)
        for n, (train_ind, val_ind) in enumerate(skf.split(skf_y, skf_y)):

            print(f'Seed: {seed} ------------- Fold:{n}')

            x_tr, x_val = X.values[train_ind], X.values[val_ind]
            y_tr, y_val = y.values[train_ind], y.values[val_ind]

            model = improved_nn(X.shape[1])

            checkpoint_path = f'Seed:{seed}-Fold:{n}.hdf5'

            # Cut the learning rate by 10x after 3 epochs without improvement.
            reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5,
                                               patience=3, verbose=0, mode='min')

            # Keep only the weights of the best epoch (lowest validation loss).
            cb_checkpt = ModelCheckpoint(checkpoint_path, monitor='val_loss',
                                         verbose=0, save_best_only=True,
                                         save_weights_only=True, mode='min')

            # Stop after 5 epochs without improvement.
            early = EarlyStopping(monitor="val_loss", mode="min",
                                  restore_best_weights=True,
                                  patience=5, verbose=0)

            history = model.fit(x_tr, y_tr,
                                validation_data=(x_val, y_val),
                                epochs=50,
                                batch_size=128,
                                callbacks=[reduce_lr_loss, cb_checkpt, early],
                                verbose=2)

            hist = pd.DataFrame(history.history)

            print('history saved')
            # Predict with the best checkpoint of this fold.
            model.load_weights(checkpoint_path)

            oof_sum[val_ind] += model.predict(x_val)
            print(f'Fold {n} Best Loss:\t', hist['val_loss'].min())

            # Free the graph and model memory before the next fold.
            K.clear_session()
            del model, history, hist
            gc.collect()

    # Every row is predicted once per seed, so average ALL rows over the
    # number of seeds (not only the rows of the last validation fold).
    cv_result = pd.DataFrame(oof_sum / N_STARTS, index=y.index, columns=y.columns)
    score = log_loss(y.values, cv_result.values)
    print('Total Score', score)
    return cv_result

In [None]:
# Out-of-fold predictions of the improved model. The function defined in this
# notebook is run_improved_nn; run_final_nn does not exist.
cs_result = run_improved_nn(X, y)