<a href="https://colab.research.google.com/github/elvisbui/Predicting-Length-of-Stay/blob/master/Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble
In this notebook, we use stacking to combine the predictions of our neural network and CatBoost models and get a better result than either model alone.

### Load Libraries 
The first step is to load the libraries we will be using. 

In [None]:
import warnings
warnings.filterwarnings("ignore")

import gc
import math
import pickle
import optuna

from time import time
from tqdm.notebook import tqdm


# import data processing and linear algebra libraries 
import pandas as pd
import numpy as np


# import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler

from catboost import Pool, CatBoostClassifier

import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

from hyperopt import hp, fmin, atpe, tpe, Trials
from hyperopt.pyll.base import scope


# One seed value drives every source of randomness configured here.
SEED = 24
RANDOM_STATE = SEED

# Seed the global NumPy and TensorFlow generators.
np.random.seed(SEED)
tf.random.set_seed(SEED)

### Load Data
Next, we point to the data files and load them.

In [None]:
# Every input file sits in the same folder, so the prefix is spelled out once.
_DATA_ROOT = '../input/av-healthcare-analytics-ii/healthcare'

TRAIN_DIR = f'{_DATA_ROOT}/train_data.csv'
TEST_DIR = f'{_DATA_ROOT}/test_data.csv'
SAMPLE_SUBM = f'{_DATA_ROOT}/sample_sub.csv'
TRAIN_DICT_DIR = f'{_DATA_ROOT}/train_data_dictionary.csv'

def read_csv(*paths: str) -> tuple:
    '''
    Reads every given CSV file and returns the loaded dataframes in a tuple

            Parameters:
                    *paths (str): Paths of the CSV files to read

            Returns:
                    frames (tuple of dataframe): One dataframe per path, in the
                    order the paths were passed (empty tuple when no path is given)
    '''
    # A generator replaces the manual append loop and no longer shadows the builtin `dir`
    return tuple(pd.read_csv(path) for path in paths)

# Load the four competition files in a single call
loaded_frames = read_csv(TRAIN_DIR, TEST_DIR, SAMPLE_SUBM, TRAIN_DICT_DIR)
train, test, sample_subm, train_dict = loaded_frames

# Independent copy of the training frame, kept aside for catboost
train_cb = train.copy()

In [None]:
# Split label and features from the untouched copy of the raw training frame
# (`train_cb`) instead of from `train` itself. Reading from `train` made this
# cell fail when run a second time, because `Stay` had already been dropped.
target = train_cb.loc[:, ['Stay']]
train = train_cb.drop(columns=['Stay'])

In [None]:
# Columns that are one-hot encoded when preparing the neural-network input
cat_columns = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'City_Code_Patient',
]

In [None]:
# Bucket labels '0-10', '11-20', ..., '91-100' used by both the Age and Stay columns
_ten_unit_buckets = ['0-10'] + [f'{start}-{start + 9}' for start in range(11, 100, 10)]

# Age bucket -> ordinal code (0..9)
age_encode = {bucket: code for code, bucket in enumerate(_ten_unit_buckets)}

# Stay uses the same ten buckets plus one open-ended final class (code 10)
stay_encode = {**age_encode, 'More than 100 Days': len(age_encode)}

# Type of admission -> integer code
admission_type_encode = dict(zip(('Trauma', 'Emergency', 'Urgent'), range(3)))

# Severity of illness -> integer code
illness_encode = dict(zip(('Minor', 'Moderate', 'Extreme'), range(3)))

In [None]:
# Each mapping assigns consecutive integer codes in the order the labels are listed.

# NOTE: 'e' is listed before 'd', so e -> 3 and d -> 4.
hospital_type_code_encode = {label: code for code, label in enumerate('abcedfg')}

hospital_region_encode = {label: code for code, label in enumerate('XYZ')}

department_encode = {
    label: code
    for code, label in enumerate(
        ['anesthesia', 'gynecology', 'radiotherapy', 'surgery', 'TB & Chest disease']
    )
}

# Codes for the Ward_Type column (the variable name keeps its historical spelling)
word_type_encode = {label: code for code, label in enumerate('PQRSTU')}

ward_facility_code_encode = {label: code for code, label in enumerate('ABCDEF')}

In [None]:
# preprocess for neural networks
def preprocess_nn(df):
    '''
    Builds the neural-network feature frame from a raw data frame

    Missing values are imputed, the columns listed in `cat_columns` are one-hot
    encoded, the admission type / severity / age text columns are mapped to
    integer codes, and the `case_id` and `patientid` columns are dropped.
    The frame passed in is not modified.

            Parameters:
                    df (dataframe): Raw frame to transform

            Returns:
                    features (dataframe): Transformed feature frame
    '''
    df = df.copy()

    # The median is always taken from the global `train` frame, so any frame
    # passed in is imputed with the training statistic. Plain assignment is used
    # because `fillna(..., inplace=True)` on a selected column is chained
    # assignment and is not guaranteed to write back into `df`.
    df['Bed Grade'] = df['Bed Grade'].fillna(train['Bed Grade'].median())

    def _fill_with_group_mode(values):
        # A group without any observed value has nothing to impute from:
        # leave it unchanged instead of raising IndexError on an empty index.
        counts = values.value_counts()
        if counts.empty:
            return values
        return values.fillna(counts.index[0])

    # `transform` returns a result aligned with the original row index, which
    # `groupby(...).apply` does not guarantee across pandas versions.
    df['City_Code_Patient'] = (
        df.groupby(['City_Code_Hospital', 'Hospital_type_code', 'Department'], sort=False)['City_Code_Patient']
          .transform(_fill_with_group_mode)
    )

    df = df.astype({column: 'category' for column in cat_columns})
    df_one_hot = pd.get_dummies(df, columns = cat_columns)

    df_one_hot['Type of Admission'] = df_one_hot['Type of Admission'].map(admission_type_encode)
    df_one_hot['Severity of Illness'] = df_one_hot['Severity of Illness'].map(illness_encode)
    df_one_hot['Age'] = df_one_hot['Age'].map(age_encode)
    # NOTE(review): despite the column name, this is the number of non-null
    # deposits per patient ('count'), not their mean. Left unchanged; confirm intent.
    df_one_hot['patient_deposit_mean'] = df.groupby(['patientid'])['Admission_Deposit'].transform('count')

    return df_one_hot.drop(['case_id', 'patientid'], axis=1)

In [None]:
def groupby_features(df, col1, col2, fe_stats = ('mean','max','min', 'sum','count', 'nunique', 'std')):
    '''
    Adds group-level statistics of `col2` (grouped by `col1`) as new columns

    For every statistic three columns are added: the per-group value
    (`{col1}_{col2}_{stat}`), its difference to the row's own `col2`
    (`..._diff`) and its ratio to the row's own `col2` (`..._div`).
    A row whose `col2` is 0 gets inf/NaN in the `_div` columns.

            Parameters:
                    df (dataframe): Input frame, left unmodified
                    col1 (str): Column to group by
                    col2 (str): Column the statistics are computed on
                    fe_stats (iterable of str): Aggregations understood by
                    `GroupBy.transform`; repeated names are computed only once

            Returns:
                    fe_df (dataframe): Copy of `df` with the new columns appended
    '''
    fe_df = df.copy()
    # The grouping does not depend on the statistic, so build it once
    grouped = df.groupby([col1])[col2]
    # dict.fromkeys removes repeated statistics while keeping their order;
    # a repeat would only recompute and overwrite identical columns
    for stat in dict.fromkeys(fe_stats):
        stat_col = f'{col1}_{col2}_{stat}'
        fe_df[stat_col] = grouped.transform(stat)
        fe_df[f'{stat_col}_diff'] = fe_df[stat_col] - fe_df[col2]
        fe_df[f'{stat_col}_div'] = fe_df[stat_col] / fe_df[col2]
    return fe_df

In [None]:
def preprocess_cb(df):
    '''
    Builds the catboost feature frame from a raw data frame

    Text columns are mapped to integer codes, group statistics are added
    through `groupby_features`, and the `case_id` column is dropped.
    The frame passed in is not modified.

            Parameters:
                    df (dataframe): Raw frame to transform

            Returns:
                    features (dataframe): Encoded frame with the added group features
    '''
    # Work on a copy so the caller's frame keeps its original text values
    fe_train = df.copy()

    # Text label -> integer code, one mapping per column
    column_encoders = {
        'Hospital_type_code': hospital_type_code_encode,
        'Hospital_region_code': hospital_region_encode,
        'Department': department_encode,
        'Ward_Type': word_type_encode,
        'Ward_Facility_Code': ward_facility_code_encode,
        'Type of Admission': admission_type_encode,
        'Severity of Illness': illness_encode,
        'Age': age_encode,
    }
    for column, encoder in column_encoders.items():
        fe_train[column] = fe_train[column].map(encoder)

    # (group column, value column) pairs. Each pair appears once: running the
    # 'Bed Grade' pass a second time only recomputed identical columns.
    group_value_pairs = [
        ('patientid', 'Admission_Deposit'),
        ('Severity of Illness', 'Admission_Deposit'),
        ('Type of Admission', 'Admission_Deposit'),
        ('Bed Grade', 'Admission_Deposit'),
        ('Hospital_code', 'Admission_Deposit'),
        ('patientid', 'Visitors with Patient'),
    ]
    for group_col, value_col in group_value_pairs:
        fe_train = groupby_features(fe_train, group_col, value_col)

    return fe_train.drop(['case_id'], axis=1)

In [None]:
X_nn = preprocess_nn(train.copy())
test_nn = preprocess_nn(test.copy())

# One-hot encoding is done on each frame separately, so a category missing from
# one of them would give the two frames different columns. Align the test frame
# to the training columns (absent dummies become 0) before scaling.
test_nn = test_nn.reindex(columns=X_nn.columns, fill_value=0)

scaler = StandardScaler()
X_nn = scaler.fit_transform(X_nn)
X_nn = pd.DataFrame(X_nn)

test_nn = scaler.transform(test_nn)
test_nn = pd.DataFrame(test_nn)

# used for splitting using StratifiedKFold
# StratifiedKFold cannot split with one-hot encoded labels
skf_y = target['Stay'].map(stay_encode)

# Explicit float dtype: the default dummy dtype depends on the pandas version,
# and frames derived from `y` later receive float probabilities.
y = pd.get_dummies(target, columns=['Stay'], dtype=float)

In [None]:
def improved_nn(num_cols):
    '''
    Builds and compiles the feed-forward classifier used as first-level model

    The network has three weight-normalised dense layers (256 and 128 relu
    units, then 11 softmax outputs), each preceded by batch normalisation and
    dropout of 0.4.

            Parameters:
                    num_cols (int): Number of input features

            Returns:
                    model (Sequential): Model compiled with categorical
                    cross-entropy, a Lookahead-wrapped Adam optimizer and accuracy
    '''
    keras_layers = tf.keras.layers

    stack = [keras_layers.Input(num_cols)]
    # (units, activation) of each dense layer, in order
    for units, activation in ((256, 'relu'), (128, 'relu'), (11, 'softmax')):
        stack.append(keras_layers.BatchNormalization())
        stack.append(keras_layers.Dropout(0.4))
        dense = keras_layers.Dense(units, activation=activation)
        stack.append(tfa.layers.WeightNormalization(dense))

    model = tf.keras.Sequential(stack)
    optimizer = tfa.optimizers.Lookahead(tf.optimizers.Adam())
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [None]:
def run_final_nn(X, y, test):
    '''
    Trains the first-level neural network with stratified K-fold cross-validation,
    repeated N_STARTS times, and collects its predictions

            Parameters:
                    X (dataframe): Training features
                    y (dataframe): One-hot encoded targets, one column per class
                    test (dataframe or array): Test features, same column layout as X

            Returns:
                    cv_result (dataframe): Out-of-fold class probabilities averaged
                    over the restarts, with the index and columns of `y`
                    test_result (array): Test class probabilities averaged over
                    every trained model
    '''
    N_SPLITS = 5
    N_STARTS = 3

    x_all = X.values
    y_all = y.values
    test_all = np.asarray(test)
    n_classes = y_all.shape[1]

    # StratifiedKFold cannot split on one-hot labels, so recover the class index
    # from `y` itself rather than depending on a label series defined elsewhere.
    strat_labels = y_all.argmax(axis=1)

    # Float accumulators: adding probabilities into a zeroed copy of the one-hot
    # frame would depend on that frame's integer/bool dtype.
    oof_sum = np.zeros((len(y_all), n_classes))
    test_result = np.zeros((len(test_all), n_classes))

    skf = StratifiedKFold(n_splits = N_SPLITS, random_state = RANDOM_STATE, shuffle = True)

    for seed in range(N_STARTS):
        # The same folds are reused for every restart; restarts differ only in
        # the network's random initialisation and training randomness.
        for n, (train_ind, val_ind) in enumerate(skf.split(x_all, strat_labels)):

            print(f'Seed: {seed} ------------- Fold:{n}')

            x_tr, x_val = x_all[train_ind], x_all[val_ind]
            y_tr, y_val = y_all[train_ind], y_all[val_ind]

            model = improved_nn(X.shape[1])

            checkpoint_path = f'Seed:{seed}-Fold:{n}.hdf5'

            reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5,
                                               patience=3, verbose=0, mode='min')

            cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss',
                                         verbose = 0, save_best_only = True,
                                         save_weights_only = True, mode = 'min')

            early = EarlyStopping(monitor="val_loss", mode="min",
                                  restore_best_weights=True,
                                  patience= 5, verbose = 0)

            model.fit(x_tr, y_tr,
                      validation_data=(x_val, y_val),
                      epochs=50,
                      batch_size=128,
                      callbacks=[reduce_lr_loss, cb_checkpt, early],
                      verbose=2)

            # Predict with the weights of the best validation-loss epoch
            model.load_weights(checkpoint_path)

            oof_sum[val_ind] += model.predict(x_val)
            test_result += model.predict(test_all)

            # Release the model and the Keras session state before the next fold
            K.clear_session()
            del model
            gc.collect()

    # Each row is validated once per restart; the test set is scored by every model
    cv_result = pd.DataFrame(oof_sum / N_STARTS, index=y.index, columns=y.columns)
    test_result /= (N_STARTS*N_SPLITS)
    return cv_result, test_result

In [None]:
# Train the neural network and keep its cross-validation and test predictions
nn_cv_result, nn_test_result = run_final_nn(X_nn, y, test_nn)

In [None]:
# Catboost is trained on integer class labels rather than one-hot targets
y_cb = target.assign(Stay=target['Stay'].map(stay_encode))

# Train and test features are built separately, each from its own copy
X_cb, test_cb = (preprocess_cb(frame.copy()) for frame in (train, test))

In [None]:
# Positional indices of the columns passed to catboost as categorical features
# NOTE(review): these depend on the column order produced by preprocess_cb — verify after any change there
cat_features = [0,1,2,3,5,6,7,9]

def run_final_cb(X, y, test):
    '''
    Trains catboost with stratified K-fold cross-validation, repeated with
    N_STARTS different seeds, and collects its predictions

            Parameters:
                    X (dataframe): Training features
                    y (dataframe): Integer class labels in a `Stay` column
                    test (dataframe): Test features, same column layout as X

            Returns:
                    cv_result (dataframe): Out-of-fold class probabilities averaged
                    over the seeds, one `Stay_<class>` column per class, indexed like `y`
                    test_result (array): Test class probabilities averaged over
                    every trained model
    '''
    N_FOLDS = 5
    N_STARTS = 3

    # Column labels of the result come from the one-hot layout of `y`
    class_columns = pd.get_dummies(y, columns=['Stay']).columns
    n_classes = len(class_columns)

    # Float accumulators, filled by position (the fold indices are positional)
    oof_sum = np.zeros((len(y), n_classes))
    test_result = np.zeros((len(test), n_classes))

    labels = y['Stay'].values

    for seed in range(N_STARTS):
        # Both the fold assignment and the model seed change with every restart
        folds = StratifiedKFold(n_splits=N_FOLDS, random_state=seed, shuffle=True)
        for n, (train_idx, val_idx) in enumerate(folds.split(labels, labels)):
            print(f'seed: {seed} ------ fold: {n}')
            x_tr, x_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = CatBoostClassifier(learning_rate=0.073,
                                       n_estimators=10000,
                                       depth=8,
                                       bagging_temperature=0.3,
                                       task_type="GPU",
                                       custom_metric='Accuracy',
                                       eval_metric='Accuracy',
                                       random_seed=seed)

            model.fit(x_tr, y_tr, cat_features=cat_features, eval_set=(x_val, y_val),
                      early_stopping_rounds=100, verbose=200)
            oof_sum[val_idx] += model.predict_proba(x_val)
            test_result += model.predict_proba(test)
            del model

    # Each row is validated once per seed; the test set is scored by every model.
    # The fold count is N_FOLDS here (the name N_SPLITS does not exist in this function).
    cv_result = pd.DataFrame(oof_sum / N_STARTS, index=y.index, columns=class_columns)
    test_result /= (N_STARTS*N_FOLDS)
    return cv_result, test_result

In [None]:
# Train catboost and keep its cross-validation and test predictions
catboost_cv_result, catboost_test_result = run_final_cb(X_cb, y_cb, test_cb)

# Stacking

In [None]:
def concat_results(nn_result, cb_result):
    '''
    Places the two models' class probabilities side by side

            Parameters:
                    nn_result (array-like): 11 columns of neural-network output
                    cb_result (array-like): 11 columns of catboost output

            Returns:
                    stacked (dataframe): 22 columns; 0-10 hold `nn_result`,
                    11-21 hold `cb_result`
    '''
    nn_frame = pd.DataFrame(nn_result, columns=list(range(0, 11)))
    cb_frame = pd.DataFrame(cb_result, columns=list(range(11, 22)))
    return pd.concat([nn_frame, cb_frame], axis=1)

In [None]:
# Inputs of the stacking model: both models' predictions side by side,
# built once from the cross-validation results and once from the test results
cv_concat = concat_results(nn_cv_result.to_numpy(), catboost_cv_result.to_numpy())
test_results_concat = concat_results(nn_test_result, catboost_test_result)

In [None]:
def stack_nn(num_cols):
    '''
    Builds and compiles the small dense network used as the stacking model

            Parameters:
                    num_cols (int): Number of input features

            Returns:
                    model (Sequential): Two relu layers (64 and 32 units) and an
                    11-unit softmax output, compiled with categorical
                    cross-entropy, Adam and accuracy
    '''
    dense = tf.keras.layers.Dense

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(num_cols))
    for units in (64, 32):
        model.add(dense(units, activation="relu"))
    model.add(dense(11, activation="softmax"))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
def run_stacking(X, y, test):
    '''
    Trains the stacking network with stratified K-fold cross-validation,
    repeated N_SEEDS times, and collects its predictions

            Parameters:
                    X (dataframe): Training features for the stacking model
                    y (dataframe): One-hot encoded targets, one column per class
                    test (dataframe or array): Test features, same column layout as X

            Returns:
                    cv_result (dataframe): Out-of-fold class probabilities averaged
                    over the restarts, with the index and columns of `y`
                    test_result (array): Test class probabilities averaged over
                    every trained model
    '''
    N_SEEDS = 3
    N_FOLDS = 5

    x_all = X.values
    y_all = y.values
    test_all = np.asarray(test)
    n_classes = y_all.shape[1]

    # StratifiedKFold cannot split on one-hot labels, so recover the class index
    # from `y` itself rather than depending on a label series defined elsewhere.
    strat_labels = y_all.argmax(axis=1)

    # Float accumulators, independent of the dtype of the one-hot frame
    oof_sum = np.zeros((len(y_all), n_classes))
    test_result = np.zeros((len(test_all), n_classes))

    # The splitter is created here: no `skf` exists at module level, and the
    # loop bounds use this function's own N_SEEDS / N_FOLDS constants.
    skf = StratifiedKFold(n_splits = N_FOLDS, random_state = RANDOM_STATE, shuffle = True)

    for seed in range(N_SEEDS):
        for n, (train_ind, val_ind) in enumerate(skf.split(x_all, strat_labels)):

            print(f'Seed: {seed} ------------- Fold:{n}')

            x_tr, x_val = x_all[train_ind], x_all[val_ind]
            y_tr, y_val = y_all[train_ind], y_all[val_ind]

            model = stack_nn(X.shape[1])

            checkpoint_path = f'Stacking----Seed:{seed}-Fold:{n}.hdf5'

            reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, min_lr=1e-5,
                                               patience=3, verbose=0, mode='min')

            cb_checkpt = ModelCheckpoint(checkpoint_path, monitor = 'val_loss',
                                         verbose = 0, save_best_only = True,
                                         save_weights_only = True, mode = 'min')

            early = EarlyStopping(monitor="val_loss", mode="min",
                                  restore_best_weights=True,
                                  patience= 5, verbose = 0)

            model.fit(x_tr, y_tr,
                      validation_data=(x_val, y_val),
                      epochs=50,
                      batch_size=128,
                      callbacks=[reduce_lr_loss, cb_checkpt, early],
                      verbose=2)

            # Predict with the weights of the best validation-loss epoch
            model.load_weights(checkpoint_path)

            oof_sum[val_ind] += model.predict(x_val)
            test_result += model.predict(test_all)

            # Release the model and the Keras session state before the next fold
            K.clear_session()
            del model
            gc.collect()

    # Each row is validated once per restart; the test set is scored by every model
    cv_result = pd.DataFrame(oof_sum / N_SEEDS, index=y.index, columns=y.columns)
    test_result /= (N_SEEDS*N_FOLDS)
    return cv_result, test_result
        

In [None]:
# Train the stacking model on the concatenated predictions of the two base models
cv_result, test_result = run_stacking(cv_concat, y, test_results_concat)