<a href="https://colab.research.google.com/github/elvisbui/Predicting-Length-of-Stay/blob/master/Improving_Catboost_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Improving the Model

### Load Libraries 
The first step is to load the libraries we will be using. 

In [None]:
import math
import pickle
import optuna

# import data processing and linear algebra libraries 
import pandas as pd
import numpy as np


# import data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Seed reused by the train/validation split, the CV fold shuffling and the
# CatBoost models in run_clf so that runs are repeatable.
RANDOM_STATE = 24

### Load Data
Next, we locate the data files and load them.

In [None]:
# Paths of the competition CSV files under the Kaggle input directory.
# Despite the *_DIR suffix, each constant is the path of a single file.
TRAIN_DIR = '../input/av-healthcare-analytics-ii/healthcare/train_data.csv'
TEST_DIR = '../input/av-healthcare-analytics-ii/healthcare/test_data.csv'
SAMPLE_SUBM = '../input/av-healthcare-analytics-ii/healthcare/sample_sub.csv'
TRAIN_DICT_DIR = '../input/av-healthcare-analytics-ii/healthcare/train_data_dictionary.csv'

def read_csv(*paths: str) -> tuple:
    '''
    Read every CSV path given and return the loaded frames as a tuple.

            Parameters:
                    *paths (str): CSV file paths, read in the order given

            Returns:
                    frames (tuple of DataFrame): one DataFrame per path, in the
                    same order as `paths`; an empty tuple when no path is given
    '''
    return tuple(pd.read_csv(path) for path in paths)

# Load the four CSV files in one call; read_csv returns the frames in the
# same order as the paths it was given.
train, test, sample_subm, train_dict = read_csv(TRAIN_DIR, TEST_DIR, SAMPLE_SUBM, TRAIN_DICT_DIR)

# Preprocessing

In [None]:
# Label encoders: each dict maps a raw category label to an integer code.
# The code of a label is its position in the tuple it is built from.

stay_encode = {label: code for code, label in enumerate((
    '0-10',
    '11-20',
    '21-30',
    '31-40',
    '41-50',
    '51-60',
    '61-70',
    '71-80',
    '81-90',
    '91-100',
    'More than 100 Days',
))}

# Note the order: 'e' is coded 3 and 'd' is coded 4.
hospital_type_code_encode = {label: code for code, label in enumerate((
    'a', 'b', 'c', 'e', 'd', 'f', 'g',
))}

hospital_region_encode = {label: code for code, label in enumerate((
    'X', 'Y', 'Z',
))}

department_encode = {label: code for code, label in enumerate((
    'anesthesia',
    'gynecology',
    'radiotherapy',
    'surgery',
    'TB & Chest disease',
))}

# Encoder for the 'Ward_Type' column (the variable name keeps its original spelling).
word_type_encode = {label: code for code, label in enumerate((
    'P', 'Q', 'R', 'S', 'T', 'U',
))}

ward_facility_code_encode = {label: code for code, label in enumerate((
    'A', 'B', 'C', 'D', 'E', 'F',
))}

admission_type_encode = {label: code for code, label in enumerate((
    'Trauma', 'Emergency', 'Urgent',
))}

# Encoder for the 'Severity of Illness' column.
# 'Moderate ' (with a trailing space) is kept so data carrying that exact label
# still maps; the stripped spelling is accepted as well, because Series.map
# turns any label without an entry into NaN.
illness_encode = {'Minor': 0,
                  'Moderate': 1,
                  'Moderate ': 1,
                  'Extreme': 2}

# Encoder for the 'Age' column: each ten-year bracket is coded by its position.
age_encode = {bracket: code for code, bracket in enumerate((
    '0-10',
    '11-20',
    '21-30',
    '31-40',
    '41-50',
    '51-60',
    '61-70',
    '71-80',
    '81-90',
    '91-100',
))}

In [None]:
# Replace the raw labels in `train` with their integer codes, column by column.
# Series.map yields NaN for any label that has no entry in its encoder.
_column_encoders = (
    ('Stay', stay_encode),
    ('Hospital_type_code', hospital_type_code_encode),
    ('Hospital_region_code', hospital_region_encode),
    ('Department', department_encode),
    ('Ward_Type', word_type_encode),
    ('Ward_Facility_Code', ward_facility_code_encode),
    ('Type of Admission', admission_type_encode),
    ('Severity of Illness', illness_encode),
    ('Age', age_encode),
)
for _column, _encoder in _column_encoders:
    train[_column] = train[_column].map(_encoder)

# Feature Engineering

For feature engineering, I like to create as many features as I can. This is because a poor feature might perform well when combined with another poor feature. 

From exploring the data, we know that there are entries with the same patients. We create features by grouping by patient ID and finding the mean, max, and min of the other features. 

In [None]:
def groupby_features(df, col1, col2, fe_stats=('mean', 'max', 'min', 'sum', 'count', 'nunique', 'std')):
    '''
    Add group-level statistics of one column, plus their difference and ratio
    to the row's own value, as new columns.

            Parameters:
                    df (DataFrame): input frame; it is not modified
                    col1 (str): column to group by
                    col2 (str): column the statistics are computed on
                    fe_stats (iterable): aggregation names accepted by
                        groupby(...).transform; repeated names are computed once

            Returns:
                    fe_df (DataFrame): copy of `df` with, for every statistic,
                        the columns '{col1}_{col2}_{stat}',
                        '{col1}_{col2}_{stat}_diff' (statistic minus col2) and
                        '{col1}_{col2}_{stat}_div' (statistic divided by col2)
    '''
    fe_df = df.copy()
    # Build the grouping once; every statistic reuses it.
    grouped = df.groupby([col1])[col2]
    # dict.fromkeys drops repeated statistics while keeping their order, so no
    # column is computed (and overwritten) twice.
    for stat in dict.fromkeys(fe_stats):
        stat_col = f'{col1}_{col2}_{stat}'
        fe_df[stat_col] = grouped.transform(stat)
        fe_df[f'{stat_col}_diff'] = fe_df[stat_col] - fe_df[col2]
        # Rows where col2 is 0 get inf/NaN here, following pandas division rules.
        fe_df[f'{stat_col}_div'] = fe_df[stat_col] / fe_df[col2]
    return fe_df


In [None]:
# Build the engineered frame from a copy of `train`: for each
# (grouping column, value column) pair, groupby_features appends the group
# statistics of the value column together with their diff/ratio columns.
# Each pair is listed once; repeating a pair would only recompute and
# overwrite the same columns.
fe_train = train.copy()
for _group_col, _value_col in (
    ('patientid', 'Admission_Deposit'),
    ('Severity of Illness', 'Admission_Deposit'),
    ('Type of Admission', 'Admission_Deposit'),
    ('Bed Grade', 'Admission_Deposit'),
    ('Hospital_code', 'Admission_Deposit'),
    ('patientid', 'Visitors with Patient'),
):
    fe_train = groupby_features(fe_train, _group_col, _value_col)

We can do this type of feature engineering with the other types of data. We first group by a categorical feature, pick a numerical feature, and compute some statistics to create a new feature. 

In [None]:
# Columns used as grouping keys when generating group-statistic features.
cat_feats = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'patientid',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
]

# Columns whose values are aggregated within each group.
num_feats = [
    'Available Extra Rooms in Hospital',
    'Bed Grade',
    'Visitors with Patient',
    'Age',
    'Admission_Deposit',
]

Below is the code to create the new features. But since Kaggle notebooks have limited RAM and space, I will only be using the previous features I created. 

In [None]:

# Generate group-statistic features for every (categorical, numerical) pair.
# NOTE(review): this rebinds fe_train to a fresh copy of `train`, so the frame
# built by the earlier feature cell is replaced when this cell runs — confirm
# that is intended given the memory remark in the text above.
fe_train = train.copy()

_feature_pairs = [(cat, num) for cat in cat_feats for num in num_feats]
for cat, num in _feature_pairs:
    fe_train = groupby_features(fe_train, cat, num)


In [None]:
# Number of columns in the engineered training frame.
len(fe_train.columns)

# Feature Selection
Now that we have lots of features, it is time to select the best ones. There are many methods for feature selection. Here are the steps I will take to select the right features:

1. Get feature importances of each feature
2. remove feature with lowest importances
3. run k-fold cross validation on the dataset with the feature removed
4. if cross validation improves, remove the next least important feature
5. if cross validation does not improve, add the feature back in and remove the next least important feature
6. run cross validation again and repeat steps 2-5 until we tested all features

In [None]:
# Target (kept as a one-column DataFrame) and feature matrix; the row id
# 'case_id' is not used as a feature.
y = fe_train.loc[:, ['Stay']]
X = fe_train.drop(['Stay','case_id'], axis=1)

# Categorical columns handed to CatBoost. They are taken from positions
# 0,1,2,3,5,6,7,9 of the full feature matrix but stored by NAME: positional
# indices would silently point at different columns as soon as a feature is
# dropped from X (as the feature-selection step does), whereas names keep
# referring to the same columns.
cat_features = [X.columns[i] for i in (0, 1, 2, 3, 5, 6, 7, 9)]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE)

### First step is to get the feature importances. I will use catboost to get feature importance. 

In [None]:
# clf = CatBoostClassifier(task_type="GPU", custom_metric='Accuracy', eval_metric='Accuracy', random_seed=RANDOM_STATE)
# history = clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_val, y_val), plot=True, early_stopping_rounds=100)

In [None]:
# let's save the feature importances for later use
# File name intended for the pickled importances. The same variable name is
# rebound to the loaded importances by the loading cell that follows.
feature_importances = "feature_importances.pkl"  
# The triple-quoted string below is a disabled snippet, not executed code.
# NOTE(review): as written it passes history.feature_importances_ to open()
# and dumps `fi`, which is not defined in this cell — fix both before
# re-enabling it.
'''
with open(history.feature_importances_, 'wb') as file:  
    pickle.dump(fi, file)
'''

In [None]:
# Load feature importances
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
with open('../input/features-importance-of-health-care-stay/feature_importances.pkl', 'rb') as file:  
    feature_importances = pickle.load(file)

In [None]:
# Display the loaded feature importances.
feature_importances

In [None]:
# Position of the smallest value among the loaded importances.
np.argmin(feature_importances)

### K-Fold cross validation 

For the validation scheme, I will use stratified k-fold cross validation with bagging to reduce randomness. 


In [None]:
def run_clf(X, y, params=None):
    '''
    Cross-validate a CatBoost classifier with stratified K-fold, optionally
    repeated over several seeds.

            Parameters:
                    X (DataFrame): feature matrix
                    y (DataFrame or Series): target labels, row-aligned with X
                    params (dict, optional): extra CatBoostClassifier keyword
                        arguments; None means no extra arguments

            Returns:
                    train_score (float): accuracy on each fold's training part,
                        averaged over all folds and seeds
                    val_score (float): accuracy on each fold's validation part,
                        averaged over all folds and seeds
                    history: the object returned by the last model.fit call
                        (None if no fold was run)
    '''
    N_FOLDS = 2
    N_STARTS = 1
    n_runs = N_FOLDS * N_STARTS

    # Copy so the caller's dict is never shared or mutated.
    params = {} if params is None else dict(params)

    # Categorical features given by name are restricted to the columns that
    # are actually present in this X; positional indices are passed through.
    fold_cat_features = [feat for feat in cat_features
                         if not isinstance(feat, str) or feat in X.columns]

    train_score = 0
    val_score = 0
    history = None

    for seed in range(N_STARTS):
        # Offsetting by `seed` gives every repeat its own shuffle and model
        # seed; with seed == 0 this is exactly RANDOM_STATE.
        run_seed = RANDOM_STATE + seed
        folds = StratifiedKFold(n_splits=N_FOLDS, random_state=run_seed, shuffle=True)
        for n, (train_idx, val_idx) in enumerate(folds.split(X, y)):
            print(f'seed: {seed} ------ fold: {n}')
            x_tr, x_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = CatBoostClassifier(**params, task_type="GPU", custom_metric='Accuracy', eval_metric='Accuracy',
                                       random_seed=run_seed)
            history = model.fit(x_tr, y_tr, cat_features=fold_cat_features, eval_set=(x_val, y_val),
                                early_stopping_rounds=100, verbose=5)
            # Score on the rows this fold was trained on, so the score always
            # matches the columns and rows the model actually saw.
            train_score += model.score(x_tr, y_tr) / n_runs
            val_score += model.score(x_val, y_val) / n_runs

    return train_score, val_score, history


In [None]:
# Baseline cross-validation on all features, passing no extra hyperparameters.
train_score, val_score, history = run_clf(X, y)

The function below will run CatBoost on all features, get the feature importances, sort the features by importance, remove the least important feature and run CatBoost again on the new dataset. If the cross validation score improves by removing the feature, we will remove the next least important feature and repeat the process again. If the cross validation score decreases, then we know that the feature boosts the score and should not be removed. After testing all the features, the function returns the list of all important features. 

In [None]:
def reduce_features(train, target):
    '''
    Backward feature elimination driven by cross-validation score.

    A baseline run on all features provides the feature importances. Features
    are then tried for removal from least to most important: a feature is
    dropped for good when the validation score without it is not lower than
    the best score so far (ties drop the feature), otherwise it is kept.

            Parameters:
                    train (DataFrame): feature matrix; it is not modified
                    target (DataFrame or Series): labels; it is not modified

            Returns:
                    columns (Index): names of the features that were kept
    '''
    X = train.copy()
    y = target.copy()

    _, prev_cv, baseline = run_clf(X, y)

    # The ranking comes from the baseline run only and is not recomputed after
    # a drop. sorted() is stable, so equally important features keep their
    # column order.
    importances = baseline.feature_importances_
    ranked = sorted(range(len(importances)), key=lambda pos: importances[pos])

    # Column names as they were before any removal, so ranked positions stay valid.
    feats = X.columns

    for idx in tqdm(ranked):
        if X.shape[1] <= 1:
            # Nothing left to train on if the last column were removed.
            print('Only one feature left, stopping')
            break

        print(f'Checking feature: {feats[idx]}')
        X_test = X.drop(feats[idx], axis=1)
        _, curr_cv, _ = run_clf(X_test, y)

        print(f'Previous CV: {prev_cv} ---- Current CV: {curr_cv}')
        if curr_cv < prev_cv:
            print('Feature Keep')
        else:
            prev_cv = curr_cv
            X = X_test
            print('Feature Dropped')

    return X.columns

In [None]:
# Run the feature elimination on the full feature matrix; the result is the
# set of column names that survived.
important_features = reduce_features(X, y)

# Hyperparameter Tuning

To find the right hyperparameters, we can use a grid search, but grid searches can take a lot of time to run. We can also use a random search. It will not find the most optimal hyperparameters, but it will be a lot faster than a grid search. We can also use Bayesian optimization or a hyperparameter optimization framework. 

For tuning, I will be using Optuna. 

Here are the most common hyperparameters to tune for CatBoost. 
1. Number of trees: iterations
    - The more trees we have, we run the risk of overfitting. 
2. Learning rate: learning_rate
    - The higher the learning rate, the faster the model fits our training data.
3. Tree depth: depth
    - A high number for tree depth can overfit our data. 
4. Regularizer Coefficient
5. Random strength 
6. Bagging temperature
7. Column Subsample




For hyperparameter tuning, I like to manually pick hyperparameters that would overfit the training data and then use a framework to search for better hyperparameters. 

In [None]:

def objective(trial):
    '''
    Optuna objective: sample CatBoost hyperparameters for one trial and return
    the cross-validated validation score from run_clf on the global X and y.
    '''
    # Values are sampled in this fixed order.
    param = {}
    param["iterations"] = trial.suggest_int("iterations", 300, 1000)
    param["learning_rate"] = trial.suggest_float("learning_rate", .01, .3)
    param["depth"] = trial.suggest_int("depth", 1, 12)
    # NOTE(review): CatBoost documents num_leaves (max_leaves) for the
    # Lossguide grow policy, which is not set here — confirm it is accepted.
    param['num_leaves'] = trial.suggest_int("num_leaves", 25, 100)
    param['bagging_temperature'] = trial.suggest_float("bagging_temperature", .1, .5)

    # run_clf returns (train score, validation score, fitted model); only the
    # validation score is the trial's value.
    _, cv_score, _ = run_clf(X, y, param)
    return cv_score

# The objective returns cross-validated accuracy, so the study has to
# maximize it; optuna.create_study() minimizes unless told otherwise.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

study.best_params  

In [None]:
# Hyperparameters of the best trial found by the study.
study.best_params  

In [None]:
# One-hot view of the target, kept under its own name. `y` must remain the
# single 'Stay' label column: the final-model cells pass it to
# StratifiedKFold / CatBoost and call pd.get_dummies(y, columns=['Stay']) on
# it again, which fails once the 'Stay' column has been replaced by dummies.
y_onehot = pd.get_dummies(y, columns=['Stay'])

# Improved Model
Now that we have our features and the right hyperparameters, we can create our improved model. 

In [None]:
def run_final_cb(X, y, params=None):
    '''
    Train the final CatBoost model with repeated stratified K-fold and return
    the averaged out-of-fold class probabilities.

            Parameters:
                    X (DataFrame): feature matrix
                    y (DataFrame): target with a 'Stay' label column,
                        row-aligned with X
                    params (dict, optional): CatBoostClassifier arguments that
                        override the built-in defaults (learning_rate=0.073,
                        n_estimators=10000, depth=8, bagging_temperature=0.3);
                        task_type, the metrics and the seed stay fixed

            Returns:
                    cv_result (DataFrame): one row per row of y and one column
                        per pd.get_dummies(y, columns=['Stay']) column, holding
                        the out-of-fold probabilities averaged over the seeds
    '''
    N_FOLDS = 5
    N_STARTS = 3

    model_params = {
        'learning_rate': 0.073,
        'n_estimators': 10000,
        'depth': 8,
        'bagging_temperature': 0.3,
    }
    if params:
        # CatBoost rejects a parameter supplied under two spellings, so a
        # default is removed when the caller passes any synonym of it.
        synonym_groups = (
            {'n_estimators', 'iterations', 'num_boost_round', 'num_trees'},
            {'learning_rate', 'eta'},
            {'depth', 'max_depth'},
        )
        for group in synonym_groups:
            if group & params.keys():
                for key in group:
                    model_params.pop(key, None)
        model_params.update(params)

    fixed_params = {
        'task_type': "GPU",
        'custom_metric': 'Accuracy',
        'eval_metric': 'Accuracy',
    }

    # Categorical features given by name are restricted to the columns present
    # in this X; positional indices are passed through unchanged.
    fold_cat_features = [feat for feat in cat_features
                         if not isinstance(feat, str) or feat in X.columns]

    # The one-hot frame only supplies the output layout (index and columns).
    onehot = pd.get_dummies(y, columns=['Stay'])
    # Float accumulator addressed by row POSITION, matching the positional
    # indices that StratifiedKFold.split yields.
    oof = np.zeros(onehot.shape, dtype=float)

    for seed in range(N_STARTS):
        folds = StratifiedKFold(n_splits=N_FOLDS, random_state=seed, shuffle=True)
        for n, (train_idx, val_idx) in enumerate(folds.split(X, y)):
            print(f'seed: {seed} ------ fold: {n}')
            x_tr, x_val = X.iloc[train_idx], X.iloc[val_idx]
            y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = CatBoostClassifier(**{**model_params, **fixed_params, 'random_seed': seed})
            model.fit(x_tr, y_tr, cat_features=fold_cat_features, eval_set=(x_val, y_val),
                      early_stopping_rounds=100, verbose=200)
            oof[val_idx] += model.predict_proba(x_val)
            # Release the fitted model before the next fold is trained.
            del model

    # Every row is in the validation part of exactly one fold per seed, so it
    # has been accumulated N_STARTS times (not N_FOLDS * N_STARTS times).
    return pd.DataFrame(oof / N_STARTS, index=onehot.index, columns=onehot.columns)

In [None]:
# Out-of-fold class probabilities from the final model, scored with
# multi-class log loss against the one-hot encoded target.
res = run_final_cb(X, y)
_y_true_onehot = pd.get_dummies(y, columns=['Stay'])
log_loss(_y_true_onehot, res)