In [1]:
print('Import essential packages.')

import pandas as pd
import numpy as np
import os

## supress warnings
import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

import time

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sys, os
import math
from IPython.utils import io
import copy

# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 92000)
# pd.set_option('max_info_columns', 91713)

Import essential packages.


In [None]:
def import_data(file):
    '''create a dataframe and optimize its memory usage'''
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True, index_col = 'encounter_id')
    #     df = reduce_mem_usage(df)
    return df


def hospital_source_observation(df):
    
    """
    Returns a dataframe without observation and update it with the most logical guess.
    
    Parameters
    ----------
    df: dataframe
    """
    
    #Fill missing values in the hospital admit source with ICU admit source
    df_hos_NA = df[df['hospital_admit_source'].isna()==True]    
    
    #for the rows with missing value in the training data set's hospital admit source
    for i in df_hos_NA.index:
        
        #if the icu amdit source is Accident & Emergency
        if df.loc[i,'icu_admit_source']=='Accident & Emergency':
            #hospital admit source is filled with Emergency Department
            df.loc[i,'hospital_admit_source'] = 'Emergency Department'
        
        #if the icu admit source is Operating Room/Recovery
        elif df.loc[i,'icu_admit_source'] == 'Operating Room / Recovery':
            #if there is post operation, fill it with Operating Room
            if df.loc[i,'apache_post_operative'] == 1:
                df.loc[i,'hospital_admit_source'] = 'Operating Room'
            #if not, fill it with Recovery Room
            else:
                df.loc[i,'hospital_admit_source'] = 'Recovery Room'
        #Fill the rest of the hospital admit source with ICU admit source
        else:
            df.loc[i,'hospital_admit_source'] = df.loc[i,'icu_admit_source']
                
    observation = df[df['hospital_admit_source']=='Observation']
    
    for i in observation.index:
        
        if df.loc[i,'icu_admit_source']!='Floor':
            if df.loc[i,'icu_admit_source']=='Operating Room / Recovery':
                if df.loc[i,'apache_post_operative'] == 0:
                    df.loc[i,'hospital_admit_source'] = 'Operating Room'
                else:
                    df.loc[i,'hospital_admit_source'] = 'Recovery Room'
         
            else:
                df['icu_type'] = df['icu_type'].str.lower()
                if df.loc[i,'icu_type'].find('s')!= -1:
                    if df.loc[i,'apache_post_operative'] == 0:
                        df.loc[i,'hospital_admit_source'] = 'Operating Room'
                    else:
                        df.loc[i,'hospital_admit_source'] = 'Recovery Room'
                else:
                    df.loc[i,'hospital_admit_source'] = 'Emergency Department'
                        
        else:
            df.loc[i,'hospital_admit_source'] = df.loc[i,'icu_admit_source']

    return df



def assess_NA(data):
    """
    Returns a dataframe denoting the total number of NA values and the percentage of NA values in each column.
    The column names are notes on the index.
    
    Parameters
    ----------
    data: dataframe
    """
    #pandas series denoting features and the sum of their null values
    null_sum = data.isnull().sum()
    
    #instantiate columns for missing data
    total = null_sum
    percent = (((null_sum/len(data.index))*100).round(2))
    
    #concatenate along the columns to create the complete dataframe
    train_NA = pd.concat([total,percent],axis=1,keys=['Number of NA','Percent NA'])
    
    #drop rows that don't have any missing data
    train_NA = train_NA[(train_NA.T !=0).any()].sort_values(ascending=False, by='Percent NA')
    
    return train_NA

def data_processing(df, train, change_to_bool, change_to_cat, excludeColumns, tag):
    
    ## Drop outcome variable
    #     if tag == 'train':
    #         df = df.drop(columns=['hospital_death'])
                
    '''Revise certain Variables
    '''
    print('- Revise hospital_admit_source, and apache_2_bodysystem')#pre_icu_los_days
    # Modify negative value to positive of pre_icu_los_days
    # df['pre_icu_los_days'] = abs(df['pre_icu_los_days'])
    # Rename some categorical variable's list name
    df['hospital_admit_source'] = df['hospital_admit_source'].replace(
        {'Other ICU': 'ICU', 
         'ICU to SDU':'SDU', 
         'Step-Down Unit (SDU)': 'SDU',
         'Other':'Other Hospital', 
         'Observation': 'Recovery Room',
         'Acute Care/Floor': 'Acute Care'})
    df['apache_2_bodysystem'] = df['apache_2_bodysystem'].replace(
        {'Undefined diagnoses': 'Undefined Diagnoses'}
    )
    # print('- Change -1 to nan for apache_4a_hospital_death_prob and apache_4a_icu_death_prob')
    # Modify negative value of apache_4a_hospital_death_prob to nan
    #     df.loc[df['apache_4a_hospital_death_prob'] == -1, 'apache_4a_hospital_death_prob'] = np.nan
    #     df.loc[df['apache_4a_icu_death_prob'] == -1, 'apache_4a_icu_death_prob'] = np.nan
    
    ### Deal with d1, h1
    new_d1_h1_cols = []
    
    ## Method 0
    #print('- Remain h1_d1 as original value')
    
    ## method 1
    ## Change all d1/h1 into booline by assigning 1 to those with values and 0 to those with NaN
    #     d1_cols = [col for col in df.columns if (col.find('d1_') != -1)]
    #     h1_cols = [col for col in df.columns if (col.find('h1_') != -1)]    
    #     for col in d1_cols:
    #         df[col] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    #     for col in h1_cols:
    #         df[col] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    
    ## method 1-2
    ## Creat booline columns for d1/h1 by assigning 0 to those with values and 1 to those with NaN
    #     print('- Creat booline columns for d1/h1')
    #     d1_cols = [col for col in df.columns if (col.find('d1_') != -1)]
    #     h1_cols = [col for col in df.columns if (col.find('h1_') != -1)]    
    #     for col in d1_cols:
    #         df[col+'_bool'] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    #     for col in h1_cols:
    #         df[col+'_bool'] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    
    ## method 2
    #     print('- Create columns of combination of h1_d1_min_max: ')
    #     # Add new features by aggrigating d1/h1/and their min/max
    #     d1_cols_min = [col for col in df.columns 
    #                if (col.find('d1_') != -1 and col.find('_min') != -1)]
    #     d1_cols_max = [col for col in df.columns 
    #                if (col.find('d1_') != -1 and col.find('_max') != -1)]
    #     h1_cols_max = [col for col in df.columns 
    #            if (col.find('h1_') != -1 and col.find('_max') != -1)]

    #     for d1_min in d1_cols_min:
    #         d1_max = d1_min.replace('_min', '_max')
    #         h1_min = d1_min.replace('d1', 'h1')
    #         h1_max = h1_min.replace('_min', '_max')

    #         df['range_' + d1_min.strip('_min')] = df[d1_max] - df[d1_min]; new_d1_h1_cols.append(df['range_' + d1_min.strip('_min')])
    #         df['range_' + h1_min.strip('_min')] = df[h1_max] - df[h1_min]; new_d1_h1_cols.append(df['range_' + h1_min.strip('_min')])

    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMin'] = df[d1_min] - df[h1_min]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMin'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMax'] = df[d1_min] - df[h1_max]; new_d1_h1_cols.append( df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMax'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMin'] = df[d1_max] - df[h1_min]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMin'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMax'] = df[d1_max] - df[h1_max]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMax'])

    #     print('  There are %d columns adding in' % (len(new_d1_h1_cols)))    

    #     ## Record the max columns for all d1 and h1 for future dropping list
    #     if tag == 'train':
    #         excludeColumns = set(excludeColumns).union(d1_cols_max).union(h1_cols_max)

    ### hospital_id & icu_id
    ## Method 1
    #     print('- Include hospital_id and icu_id and remain as original value')
    ## Method 2
    print('- Transfer icu_id and hopstipal_id to categorical data')
    if tag=='train':
        change_to_cat.append('icu_id')
        change_to_cat.append('hospital_id')
    ## Method 2.2
    #     print('Transfer icu_id and to categorical data and drop hopstipal_id')
    #     change_to_cat.append('icu_id')
    #     if tag == 'train':
    #         excludeColumns.add('apache_3j_diagnosis')
    ## Method 3
    #     print('- Exclude columns patient_id, icu_id, hospital_id, and apache_3j_diagnosis')
    #     if tag == 'train':
    #         excludeColumns.add('icu_id')
    #         excludeColumns.add('hospital_id')
    #         excludeColumns.add('patient_id')
    #         excludeColumns.add('apache_3j_diagnosis')
    

    ''' Drop certain columns by some criteria
    '''
    ### Exclude collinear (highly correlated) features 
    ## by dropping columns with correlations above threshold
    if tag == 'train':
        threshold = 0.9 # Threshold for removing correlated variables
        corr_matrix = df.drop(columns='hospital_death').corr().abs() # Absolute value correlation matrix
        # corr_matrix = df.corr().abs() # Absolute value correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) # Upper triangle of correlations
        to_drop = set( [column for column in upper.columns if any(upper[column] > threshold)] )
        print('- There are %d columns to remove due to high correlation with threshold=0.9.' % (len(to_drop)))

        # record these columns into the list, excludeColumns
        excludeColumns = excludeColumns.union(to_drop)

#         ## Exclude h1_xxx_max and d1_xxx_max features 
#         df = df[df.columns.drop(list(df.filter(regex='_max')))]
    
    
    '''Deal with missing values
    '''
    ### fillna in height/weight/bmi
    if tag=='train':
        print('- Fill NaN of age, height, weight, and bmi')
        df['age'] = df['age'].fillna(df['age'].median())
        df['height'] = df.groupby(['ethnicity', 'gender'])['height'].apply(lambda x: x.fillna(x.median()))  
        df['weight'] = df.groupby(['ethnicity', 'gender', 'height'])['weight'].apply(lambda x: x.fillna(x.median()))    
        # print('Drop height and weight, remain only column bmi ')    
    #     if tag == 'train':
    #         excludeColumns.add('height')
    #         excludeColumns.add('weight')
    else:
        print('- Fill NaN of age, height, weight, and bmi by values from training data set')
        df['age'] = df['age'].fillna(train['age'].median())
        df['height'] = df['height'].fillna(train.groupby(['ethnicity', 'gender'])['height'].transform('median'))
        df['weight'] = df['weight'].fillna(train.groupby(['ethnicity', 'gender', 'height'])['weight'].transform('median'))
    
    df['bmi'] = df.apply( lambda x: x['weight']/(0.01 * x['height'])**2 if np.isnan(x['bmi']) else x['bmi'], axis=1)
    
    ### fillna for hospital_admit_source
    print('- Fill NaN of hospital_admit_source by using icu_type')
    df = hospital_source_observation(df)

    print('- Fill NaN of hospital_admit_source by using most frequent value in icu_type')    
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    imputer.fit(df['hospital_admit_source'].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    if tag == 'train':
        df['hospital_admit_source'] = imputer.transform(df['hospital_admit_source'].values.reshape(-1, 1))
    else:
        most_freq = train['hospital_admit_source'].value_counts().keys()[0]
        df['hospital_admit_source'] = df['hospital_admit_source'].fillna(most_freq)

    ### fillna for apache_4a_hospital_death_prob and apache_4a_icu_death_prob
    print('- Fill NaN of apache_4a_hospital_death_prob and apache_4a_icu_death_prob by each other')
    df['apache_4a_hospital_death_prob'
        ] = pd.DataFrame(df.loc[:,['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
                                 ].apply(lambda x: x['apache_4a_icu_death_prob'] if np.isnan(x['apache_4a_hospital_death_prob']) \
                                                                                 else x['apache_4a_hospital_death_prob'], 
                                         axis=1))
    df['apache_4a_icu_death_prob'
        ] = pd.DataFrame(df.loc[:,['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
                                 ].apply(lambda x: x['apache_4a_hospital_death_prob'] if np.isnan(x['apache_4a_icu_death_prob']) \
                                                                                      else x['apache_4a_icu_death_prob'], 
                                         axis=1))

    ### fillna for all numerical and object variables using '9999'
    #     print('- Fill NaN as "9999" for the following object columns:')
    #     i = 0
    #     for col in change_to_cat:
    #         if i==0:
    #             print("=> ["+col, end=', ')
    #         elif i!=len(change_to_cat)-1:
    #             print(col, end=', ')
    #         else:
    #             print(col+']')
    #         i+=1
    #     for col in change_to_cat:
    #         if col != 'age':
    #             df[col] = df[col].fillna('9999')
    #             df[col] = df[col].astype('str')

    #     print('- Fill NaN as "9999" for the remaining numeric columns')
    #     for col in set(df.columns).difference(set(change_to_cat)):
    #         df[col] = df[col].fillna(9999)
    
    ### Binary variables
    ## Method 1: make NaN as a new group NaNN: 
    # which represents a patient either doesn't need to fill the form,
    # or they couldn't fill the form
    # print('Create an NANN category for boolean variables')
    # for col in change_to_bool:
    #     df[col] = df[col].fillna('NANN')
    #     df[col] = df[col].astype('category')
    ## Method 2: change NaN --> 0
    #     for col in change_to_bool:
    #         df[col] = df[col].fillna(0)
    
    ## change binary columns to bool type
    # for col in change_to_bool:
    #     df[col] = df[col].astype('bool')
    
    
    
    # if tag == 'train':
    # Method 0:
    print('- Does not impute NaN for the remaining variables')
    
    # Method 1:
    # int variables
    #     print('Impute NaN by their median value for numerical variables')
    #     imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
    #     for col, dtype in zip(df.columns, df.dtypes):
    #         if dtype not in ['float', 'int', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'uint8'] or col == 'hospital_death': continue
    #         imputer.fit(df[col].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    #         df[col] = imputer.transform(df[col].values.reshape(-1, 1))
    #     # object variables
    #     print('Impute NaN by their frequent value for category variables')
    #     imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    #     for col, dtype in zip(df.columns, df.dtypes):
    #         if dtype not in ['object'] or col == 'hospital_death': continue
    #         imputer.fit(df[col].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    #         df[col] = imputer.transform(df[col].values.reshape(-1, 1))

    
    
    '''Dealing with categorical variables
    '''
    
    ### Segment age
    ## Method 1: if age column doesn't have NaN value:
    #     print('- Cut age into groups and add to set change_to_cat')
    #     # df['age'] = df['age'].fillna(-1)
    #     bins = [0, 20, 40, 60, 80, 100]
    #     group_names = ['0-20', '20-39', '40-59', '60-79', '80-99']
    #     df['age'] = pd.cut(df['age'], bins, labels=group_names)
    #     if tag == 'train':
    #         change_to_cat.append('age')
    
    ## Method 2: if age column has NaN value:
    #     df['age'] = df['age'].fillna(-1)
    #     bins = [-1, 0, 20, 40, 60, 80, 100]
    #     group_names = ['NANN', '0-20', '20-39', '40-59', '60-79', '80-99']
    #     df['age'] = pd.cut(df['age'], bins, labels=group_names)
    
    
    ## Method 1: if column doesn't have NaN value:
    #     print('- Change the following list into category type: ')
    #     print(*change_to_cat)
    #     for col in change_to_cat:
    #         df[col] = df[col].astype('category')
    
    ## Method 2: if column has NaN value:
    # Change factor type to category and insert NANN category
    #     for col in change_to_cat:
    #         df[col] = df[col].fillna('NANN')
    #         df[col] = df[col].astype('category')
    
    
    '''Change categorical data into different form
    '''
    for col in change_to_cat:
        if (df[col].dtype == 'O') | (col=='age'):
            df[col] = df[col].astype('object')
        else:
            df[col] = df[col].fillna(-9999)
            df[col] = df[col].astype('str').str.replace('.','')
            df[col] = df[col].replace('-9999', np.nan) 
    
    ## if column has NaN value:
    # Change factor type to category and insert NANN category
    for col in change_to_cat:
        df[col] = df[col].fillna('NANN')
    
    ### Method 0:
    #     print('- Remain categorical variables as object')
            
    ### Method 1: Transforming into dummy variables
    #     print('- Change categorical data into dummies')       
    #     df = pd.get_dummies(df, columns=change_to_cat, dummy_na=True, drop_first=False)  
    
    
    
    # Checking for columns with only one value
    print('- Drop the following columns due to containing only one value: ')
    if tag=='train':
        i = 0
        for col in df.columns:
            if df[col].nunique() == 1:
                excludeColumns.add(col)
                if i==0:
                    print("=> ["+col, end=', ')
                else:
                    print(col, end=', ')
                i+=1
        print(']')
#         print('- Drop the following columns due to containing only one value: ')
#         for col in df.columns:
#             if df[col].nunique() == 1:
#                 excludeColumns.add(col)
#                 print(col, end=', ')
    

    
    ### Method 1-2: Transforming into dummy variables
    #     print('- Change categorical data into dummies, keep NaN as a category and drop the first category')
    #     df = pd.get_dummies(df, dummy_na=True, drop_first=False)  

    #     # Checking for columns with only one value
    #     if tag == 'train':
    #         print('- Drop the following columns due to containing only one value')
    #         for col in df.columns:
    #             if df[col].nunique() == 1:
    #                 excludeColumns.add(col)
    #                 print(col, end=', ')
    #     print()                

    ### Method 2: Encoding into integers
    #     print('- Change factor variables to Label Encoding')
    #     for col in change_to_cat:
    #         df[col] = df[col].astype('str')
    
    
    #     label_encoder = LabelEncoder()
    #     label_class_train = {}
    #     for col in change_to_cat:
    #         label_encoder = label_encoder.fit(df[col])
    #         df[col] = label_encoder.transform(df[col])
    #         label_class_train[col] = list(label_encoder.classes_)
    
    ### Method 3: One hot encoding



    
    '''Rename columns by replacing space into underscore
    '''
    # in order to avoid plot error
    df.columns = df.columns.str.replace(" ", "_")
    
       
    '''
    Imbalance dataset:
    '''
    ## Method 0: Do nothing with sparse data set
    print('- Does NOT balance sparse data set')
    ## Method 1: Balance sparse data set by duplicating death data
    #     if tag == 'train':
    #         print('- Balance sparse data set by duplicating death data')
    #         #print(len(df[df.hospital_death==1])/len(df))
    #         #print('before:')
    #         #print(train.hospital_death.value_counts())
    #         # Separate majority and minority classes
    #         df_majority = df[df.hospital_death==0]
    #         df_minority = df[df.hospital_death==1]

    #         # Resampling the minority levels to match the majority level
    #         # Upsample minority class
    #         df_minority_upsampled = resample(df_minority, 
    #                                          replace=True,     # sample with replacement
    #                                          n_samples=df.hospital_death.value_counts()[0],    # to match majority class
    #                                          random_state= 2020) # reproducible results
    #         # Combine majority class with upsampled minority class
    #         df = pd.concat([df_majority, df_minority_upsampled])

    #         # Display new class counts
    #         #print('after:')
    #         #print(train.hospital_death.value_counts())

    
    '''Scalling data
    '''
    #     print('- Scalling data by MinMaxScaler')
    #     scaler = MinMaxScaler()
    #     cols_to_scale = list(set(df.columns).difference(set(change_to_cat)).difference(set(excludeColumns)))
    #     df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    if tag == 'train':
        return df, excludeColumns, change_to_cat, new_d1_h1_cols
    else:
        return df, new_d1_h1_cols


###　Define lists of columns to be transformed
tag='train+test'
# tag = ''
# error_count = 0
# while ((len(tag) == 0) or (tag not in ['train', 'test', 'train+test'])) and (error_count<=2):
#     tag = input('Please type "train", "test", or "train+test" indicating this preprocess is for training, testing data set or both: ')
#     error_count += 1
if tag in ['train', 'test', 'train+test']:
    if (tag == 'train') or (tag == 'train+test'):
        change_to_bool = ['elective_surgery', 'readmission_status',
                          'apache_post_operative', 'arf_apache',
                          'gcs_unable_apache', 'intubated_apache',
                          'ventilated_apache', 'aids', 'cirrhosis',
                          'diabetes_mellitus', 'hepatic_failure',
                          'immunosuppression', 'leukemia', 'lymphoma',
                          'solid_tumor_with_metastasis'] ## columns will transform to boolin type
        change_to_cat = ['ethnicity', 'gender', 'hospital_admit_source',
                         'icu_admit_source', 'icu_stay_type', 'icu_type',
                         'apache_3j_diagnosis', 'apache_2_diagnosis',
                         'apache_3j_bodysystem', 'apache_2_bodysystem'] ## columns will transform to categorical type
        excludeColumns = ['hospital_death', 'patient_id'] ## Predefine unwanted columns

        outcome = 'hospital_death'

    ### Data preprocessing
    # train = pd.read_csv("../Dataset/training_v2.csv",  index_col = 'encounter_id')
    # test = pd.read_csv("../Dataset/unlabeled.csv",  index_col = 'encounter_id')

    if tag == 'train':
        train, excludeColumns, change_to_cat, new_d1_h1_cols = data_processing(import_data('training_v2.csv'), pd.DataFrame(), change_to_bool, change_to_cat, set(excludeColumns), 'train')
    elif tag == 'test':
        test, new_d1_h1_cols_test = data_processing(import_data('unlabeled.csv'), import_data('training_v2.csv'), change_to_bool, change_to_cat, excludeColumns, 'test')
    elif tag == 'train+test':
        train, excludeColumns, change_to_cat, new_d1_h1_cols = data_processing(import_data('training_v2.csv'), pd.DataFrame(), change_to_bool, change_to_cat, set(excludeColumns), 'train')
        with io.capture_output() as captured:
            test, new_d1_h1_cols_test = data_processing(import_data('unlabeled.csv'), import_data('training_v2.csv'), change_to_bool, change_to_cat, excludeColumns, 'test')
        # train, ohe, label_encoders = OHE(train, OneHotEncoder(), change_to_cat, 'train', {})
        # test = OHE(test, ohe, change_to_cat, 'test', label_encoders)
    else:
        print('Wrong input')
else:
    print('End of program due to wrong input!')
    del train, test

In [171]:
# Label Encoding version
# %run "data_processing.ipynb"

print('- Change factor variables to Label Encoding')
train_len = len(train)
df = pd.concat(objs = [train, test], axis = 0)

label_encoder = LabelEncoder()
label_class = {}
for col in change_to_cat:
    label_encoder = label_encoder.fit(df[col])
    df[col] = label_encoder.transform(df[col])
    label_class[col] = list(label_encoder.classes_)

# save model to file
pickle.dump(label_class, open('label_class_v1', "wb"))

df[change_to_cat] = df[change_to_cat].apply(lambda x: x.astype('category'), axis=0)

train = copy.copy(df[:train_len])
test = copy.copy(df[train_len:])

### Generate predictors and outcome variable for model usage
predictors = [s for s in train.columns if s not in excludeColumns]
outcome = 'hospital_death'
X = train[predictors]
y = train[outcome]

X_test = test[predictors]

### Split dataset into training and testing
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state = 2020)

- Change factor variables to Label Encoding


---

# Hyperparameter Tuning

In [None]:
# modelFile = "xgb_1.pickle"
gsFile = 'xgb_gs_1.pickle'
LogName = 'xgb_1.log'

params = {
    'objective'         :'binary:logistic',
    'n_estimators'      : 20000  , #default = 100
    'random_state'      : 2020  ,
    'seed'              : 2020  ,  
}

model = xgb.XGBClassifier(**params)

params_gs = {
    'learning_rate'     : [0.1, 0.01, 0.001]  , #default = 0.1  # typical range: 0.01 - 0.3
    'max_depth'         : np.linspace(3,10,8,endpoint=True)     , #default = 3
    'subsample'         : [0.8, 0.9, 1]   , #default = 1
    'colsample_bytree'  : np.linspace(0.3,0.8,6,endpoint=True), #default = 1  # many columns: 0.3 - 0.8 , a few columns: 0.8 - 1
    'gamma'             : [0, 1, 5]
}

fit_params={"early_stopping_rounds":100, 
            "eval_metric" : ["error", 'auc']
           }

scoring = {'AUC': 'roc_auc', 
           'Accuracy': make_scorer(accuracy_score), 
           'neg_log_loss':make_scorer(log_loss)}


gs = GridSearchCV(model, 
                  param_grid=params_gs, 
                  cv = 5, 
                  verbose=1,
                  scoring=scoring, 
                  refit='AUC', 
                  return_train_score=True,
                  n_jobs=-1
                 )


fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

%time gs.fit(X, y, **fit_params)

sys.stdout=sys_old_out_put
fsock.close()

# save model to file
pickle.dump(gs, open(gsFile, "wb"))

In [None]:
LogName = 'xgb_cv_1.log'

cvFile = "xgb_cv_1.pickle"

dtrain = xgb.DMatrix(X, label = y)

fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

%time cv = xgb.cv(gs.best_params_, \
                  dtrain, \
                  num_boost_round=50000, \
                  nfold=5, \
                  metrics=['auc'],\
                  early_stopping_rounds=100,\
                  stratified=True, \
                  seed=2020, \
                  verbose_eval=100)

sys.stdout=sys_old_out_put
fsock.close()

print('Best number of num_boost_round = {}'.format(cv.shape[0]))

pickle.dump(cv, open(cvFile, "wb"))

In [None]:
submitFile = 'submit_xgb_1.csv'
LogName = 'xgb_submit_1.log'


fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

model = xgb.XGBClassifier(**gs.best_params_,
                         n_estimator = (cv.shape[0]-1),
                         early_stopping_rounds=100,\
                         )

eval_set = [(X_train, y_train), (X_valid, y_valid)]
%time model.fit(X_train, y_train, \
                eval_metric=["error", 'auc'], \
                eval_set=eval_set, \
                verbose=100)

sys.stdout=sys_old_out_put
fsock.close()

########## prediction of unlabeled data #############

y_pred_unlabeled = model.predict_proba(X_test)[:,1]

data_to_submit = pd.DataFrame({
    'encounter_id':X_test.index,
    'hospital_death':y_pred_unlabeled
})

data_to_submit.to_csv(submitFile, index = False)

data_to_submit.head()

In [None]:
def import_data(file):
    '''create a dataframe and optimize its memory usage'''
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True, index_col = 'encounter_id')
    #     df = reduce_mem_usage(df)
    return df


def hospital_source_observation(df):
    
    """
    Returns a dataframe without observation and update it with the most logical guess.
    
    Parameters
    ----------
    df: dataframe
    """
    
    #Fill missing values in the hospital admit source with ICU admit source
    df_hos_NA = df[df['hospital_admit_source'].isna()==True]    
    
    #for the rows with missing value in the training data set's hospital admit source
    for i in df_hos_NA.index:
        
        #if the icu amdit source is Accident & Emergency
        if df.loc[i,'icu_admit_source']=='Accident & Emergency':
            #hospital admit source is filled with Emergency Department
            df.loc[i,'hospital_admit_source'] = 'Emergency Department'
        
        #if the icu admit source is Operating Room/Recovery
        elif df.loc[i,'icu_admit_source'] == 'Operating Room / Recovery':
            #if there is post operation, fill it with Operating Room
            if df.loc[i,'apache_post_operative'] == 1:
                df.loc[i,'hospital_admit_source'] = 'Operating Room'
            #if not, fill it with Recovery Room
            else:
                df.loc[i,'hospital_admit_source'] = 'Recovery Room'
        #Fill the rest of the hospital admit source with ICU admit source
        else:
            df.loc[i,'hospital_admit_source'] = df.loc[i,'icu_admit_source']
                
    observation = df[df['hospital_admit_source']=='Observation']
    
    for i in observation.index:
        
        if df.loc[i,'icu_admit_source']!='Floor':
            if df.loc[i,'icu_admit_source']=='Operating Room / Recovery':
                if df.loc[i,'apache_post_operative'] == 0:
                    df.loc[i,'hospital_admit_source'] = 'Operating Room'
                else:
                    df.loc[i,'hospital_admit_source'] = 'Recovery Room'
         
            else:
                df['icu_type'] = df['icu_type'].str.lower()
                if df.loc[i,'icu_type'].find('s')!= -1:
                    if df.loc[i,'apache_post_operative'] == 0:
                        df.loc[i,'hospital_admit_source'] = 'Operating Room'
                    else:
                        df.loc[i,'hospital_admit_source'] = 'Recovery Room'
                else:
                    df.loc[i,'hospital_admit_source'] = 'Emergency Department'
                        
        else:
            df.loc[i,'hospital_admit_source'] = df.loc[i,'icu_admit_source']

    return df



def assess_NA(data):
    """
    Returns a dataframe denoting the total number of NA values and the percentage of NA values in each column.
    The column names are notes on the index.
    
    Parameters
    ----------
    data: dataframe
    """
    #pandas series denoting features and the sum of their null values
    null_sum = data.isnull().sum()
    
    #instantiate columns for missing data
    total = null_sum
    percent = (((null_sum/len(data.index))*100).round(2))
    
    #concatenate along the columns to create the complete dataframe
    train_NA = pd.concat([total,percent],axis=1,keys=['Number of NA','Percent NA'])
    
    #drop rows that don't have any missing data
    train_NA = train_NA[(train_NA.T !=0).any()].sort_values(ascending=False, by='Percent NA')
    
    return train_NA


def data_processing(df, train, change_to_bool, change_to_cat, excludeColumns, tag):
    
    ## Drop outcome variable
    #     if tag == 'train':
    #         df = df.drop(columns=['hospital_death'])
                
    '''Revise certain Variables
    '''
    print('- Revise hospital_admit_source, and apache_2_bodysystem')#pre_icu_los_days
    # Modify negative value to positive of pre_icu_los_days
    # df['pre_icu_los_days'] = abs(df['pre_icu_los_days'])
    # Rename some categorical variable's list name
    df['hospital_admit_source'] = df['hospital_admit_source'].replace(
        {'Other ICU': 'ICU', 
         'ICU to SDU':'SDU', 
         'Step-Down Unit (SDU)': 'SDU',
         'Other':'Other Hospital', 
         'Observation': 'Recovery Room',
         'Acute Care/Floor': 'Acute Care'})
    df['apache_2_bodysystem'] = df['apache_2_bodysystem'].replace(
        {'Undefined diagnoses': 'Undefined Diagnoses'}
    )
    # print('- Change -1 to nan for apache_4a_hospital_death_prob and apache_4a_icu_death_prob')
    # Modify negative value of apache_4a_hospital_death_prob to nan
    #     df.loc[df['apache_4a_hospital_death_prob'] == -1, 'apache_4a_hospital_death_prob'] = np.nan
    #     df.loc[df['apache_4a_icu_death_prob'] == -1, 'apache_4a_icu_death_prob'] = np.nan
    
    ### Deal with d1, h1
    new_d1_h1_cols = []
    
    ## Method 0
    #print('- Remain h1_d1 as original value')
    
    ## method 1
    ## Change all d1/h1 into booline by assigning 1 to those with values and 0 to those with NaN
    #     d1_cols = [col for col in df.columns if (col.find('d1_') != -1)]
    #     h1_cols = [col for col in df.columns if (col.find('h1_') != -1)]    
    #     for col in d1_cols:
    #         df[col] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    #     for col in h1_cols:
    #         df[col] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    
    ## method 1-2
    ## Creat booline columns for d1/h1 by assigning 0 to those with values and 1 to those with NaN
    #     print('- Creat booline columns for d1/h1')
    #     d1_cols = [col for col in df.columns if (col.find('d1_') != -1)]
    #     h1_cols = [col for col in df.columns if (col.find('h1_') != -1)]    
    #     for col in d1_cols:
    #         df[col+'_bool'] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    #     for col in h1_cols:
    #         df[col+'_bool'] = df[col].apply(lambda x: 1 if math.isnan(x) else 0)
    
    ## method 2
    #     print('- Create columns of combination of h1_d1_min_max: ')
    #     # Add new features by aggrigating d1/h1/and their min/max
    #     d1_cols_min = [col for col in df.columns 
    #                if (col.find('d1_') != -1 and col.find('_min') != -1)]
    #     d1_cols_max = [col for col in df.columns 
    #                if (col.find('d1_') != -1 and col.find('_max') != -1)]
    #     h1_cols_max = [col for col in df.columns 
    #            if (col.find('h1_') != -1 and col.find('_max') != -1)]

    #     for d1_min in d1_cols_min:
    #         d1_max = d1_min.replace('_min', '_max')
    #         h1_min = d1_min.replace('d1', 'h1')
    #         h1_max = h1_min.replace('_min', '_max')

    #         df['range_' + d1_min.strip('_min')] = df[d1_max] - df[d1_min]; new_d1_h1_cols.append(df['range_' + d1_min.strip('_min')])
    #         df['range_' + h1_min.strip('_min')] = df[h1_max] - df[h1_min]; new_d1_h1_cols.append(df['range_' + h1_min.strip('_min')])

    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMin'] = df[d1_min] - df[h1_min]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMin'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMax'] = df[d1_min] - df[h1_max]; new_d1_h1_cols.append( df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MinMax'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMin'] = df[d1_max] - df[h1_min]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMin'])
    #         df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMax'] = df[d1_max] - df[h1_max]; new_d1_h1_cols.append(df['d1_h1_' + d1_min.strip('d1_').strip('_min') + '_MaxMax'])

    #     print('  There are %d columns adding in' % (len(new_d1_h1_cols)))    

    #     ## Record the max columns for all d1 and h1 for future dropping list
    #     if tag == 'train':
    #         excludeColumns = set(excludeColumns).union(d1_cols_max).union(h1_cols_max)

    ### hospital_id & icu_id
    ## Method 1
    #     print('- Include hospital_id and icu_id and remain as original value')
    ## Method 2
    print('- Transfer icu_id and hopstipal_id to categorical data')
    if tag=='train':
        change_to_cat.append('icu_id')
        change_to_cat.append('hospital_id')
    ## Method 2.2
    #     print('Transfer icu_id and to categorical data and drop hopstipal_id')
    #     change_to_cat.append('icu_id')
    #     if tag == 'train':
    #         excludeColumns.add('apache_3j_diagnosis')
    ## Method 3
    #     print('- Exclude columns patient_id, icu_id, hospital_id, and apache_3j_diagnosis')
    #     if tag == 'train':
    #         excludeColumns.add('icu_id')
    #         excludeColumns.add('hospital_id')
    #         excludeColumns.add('patient_id')
    #         excludeColumns.add('apache_3j_diagnosis')
    

    ''' Drop certain columns by some criteria
    '''
    ### Exclude collinear (highly correlated) features 
    ## by dropping columns with correlations above threshold
    if tag == 'train':
        threshold = 0.9 # Threshold for removing correlated variables
        corr_matrix = df.drop(columns='hospital_death').corr().abs() # Absolute value correlation matrix
        # corr_matrix = df.corr().abs() # Absolute value correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) # Upper triangle of correlations
        to_drop = set( [column for column in upper.columns if any(upper[column] > threshold)] )
        print('- There are %d columns to remove due to high correlation with threshold=0.9.' % (len(to_drop)))

        # record these columns into the list, excludeColumns
        excludeColumns = excludeColumns.union(to_drop)

#         ## Exclude h1_xxx_max and d1_xxx_max features 
#         df = df[df.columns.drop(list(df.filter(regex='_max')))]
    
    
    '''Deal with missing values
    '''
    ### fillna in height/weight/bmi
    if tag=='train':
        print('- Fill NaN of age, height, weight, and bmi')
        df['age'] = df['age'].fillna(df['age'].median())
        df['height'] = df.groupby(['ethnicity', 'gender'])['height'].apply(lambda x: x.fillna(x.median()))  
        df['weight'] = df.groupby(['ethnicity', 'gender', 'height'])['weight'].apply(lambda x: x.fillna(x.median()))    
        # print('Drop height and weight, remain only column bmi ')    
    #     if tag == 'train':
    #         excludeColumns.add('height')
    #         excludeColumns.add('weight')
    else:
        print('- Fill NaN of age, height, weight, and bmi by values from training data set')
        df['age'] = df['age'].fillna(train['age'].median())
        df['height'] = df['height'].fillna(train.groupby(['ethnicity', 'gender'])['height'].transform('median'))
        df['weight'] = df['weight'].fillna(train.groupby(['ethnicity', 'gender', 'height'])['weight'].transform('median'))
    
    df['bmi'] = df.apply( lambda x: x['weight']/(0.01 * x['height'])**2 if np.isnan(x['bmi']) else x['bmi'], axis=1)
    
    ### fillna for hospital_admit_source
    print('- Fill NaN of hospital_admit_source by using icu_type')
    df = hospital_source_observation(df)

    print('- Fill NaN of hospital_admit_source by using most frequent value in icu_type')    
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    imputer.fit(df['hospital_admit_source'].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    if tag == 'train':
        df['hospital_admit_source'] = imputer.transform(df['hospital_admit_source'].values.reshape(-1, 1))
    else:
        most_freq = train['hospital_admit_source'].value_counts().keys()[0]
        df['hospital_admit_source'] = df['hospital_admit_source'].fillna(most_freq)

    ### fillna for apache_4a_hospital_death_prob and apache_4a_icu_death_prob
    print('- Fill NaN of apache_4a_hospital_death_prob and apache_4a_icu_death_prob by each other')
    df['apache_4a_hospital_death_prob'
        ] = pd.DataFrame(df.loc[:,['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
                                 ].apply(lambda x: x['apache_4a_icu_death_prob'] if np.isnan(x['apache_4a_hospital_death_prob']) \
                                                                                 else x['apache_4a_hospital_death_prob'], 
                                         axis=1))
    df['apache_4a_icu_death_prob'
        ] = pd.DataFrame(df.loc[:,['apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']
                                 ].apply(lambda x: x['apache_4a_hospital_death_prob'] if np.isnan(x['apache_4a_icu_death_prob']) \
                                                                                      else x['apache_4a_icu_death_prob'], 
                                         axis=1))

    ### fillna for all numerical and object variables using '9999'
    #     print('- Fill NaN as "9999" for the following object columns:')
    #     i = 0
    #     for col in change_to_cat:
    #         if i==0:
    #             print("=> ["+col, end=', ')
    #         elif i!=len(change_to_cat)-1:
    #             print(col, end=', ')
    #         else:
    #             print(col+']')
    #         i+=1
    #     for col in change_to_cat:
    #         if col != 'age':
    #             df[col] = df[col].fillna('9999')
    #             df[col] = df[col].astype('str')

    #     print('- Fill NaN as "9999" for the remaining numeric columns')
    #     for col in set(df.columns).difference(set(change_to_cat)):
    #         df[col] = df[col].fillna(9999)
    
    ### Binary variables
    ## Method 1: make NaN as a new group NaNN: 
    # which represents a patient either doesn't need to fill the form,
    # or they couldn't fill the form
    # print('Create an NANN category for boolean variables')
    # for col in change_to_bool:
    #     df[col] = df[col].fillna('NANN')
    #     df[col] = df[col].astype('category')
    ## Method 2: change NaN --> 0
    #     for col in change_to_bool:
    #         df[col] = df[col].fillna(0)
    
    ## change binary columns to bool type
    # for col in change_to_bool:
    #     df[col] = df[col].astype('bool')
    
    
    
    # if tag == 'train':
    # Method 0:
    print('- Does not impute NaN for the remaining variables')
    
    # Method 1:
    # int variables
    #     print('Impute NaN by their median value for numerical variables')
    #     imputer = SimpleImputer(missing_values = np.nan, strategy = 'median')
    #     for col, dtype in zip(df.columns, df.dtypes):
    #         if dtype not in ['float', 'int', 'int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool', 'uint8'] or col == 'hospital_death': continue
    #         imputer.fit(df[col].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    #         df[col] = imputer.transform(df[col].values.reshape(-1, 1))
    #     # object variables
    #     print('Impute NaN by their frequent value for category variables')
    #     imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
    #     for col, dtype in zip(df.columns, df.dtypes):
    #         if dtype not in ['object'] or col == 'hospital_death': continue
    #         imputer.fit(df[col].values.reshape(-1, 1)) # Try it only when get 1-D, 2-D array error
    #         df[col] = imputer.transform(df[col].values.reshape(-1, 1))

    
    
    '''Dealing with categorical variables
    '''
    
    ### Segment age
    ## Method 1: if age column doesn't have NaN value:
    #     print('- Cut age into groups and add to set change_to_cat')
    #     # df['age'] = df['age'].fillna(-1)
    #     bins = [0, 20, 40, 60, 80, 100]
    #     group_names = ['0-20', '20-39', '40-59', '60-79', '80-99']
    #     df['age'] = pd.cut(df['age'], bins, labels=group_names)
    #     if tag == 'train':
    #         change_to_cat.append('age')
    
    ## Method 2: if age column has NaN value:
    #     df['age'] = df['age'].fillna(-1)
    #     bins = [-1, 0, 20, 40, 60, 80, 100]
    #     group_names = ['NANN', '0-20', '20-39', '40-59', '60-79', '80-99']
    #     df['age'] = pd.cut(df['age'], bins, labels=group_names)
    
    
    ## Method 1: if column doesn't have NaN value:
    #     print('- Change the following list into category type: ')
    #     print(*change_to_cat)
    #     for col in change_to_cat:
    #         df[col] = df[col].astype('category')
    
    ## Method 2: if column has NaN value:
    # Change factor type to category and insert NANN category
    #     for col in change_to_cat:
    #         df[col] = df[col].fillna('NANN')
    #         df[col] = df[col].astype('category')
    
    
    '''Change categorical data into different form
    '''
    for col in change_to_cat:
        if (df[col].dtype == 'O') | (col=='age'):
            df[col] = df[col].astype('object')
        else:
            df[col] = df[col].fillna(-9999)
            df[col] = df[col].astype('str').str.replace('.','')
            df[col] = df[col].replace('-9999', np.nan) 
    
    ## if column has NaN value:
    # Change factor type to category and insert NANN category
    for col in change_to_cat:
        df[col] = df[col].fillna('NANN')
    
    ### Method 0:
    #     print('- Remain categorical variables as object')
            
    ### Method 1: Transforming into dummy variables
    print('- Change categorical data into dummies')       
    df = pd.get_dummies(df, columns=change_to_cat, dummy_na=True, drop_first=False)  
    
    
    
    # Checking for columns with only one value
    print('- Drop the following columns due to containing only one value: ')
    if tag=='train':
        i = 0
        for col in df.columns:
            if df[col].nunique() == 1:
                excludeColumns.add(col)
                if i==0:
                    print("=> ["+col, end=', ')
                else:
                    print(col, end=', ')
                i+=1
        print(']')
#         print('- Drop the following columns due to containing only one value: ')
#         for col in df.columns:
#             if df[col].nunique() == 1:
#                 excludeColumns.add(col)
#                 print(col, end=', ')
    

    
    ### Method 1-2: Transforming into dummy variables
    #     print('- Change categorical data into dummies, keep NaN as a category and drop the first category')
    #     df = pd.get_dummies(df, dummy_na=True, drop_first=False)  

    #     # Checking for columns with only one value
    #     if tag == 'train':
    #         print('- Drop the following columns due to containing only one value')
    #         for col in df.columns:
    #             if df[col].nunique() == 1:
    #                 excludeColumns.add(col)
    #                 print(col, end=', ')
    #     print()                

    ### Method 2: Encoding into integers
    #     print('- Change factor variables to Label Encoding')
    #     for col in change_to_cat:
    #         df[col] = df[col].astype('str')
    
    
    #     label_encoder = LabelEncoder()
    #     label_class_train = {}
    #     for col in change_to_cat:
    #         label_encoder = label_encoder.fit(df[col])
    #         df[col] = label_encoder.transform(df[col])
    #         label_class_train[col] = list(label_encoder.classes_)
    
    ### Method 3: One hot encoding



    
    '''Rename columns by replacing space into underscore
    '''
    # in order to avoid plot error
    df.columns = df.columns.str.replace(" ", "_")
    
       
    '''
    Imbalance dataset:
    '''
    ## Method 0: Do nothing with sparse data set
    print('- Does NOT balance sparse data set')
    ## Method 1: Balance sparse data set by duplicating death data
    #     if tag == 'train':
    #         print('- Balance sparse data set by duplicating death data')
    #         #print(len(df[df.hospital_death==1])/len(df))
    #         #print('before:')
    #         #print(train.hospital_death.value_counts())
    #         # Separate majority and minority classes
    #         df_majority = df[df.hospital_death==0]
    #         df_minority = df[df.hospital_death==1]

    #         # Resampling the minority levels to match the majority level
    #         # Upsample minority class
    #         df_minority_upsampled = resample(df_minority, 
    #                                          replace=True,     # sample with replacement
    #                                          n_samples=df.hospital_death.value_counts()[0],    # to match majority class
    #                                          random_state= 2020) # reproducible results
    #         # Combine majority class with upsampled minority class
    #         df = pd.concat([df_majority, df_minority_upsampled])

    #         # Display new class counts
    #         #print('after:')
    #         #print(train.hospital_death.value_counts())

    
    '''Scalling data
    '''
    #     print('- Scalling data by MinMaxScaler')
    #     scaler = MinMaxScaler()
    #     cols_to_scale = list(set(df.columns).difference(set(change_to_cat)).difference(set(excludeColumns)))
    #     df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    
    if tag == 'train':
        return df, excludeColumns, change_to_cat, new_d1_h1_cols
    else:
        return df, new_d1_h1_cols


###　Define lists of columns to be transformed
tag='train+test'
# tag = ''
# error_count = 0
# while ((len(tag) == 0) or (tag not in ['train', 'test', 'train+test'])) and (error_count<=2):
#     tag = input('Please type "train", "test", or "train+test" indicating this preprocess is for training, testing data set or both: ')
#     error_count += 1
if tag in ['train', 'test', 'train+test']:
    if (tag == 'train') or (tag == 'train+test'):
        change_to_bool = ['elective_surgery', 'readmission_status',
                          'apache_post_operative', 'arf_apache',
                          'gcs_unable_apache', 'intubated_apache',
                          'ventilated_apache', 'aids', 'cirrhosis',
                          'diabetes_mellitus', 'hepatic_failure',
                          'immunosuppression', 'leukemia', 'lymphoma',
                          'solid_tumor_with_metastasis'] ## columns will transform to boolin type
        change_to_cat = ['ethnicity', 'gender', 'hospital_admit_source',
                         'icu_admit_source', 'icu_stay_type', 'icu_type',
                         'apache_3j_diagnosis', 'apache_2_diagnosis',
                         'apache_3j_bodysystem', 'apache_2_bodysystem'] ## columns will transform to categorical type
        excludeColumns = ['hospital_death', 'patient_id'] ## Predefine unwanted columns

        outcome = 'hospital_death'

    ### Data preprocessing
    # train = pd.read_csv("../Dataset/training_v2.csv",  index_col = 'encounter_id')
    # test = pd.read_csv("../Dataset/unlabeled.csv",  index_col = 'encounter_id')

    if tag == 'train':
        train, excludeColumns, change_to_cat, new_d1_h1_cols = data_processing(import_data('training_v2.csv'), pd.DataFrame(), change_to_bool, change_to_cat, set(excludeColumns), 'train')
    elif tag == 'test':
        test, new_d1_h1_cols_test = data_processing(import_data('unlabeled.csv'), import_data('training_v2.csv'), change_to_bool, change_to_cat, excludeColumns, 'test')
    elif tag == 'train+test':
        train, excludeColumns, change_to_cat, new_d1_h1_cols = data_processing(import_data('training_v2.csv'), pd.DataFrame(), change_to_bool, change_to_cat, set(excludeColumns), 'train')
        with io.capture_output() as captured:
            test, new_d1_h1_cols_test = data_processing(import_data('unlabeled.csv'), import_data('training_v2.csv'), change_to_bool, change_to_cat, excludeColumns, 'test')
        # train, ohe, label_encoders = OHE(train, OneHotEncoder(), change_to_cat, 'train', {})
        # test = OHE(test, ohe, change_to_cat, 'test', label_encoders)
    else:
        print('Wrong input')
else:
    print('End of program due to wrong input!')
    del train, test

In [None]:
# Dummy Variable version
# %run "data_processing2.ipynb"

### Generate predictors and outcome variable for model usage
predictors = [s for s in train.columns if s not in excludeColumns]
outcome = 'hospital_death'
X = train[predictors]
y = train[outcome]

### Split dataset into training and testing
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state = 2020)

# X_test = test[predictors]
cols_when_model_builds  = set(X_train.columns)
X_test= test.copy()
for col in set(cols_when_model_builds).difference(X_test.columns):
    X_test[col] = np.nan
X_test = X_test[cols_when_model_builds]
X_test = X_test[X_train.columns]

In [22]:
# modelFile = "xgb_1.pickle"
gsFile = 'xgb_gs_2.pickle'
LogName = 'xgb_2.log'

params = {
    'objective'         :'binary:logistic',
    'n_estimators'      : 20000  , #default = 100
    'random_state'      : 2020  ,
    'seed'              : 2020  ,  
}

model = xgb.XGBClassifier(**params)

params_gs = {
    'learning_rate'     : [0.1, 0.01, 0.001]  , #default = 0.1  # typical range: 0.01 - 0.3
    'max_depth'         : np.linspace(3,10,8,endpoint=True)     , #default = 3
    'subsample'         : [0.8, 0.9, 1]   , #default = 1
    'colsample_bytree'  : np.linspace(0.3,0.8,6,endpoint=True), #default = 1  # many columns: 0.3 - 0.8 , a few columns: 0.8 - 1
    'gamma'             : [0, 1, 5]
}

fit_params={"early_stopping_rounds":100, 
            "eval_metric" : ["error", 'auc']
           }

scoring = {'AUC': 'roc_auc', 
           'Accuracy': make_scorer(accuracy_score), 
           'neg_log_loss':make_scorer(log_loss)}


gs = GridSearchCV(model, 
                  param_grid=params_gs, 
                  cv = 5, 
                  verbose=1,
                  scoring=scoring, 
                  refit='AUC', 
                  return_train_score=True,
                  n_jobs=-1
                 )


fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

%time gs.fit(X, y, **fit_params)

sys.stdout=sys_old_out_put
fsock.close()

# save model to file
pickle.dump(gs, open(gsFile, "wb"))

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
LogName = 'xgb_cv_2.log'

cvFile = "xgb_cv_2.pickle"

dtrain = xgb.DMatrix(X, label = y)

fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

%time cv = xgb.cv(gs.best_params_, \
                  dtrain, \
                  num_boost_round=50000, \
                  nfold=5, \
                  metrics=['auc'],\
                  early_stopping_rounds=100,\
                  stratified=True, \
                  seed=2020, \
                  verbose_eval=100)

sys.stdout=sys_old_out_put
fsock.close()

print('Best number of num_boost_round = {}'.format(cv.shape[0]))

pickle.dump(cv, open(cvFile, "wb"))

In [34]:
submitFile = 'submit_xgb_2.csv'
LogName = 'xgb_submit_2.log'


fsock=open(LogName,'w')
sys_old_out_put=sys.stdout
sys.stdout=fsock

model = xgb.XGBClassifier(**gs.best_params_,
                         n_estimator = (cv.shape[0]-1),
                         early_stopping_rounds=100,\
                         )

eval_set = [(X_train, y_train), (X_valid, y_valid)]
%time model.fit(X_train, y_train, \
                eval_metric=["error", 'auc'], \
                eval_set=eval_set, \
                verbose=100)

sys.stdout=sys_old_out_put
fsock.close()

########## prediction of unlabeled data #############

y_pred_unlabeled = model.predict_proba(X_test)[:,1]

data_to_submit = pd.DataFrame({
    'encounter_id':X_test.index,
    'hospital_death':y_pred_unlabeled
})

data_to_submit.to_csv(submitFile, index = False)

data_to_submit.head()

Unnamed: 0,encounter_id,hospital_death
0,2,0.93412
1,5,0.919474
2,7,0.920643
3,8,0.968877
4,10,0.947625
