In [3]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* separating out x and y
* adding monotonic constraints (mc)
* adding indexes of categorical variables (ic)
* converting to categorical variables 

In [4]:
df = pd.read_csv('01 data prep.csv')
y = df['readmitted']
x = df.drop(['readmitted'], axis=1)
mc = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1]

ic = ['race','gender','age','admission_type_id','admission_source_id','metformin','repaglinide',
         'nateglinide','glimepiride','glipizide','glyburide','pioglitazone','rosiglitazone','acarbose',
         'insulin','glyburide.metformin','change','diabetesMed']
for i in ic:
      x[i] = x[i].astype('category')
df.shape

(10000, 26)

### 2. Hyper-parameters 
* parameters - objective, metric, boosting_type, monotone_constraints, categorical_feature, missing
* hyper parameters - learning_rate, max_depth, min_data_in_leaf, n_estimators, lambda_l1, lambda_l2
* model - LGBMClassifier

In [5]:
params = {'objective':'binary',
          'metric':'auc',
          'boosting_type':'gbdt',
          'monotone_constraints':mc,
          'categorical_feature':ic,
          'missing':'?'}

param_grid = {"lambda_l1":[0,0.1,0.2,0.5,1],
              "lambda_l2":[0,0.1,0.2,0.5,1],
              "learning_rate":[0.1,0.2,0.5,1],
              "max_depth":[1,2,5,10],
              "min_data_in_leaf":[10,20,50,100],
              "n_estimators":[10,20,50,100]}

model = lgb.LGBMClassifier(**params, random_state=0, verbose=-1)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(x, y)
print(grid_search.best_score_)
print(grid_search.best_params_)

Fitting 3 folds for each of 6400 candidates, totalling 19200 fits
0.6534001930486989
{'lambda_l1': 0.1, 'lambda_l2': 0.2, 'learning_rate': 0.2, 'max_depth': 2, 'min_data_in_leaf': 50, 'n_estimators': 100}


In [6]:
model = lgb.LGBMClassifier(objective='binary',metric='auc',boosting_type='gbdt',
                           monotone_constraints=mc,categorical_feature=ic,missing='?',
                           lambda_l1=0.1,lambda_l2=0.2,learning_rate=0.2,max_depth=2,
                           min_data_in_leaf=50,n_estimators=100,random_state=0,verbose=-1)
model.fit(x,y)
importances = model.feature_importances_ 
gb = pd.DataFrame({'Feature':x.columns, 'Gain':importances}).sort_values(by='Gain', ascending=False)
print(gb)

                Feature  Gain
4   admission_source_id    34
2                   age    30
23     number_inpatient    22
19   num_lab_procedures    21
3     admission_type_id    18
14              insulin    15
22     number_emergency    14
0                  race    13
10            glyburide    13
24     number_diagnoses    13
11         pioglitazone    12
18     time_in_hospital    10
21    number_outpatient    10
5             metformin     9
9             glipizide     8
1                gender     8
20      num_medications     8
17          diabetesMed     6
7           nateglinide     6
8           glimepiride     4
16               change     4
6           repaglinide     3
12        rosiglitazone     2
13             acarbose     0
15  glyburide.metformin     0


### 3. optimal cut-off for f1-score
* cut-off from 0.05 to 0.65 are tested
* the cut-off where the f1-score is highest is selected

In [8]:
for i in range(5,60,5):
    pred2 = []
    pred1 = model.predict_proba(x)[:,1]
    for j in pred1:
        if j > i/100: pred2.append(1)
        else: pred2.append(0)
    c1 = confusion_matrix(y,pred2)
    p = c1[1][1] / (c1[0][1]+c1[1][1])
    r = c1[1][1] / (c1[1][0]+c1[1][1])
    f1 = (2*p*r) / (p+r)
    print(i,'\t',np.round(f1,3))

5 	 0.568
10 	 0.568
15 	 0.573
20 	 0.586
25 	 0.6
30 	 0.607
35 	 0.602
40 	 0.575
45 	 0.529
50 	 0.46
55 	 0.372


### 4. data splits
* top 5 variables are used to sort the data
* 0/1 are separated out and 3 splits are created
* dropped low importance variables - glimepiride, change, repaglinide, rosiglitazone, acarbose and glyburide.metformin

In [9]:
df['admission_source_id bin'] = np.where(df['admission_source_id'].isin(['Emergency Room','?']), 'high',
                                         np.where(df['admission_source_id'].isin(['Transfer from a Skilled Nursing Facility (SNF)','Transfer from a hospital','Transfer from another health care facility']), 'low', 'mid'))
df['age bin'] = np.where(df['age'].isin(['[80-90)','[70-80)','[60-70)']), 'high',
                         np.where(df['age'].isin(['[30-40)','Others']), 'low', 'mid'))
df['number_inpatient bin'] = np.where(df['number_inpatient'].isin([0,1]), 'low', 
                                      np.where(df['number_inpatient'].isin([2,3,7]), 'mid', 'high'))
df['num_lab_procedures bin'] = np.where(df['num_lab_procedures'].isin([101,104,108,83,73,71,81,75]),'high',
                                                                      np.where(df['num_lab_procedures'].isin([9,18,38,5,26,23,4,15,6,89,95,120,113,109,96,114,107,98,103,]),'low','mid'))
df['admission_type_id bin'] = np.where(df['admission_type_id'].isin(['?','Emergency']), 'high', 'low')

df = df.sort_values(['admission_source_id bin','age bin','number_inpatient bin',
                     'num_lab_procedures bin','admission_type_id bin']).reset_index(drop=True)
df0 = df[df['readmitted']==0].reset_index(drop=True).reset_index()
df1 = df[df['readmitted']==1].reset_index(drop=True).reset_index()
print('non event :',df0.shape)
print('event :',df1.shape)

non event : (6035, 32)
event : (3965, 32)


In [10]:
split0 = pd.concat([df0[df0['index']%3==0], df1[df1['index']%3==0]])
split0 = split0.drop(['admission_source_id bin','age bin','number_inpatient bin','num_lab_procedures bin',
                      'admission_type_id bin','glimepiride','change','repaglinide','rosiglitazone',
                      'acarbose','glyburide.metformin','index'], axis=1)
print('1st split :',split0.shape)

split1 = pd.concat([df0[df0['index']%3==1], df1[df1['index']%3==1]])
split1 = split1.drop(['admission_source_id bin','age bin','number_inpatient bin','num_lab_procedures bin',
                      'admission_type_id bin','glimepiride','change','repaglinide','rosiglitazone',
                      'acarbose','glyburide.metformin','index'], axis=1)
print('2nd split :',split1.shape)

split2 = pd.concat([df0[df0['index']%3==2], df1[df1['index']%3==2]])
split2 = split2.drop(['admission_source_id bin','age bin','number_inpatient bin','num_lab_procedures bin',
                      'admission_type_id bin','glimepiride','change','repaglinide','rosiglitazone',
                      'acarbose','glyburide.metformin','index'], axis=1)
print('3rd split :',split2.shape)

1st split : (3334, 20)
2nd split : (3334, 20)
3rd split : (3332, 20)


### 5. exporting dataset
* exporting all the three datasets
* training on s0+s1 and testing on s2
* training on s1+s2 and testing on s0
* training on s2+s0 and testing on s1

In [11]:
split0.to_csv('02 data split 0.csv', index=False)
split1.to_csv('02 data split 1.csv', index=False)
split2.to_csv('02 data split 2.csv', index=False)

df = pd.concat([split0,split1,split2])
df.to_csv('04 model test data.csv', index=False)
df.shape

(10000, 20)