In [5]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

### 1. importing dataset
* separating out x and y
* adding monotonic constraints

In [6]:
s0 = pd.read_csv('02 data split 0.csv')
s1 = pd.read_csv('02 data split 1.csv')
s2 = pd.read_csv('02 data split 2.csv')
mc = [0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1]
ic = ['race','gender','age','admission_type_id','admission_source_id','metformin','nateglinide',
      'glipizide','glyburide','pioglitazone','insulin','diabetesMed']

### 2. Light GBM
* identifying optimal hyper-parameters
* performance

In [7]:
objective_l = []
metric_l = []
boosting_type = []
lambda_l1_l = []
lambda_l2_l = []
learning_rate_l = []
max_depth_l = []
min_data_in_leaf_l = []
n_estimators_l = []
split_l = []
auc_train_l = []
f1_train_l = []
auc_test_l = []
f1_test_l = []

for lambda_l1_i in [0,0.1,0.2,0.5,1]:
    for lambda_l2_i in [0,0.1,0.2,0.5,1]:
        for learning_rate_i in tqdm([0.1,0.2,0.5,1], colour='blue'):
            for max_depth_i in [2,5,10,20,50]:
                for min_data_in_leaf_i in [10,20,50,100]:
                    for n_estimators_i in [10,20,50,100]:
                        for i in [0,1,2]:

                            model = lgb.LGBMClassifier(objective='binary',metric='auc',boosting_type='gbdt',
                                                    monotone_constraints=mc,categorical_feature=ic,missing='?',
                                                    lambda_l1=lambda_l1_i,lambda_l2=lambda_l2_i,learning_rate=learning_rate_i,
                                                    max_depth=max_depth_i,min_data_in_leaf=min_data_in_leaf_i,n_estimators=n_estimators_i,
                                                    random_state=0,verbose=-1)
                            objective_l.append('binary')
                            metric_l.append('auc')
                            boosting_type.append('gbdt')
                            lambda_l1_l.append(lambda_l1_i)
                            lambda_l2_l.append(lambda_l2_i)
                            learning_rate_l.append(learning_rate_i)
                            max_depth_l.append(max_depth_i)
                            min_data_in_leaf_l.append(min_data_in_leaf_i)
                            n_estimators_l.append(n_estimators_i)
                            split_l.append(i)

                            if i==0:
                                train = pd.concat([s1,s2])
                                test = s0
                            if i==1:
                                train = pd.concat([s2,s0])
                                test = s1
                            if i==2:
                                train = pd.concat([s0,s1])
                                test = s2
                            for j in ic:
                                train[j] = train[j].astype('category')
                                test[j] = test[j].astype('category')
                                
                            y = train['readmitted']
                            x = train.drop(['readmitted'], axis=1)
                            model.fit(x,y)
                            pred2 = []
                            pred1 = model.predict_proba(x)[:,1]
                            a1 = roc_auc_score(y,pred1)
                            for j in pred1:
                                if j > 0.30: pred2.append(1)
                                else: pred2.append(0)
                            c1 = confusion_matrix(y,pred2)
                            p = c1[1][1] / (c1[0][1]+c1[1][1])
                            r = c1[1][1] / (c1[1][0]+c1[1][1])
                            f1 = (2*p*r) / (p+r)
                            auc_train_l.append(a1)
                            f1_train_l.append(f1)
                            
                            y = test['readmitted']
                            x = test.drop(['readmitted'], axis=1)
                            pred2 = []
                            pred1 = model.predict_proba(x)[:,1]
                            a1 = roc_auc_score(y,pred1)
                            for j in pred1:
                                if j > 0.30: pred2.append(1)
                                else: pred2.append(0)
                            c1 = confusion_matrix(y,pred2)
                            p = c1[1][1] / (c1[0][1]+c1[1][1])
                            r = c1[1][1] / (c1[1][0]+c1[1][1])
                            f1 = (2*p*r) / (p+r)
                            auc_test_l.append(a1)
                            f1_test_l.append(f1)
                            

100%|[34m██████████[0m| 4/4 [01:10<00:00, 17.72s/it]
100%|[34m██████████[0m| 4/4 [01:23<00:00, 20.88s/it]
100%|[34m██████████[0m| 4/4 [01:32<00:00, 23.07s/it]
100%|[34m██████████[0m| 4/4 [01:32<00:00, 23.20s/it]
100%|[34m██████████[0m| 4/4 [01:34<00:00, 23.53s/it]
100%|[34m██████████[0m| 4/4 [01:33<00:00, 23.42s/it]
100%|[34m██████████[0m| 4/4 [01:34<00:00, 23.59s/it]
100%|[34m██████████[0m| 4/4 [01:34<00:00, 23.70s/it]
100%|[34m██████████[0m| 4/4 [01:34<00:00, 23.69s/it]
100%|[34m██████████[0m| 4/4 [01:35<00:00, 23.93s/it]
100%|[34m██████████[0m| 4/4 [01:38<00:00, 24.59s/it]
100%|[34m██████████[0m| 4/4 [01:38<00:00, 24.60s/it]
100%|[34m██████████[0m| 4/4 [01:40<00:00, 25.07s/it]
100%|[34m██████████[0m| 4/4 [01:40<00:00, 25.09s/it]
100%|[34m██████████[0m| 4/4 [01:39<00:00, 24.96s/it]
100%|[34m██████████[0m| 4/4 [01:40<00:00, 25.07s/it]
100%|[34m██████████[0m| 4/4 [01:40<00:00, 25.09s/it]
100%|[34m██████████[0m| 4/4 [01:41<00:00, 25.32s/it]
100%|[34m

### 3. test
* adding monotonic constraints
* performance

In [8]:
df = pd.DataFrame({'objective':objective_l, 'metric':metric_l, 'lambda_l1': lambda_l1_l, 'lambda_l2':lambda_l2_l,
                   'learning_rate':learning_rate_l, 'max_depth':max_depth_l, 'min_data_in_leaf':min_data_in_leaf_l,
                   'n_estimators':n_estimators_l, 'split':split_l, 'auc_train':auc_train_l, 'f1_train':f1_train_l,
                   'auc_test':auc_test_l, 'f1_test':f1_test_l})
gb = df.groupby(['objective','metric','lambda_l1','lambda_l2','learning_rate','max_depth','min_data_in_leaf',
                 'n_estimators']).agg({'auc_train':'mean','f1_train':'mean','auc_test':'mean','f1_test':'mean'}).reset_index()
gb.to_csv('03 hyper-parameters.csv', index=False)
gb.shape

(8000, 12)