# GBDT - LightGBM

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local helper modules are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

In [3]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file, drop_cols=None):
    """Train a small grid of LightGBM regression models and save each of them.

    The three splits are loaded through ``load_atk_train_valid_test``; the last
    column of each frame is used as the label and all preceding columns as
    features. One model is trained and written to disk per grid configuration.

    Parameters
    ----------
    train_file, valid_file, test_file : str
        Paths forwarded unchanged to ``load_atk_train_valid_test``.
    output_model_file : str
        Prefix of the saved model files; a suffix encoding the number of trees,
        learning rate, number of leaves and best validation round is appended.
    drop_cols : list of str, optional
        Column names removed from all three splits before training.

    Returns
    -------
    pandas.DataFrame
        One row per trained configuration with columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``best_round``, ``metric`` (validation
        l2 at the best round) and ``filename``.

    Raises
    ------
    AssertionError
        If the training frame contains an ``instance_id`` column.
    """
    exp_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are collected in a plain list and turned into a frame once at the end:
    # DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
    exp_records = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # dropping some of the columns (non-mutating, so the frames returned by the
    # loader are never modified behind its back)
    if drop_cols is not None:
        print ("Dropping columns:", drop_cols)
        train = train.drop(columns=drop_cols)
        valid = valid.drop(columns=drop_cols)
        # test is not used for training; dropping still checks that the columns exist.
        test = test.drop(columns=drop_cols)

    # positional index of the categorical features among the remaining columns
    cat_fx = [int(pos) for pos in np.where(train.columns.isin(cat_fx))[0]]
    print("CatFX:", train.columns.values[cat_fx])

    # Feature/label arrays do not depend on the hyper-parameters: extract them once.
    train_x, train_y = train.iloc[:, :-1].values, train.iloc[:, -1].values
    valid_x, valid_y = valid.iloc[:, :-1].values, valid.iloc[:, -1].values

    for num_trees in [100, 500]:
        for learning_rate in [0.05]: #[0.01, 0.05]:
            for num_leaves in [24]: #[16, 24]:
                # fresh datasets for every run
                lgbm_train = lightgbm.Dataset(data=train_x,
                                              label=train_y,
                                              categorical_feature=cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid_x,
                                              label=valid_y,
                                              categorical_feature=cat_fx)

                # run train
                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves,
                               'objective': 'regression'}
                lgbm_info = {}
                # NOTE(review): `evals_result` / `verbose_eval` appear to have been
                # replaced by callbacks in recent LightGBM releases - confirm
                # against the installed version before upgrading.
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round = num_trees,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid],
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 50)

                # 0-based boosting round with the lowest validation l2
                valid_l2 = lgbm_info['valid']['l2']
                best_valid_iter = int(np.argmin(valid_l2))
                best_round = best_valid_iter + 1

                # round() instead of bare int() so floating-point error in
                # learning_rate*1000 cannot truncate to the wrong tag
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                            num_trees,
                                                                            int(round(learning_rate * 1000)),
                                                                            num_leaves,
                                                                            best_round)

                # update experimental results
                exp_records.append({'num_trees': num_trees,
                                    'learning_rate': learning_rate,
                                    'num_leaves': num_leaves,
                                    'best_round': best_round,
                                    'metric': valid_l2[best_valid_iter],
                                    'filename': model_file_name})

                # NOTE(review): save_model is called without an iteration limit, so
                # the file presumably holds all trained rounds; the _R suffix only
                # records the best validation round - confirm this is intended.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(exp_records, columns=exp_columns)

# WINE Dataset

In [4]:
# Relative locations of the wine data, its attack files and the model outputs.
DATASET_NAME = "wine"

# Inputs
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Outputs: filename prefixes for the standard and the reduced-feature models
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [5]:
# FULL GBDT: train on every feature, store the experiment table, report the best model.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/wine/train.csv.bz2
Loading: ../data/wine/valid.csv.bz2
Loading: ../data/wine/test.csv.bz2
Train/Valid/Test sizes: (4547, 13) (650, 13) (1300, 13)
Train/Valid/Test split: 0.70 0.10 0.20
CatFX: []
Train/Valid/Test sizes: (3898, 13) (1299, 13) (1300, 13)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
CatFX: []




[50]	train's l2: 0.0696087	valid's l2: 0.100541
[100]	train's l2: 0.0521307	valid's l2: 0.0955715
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L24_R100.model
[50]	train's l2: 0.0696087	valid's l2: 0.100541
[100]	train's l2: 0.0521307	valid's l2: 0.0955715
[150]	train's l2: 0.042543	valid's l2: 0.0940157
[200]	train's l2: 0.0351522	valid's l2: 0.0935352
[250]	train's l2: 0.0296449	valid's l2: 0.0925692
[300]	train's l2: 0.0254733	valid's l2: 0.0919512
[350]	train's l2: 0.0222848	valid's l2: 0.0913982
[400]	train's l2: 0.0198003	valid's l2: 0.0909062
[450]	train's l2: 0.0178177	valid's l2: 0.0902531
[500]	train's l2: 0.0159348	valid's l2: 0.0897776
Model saved to ../out/models/wine/std-gbdt_wine_T500_S0050_L24_R497.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.095571   
1       500           0.05         24        497  0.089722   

                                            filename  
0  ../out/model

In [6]:
# REDUCED GBDT: same grid, trained without the listed wine features.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=["alcohol", "residual_sugar", "volatile_acidity"],
)

experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/wine/train.csv.bz2
Loading: ../data/wine/valid.csv.bz2
Loading: ../data/wine/test.csv.bz2
Train/Valid/Test sizes: (4547, 13) (650, 13) (1300, 13)
Train/Valid/Test split: 0.70 0.10 0.20
CatFX: []
Train/Valid/Test sizes: (3898, 13) (1299, 13) (1300, 13)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
Dropping columns: ['alcohol', 'residual_sugar', 'volatile_acidity']
CatFX: []
[50]	train's l2: 0.0779232	valid's l2: 0.110961
[100]	train's l2: 0.0623714	valid's l2: 0.107046
Model saved to ../out/models/wine/red-gbdt_wine_T100_S0050_L24_R100.model
[50]	train's l2: 0.0779232	valid's l2: 0.110961
[100]	train's l2: 0.0623714	valid's l2: 0.107046
[150]	train's l2: 0.0524584	valid's l2: 0.103999
[200]	train's l2: 0.0447081	valid's l2: 0.102165
[250]	train's l2: 0.0388256	valid's l2: 0.100769
[300]	train's l2: 0.034352	valid's l2: 0.0997083
[350]	train's l2: 0.0306908	valid's l2: 0.0987698
[400]	train's l2: 0.027596	valid's

# CENSUS Dataset

In [7]:
# Relative locations of the census data, its attack files and the model outputs.
DATASET_NAME = "census"

# Inputs
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Outputs: filename prefixes for the standard and the reduced-feature models
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [8]:
# FULL GBDT: train on every feature, store the experiment table, report the best model.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/census/train.csv.bz2
Loading: ../data/census/valid.csv.bz2
Loading: ../data/census/test.csv.bz2
Train/Valid/Test sizes: (27144, 14) (3017, 14) (15059, 14)
Train/Valid/Test split: 0.60 0.07 0.33
CatFX: ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
Train/Valid/Test sizes: (27132, 14) (9044, 14) (9044, 14)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']




[50]	train's l2: 0.370145	valid's l2: 0.382836
[100]	train's l2: 0.346384	valid's l2: 0.369207
Model saved to ../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model
[50]	train's l2: 0.370145	valid's l2: 0.382836
[100]	train's l2: 0.346384	valid's l2: 0.369207
[150]	train's l2: 0.337393	valid's l2: 0.367081
[200]	train's l2: 0.330219	valid's l2: 0.366698
[250]	train's l2: 0.324314	valid's l2: 0.366915
[300]	train's l2: 0.31865	valid's l2: 0.367649
[350]	train's l2: 0.313778	valid's l2: 0.368106
[400]	train's l2: 0.308633	valid's l2: 0.368487
[450]	train's l2: 0.303494	valid's l2: 0.368954
[500]	train's l2: 0.299372	valid's l2: 0.369278
Model saved to ../out/models/census/std-gbdt_census_T500_S0050_L24_R207.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.369207   
1       500           0.05         24        207  0.366592   

                                            filename  
0  ../out/models/census/std-g

In [9]:
# REDUCED GBDT: same grid, trained without the listed census features.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'workclass',
        'marital_status',
        'occupation',
        'education_num',
        'hours_per_week',
        'capital_gain',
    ],
)

experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Loading pre-processed files...
Dropping columns: ['workclass', 'marital_status', 'occupation', 'education_num', 'hours_per_week', 'capital_gain']
CatFX: ['relationship' 'race' 'sex' 'native_country']
[50]	train's l2: 0.515896	valid's l2: 0.524046
[100]	train's l2: 0.503698	valid's l2: 0.52112
Model saved to ../out/models/census/red-gbdt_census_T100_S0050_L24_R98.model
[50]	train's l2: 0.515896	valid's l2: 0.524046
[100]	train's l2: 0.503698	valid's l2: 0.52112
[150]	train's l2: 0.496748	valid's l2: 0.521476
[200]	train's l2: 0.490996	valid's l2: 0.521882
[250]	train's l2: 0.486288	valid's l2: 0.522456
[300]	train's l2: 0.482528	valid's l2: 0.523083
[350]	train's l2: 0.478771	valid's l2: 0.523674
[400]	train's l2: 0.475296	valid's l2: 0.524305
[450]	train's l2: 0.471838	valid's l2: 0.524926
[500]	train's l2: 0.46855	valid's l2: 0.525744
Model saved to ../out/models/census/red-gbdt_census_T500_S0050_L24_R123.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100 

# SPAM Dataset

In [15]:
# Relative locations of the spam data, its attack files and the model outputs.
DATASET_NAME = "spam"

# Inputs
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Outputs: filename prefixes for the standard and the reduced-feature models
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [17]:
# FULL GBDT: train on every feature, store the experiment table, report the best model.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/spam/train.csv.bz2
Loading: ../data/spam/valid.csv.bz2
Loading: ../data/spam/test.csv.bz2
Train/Valid/Test sizes: (2760, 58) (921, 58) (920, 58)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (2760, 58) (920, 58) (921, 58)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
CatFX: []
[50]	train's l2: 0.0348443	valid's l2: 0.0558493
[100]	train's l2: 0.0233714	valid's l2: 0.0484396
Model saved to ../out/models/spam/std-gbdt_spam_T100_S0050_L24_R99.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24         99  0.048388   

                                            filename  
0  ../out/models/spam/std-gbdt_spam_T100_S0050_L2...  
best model is: ../out/models/spam/std-gbdt_spam_T100_S0050_L24_R99.model


In [21]:
# REDUCED GBDT: same grid, trained without the listed spam features.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'char_freq_!',
        'word_freq_remove',
        'char_freq_$',
        'capital_run_length_average',
        'capital_run_length_total',
        'word_freq_hp',
    ],
)

experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

print(experiments)
# After sorting by the validation metric, the first row is the best model.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/spam/train.csv.bz2
Loading: ../data/spam/valid.csv.bz2
Loading: ../data/spam/test.csv.bz2
Train/Valid/Test sizes: (2760, 58) (921, 58) (920, 58)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (2760, 58) (920, 58) (921, 58)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
Dropping columns: ['char_freq_!', 'word_freq_remove', 'char_freq_$', 'capital_run_length_average', 'capital_run_length_total', 'word_freq_hp']
CatFX: []
[50]	train's l2: 0.0434418	valid's l2: 0.0624735
[100]	train's l2: 0.0310334	valid's l2: 0.0554811
Model saved to ../out/models/spam/red-gbdt_spam_T100_S0050_L24_R92.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24         92  0.055465   

                                            filename  
0  ../out/models/spam/red-gbdt_spam_T100_S0050_L2...  
best model is: ../out/models/spam/red-gbdt_spam_T100_S0050_L24_R92.mo

In [27]:
# Reload the splits and keep only the first returned frame (the training set)
# so its column names can be inspected.
TRAIN, _, _, _ = load_atk_train_valid_test(
    TRAINING_FILENAME,
    VALIDATION_FILENAME,
    TEST_FILENAME,
)
print(TRAIN.columns)

Pre-processing original files...
Loading: ../data/spam/train.csv.bz2
Loading: ../data/spam/valid.csv.bz2
Loading: ../data/spam/test.csv.bz2
Train/Valid/Test sizes: (2760, 58) (921, 58) (920, 58)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (2760, 58) (920, 58) (921, 58)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
Index(['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
       'word_freq_our', 'word_freq_over', 'word_freq_remove',
       'word_freq_internet', 'word_freq_order', 'word_freq_mail',
       'word_freq_receive', 'word_freq_will', 'word_freq_people',
       'word_freq_report', 'word_freq_addresses', 'word_freq_free',
       'word_freq_business', 'word_freq_email', 'word_freq_you',
       'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
       'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
       'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word

In [58]:
# Summary statistics of word_freq_hp restricted to training rows where it is >= 1.
TRAIN.loc[TRAIN['word_freq_hp'] >= 1, 'word_freq_hp'].describe()

count    417.000000
mean       3.370048
std        3.132073
min        1.000000
25%        1.610000
50%        2.320000
75%        3.520000
max       20.830000
Name: word_freq_hp, dtype: float64

In [26]:
###------------_####
def print_fx_imp(model, colnames):
    """Print one line per feature, ordered by decreasing gain importance.

    ``model`` must expose ``feature_importance(importance_type=...)`` for both
    ``'split'`` and ``'gain'``; ``colnames`` is indexed by feature position.
    Each line shows the rank, the feature name, its gain and its split count.
    """
    split_counts = model.feature_importance(importance_type='split')
    gains = model.feature_importance(importance_type='gain')

    row_format = "{:2d} {:20s} {:.3f} {:4d}"
    # argsort is ascending; reverse it to list the highest gain first
    by_decreasing_gain = np.argsort(gains)[::-1]
    for rank, fx_idx in enumerate(by_decreasing_gain):
        print (row_format.format(rank, colnames[fx_idx], gains[fx_idx], split_counts[fx_idx]))

        

# Standard model: tree count, then features ranked by gain.
print(" -- GDBT --")
gbdt = lightgbm.Booster(model_file="../out/models/spam/std-gbdt_spam_T100_S0050_L24_R99.model")
print(gbdt.num_trees())
print_fx_imp(gbdt, TRAIN.columns)

# Reduced model: column names are the training columns minus the dropped
# features, so positions match the features the model was trained on.
print(" -- Reduced GDBT --")
redf = lightgbm.Booster(model_file="../out/models/spam/red-gbdt_spam_T100_S0050_L24_R92.model")
print(redf.num_trees())
print_fx_imp(
    redf,
    TRAIN.columns.drop([
        'char_freq_!',
        'word_freq_remove',
        'char_freq_$',
        'capital_run_length_average',
        'capital_run_length_total',
        'word_freq_hp',
    ]),
)


# print(" -- Adv. Boosting --")    
# advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
# print(advb.num_trees())
# print_fx_imp(advb, TRAIN.columns)


 -- GDBT --
100
 0 char_freq_!          1675.403  179
 1 word_freq_remove     898.772   68
 2 char_freq_$          884.417   72
 3 capital_run_length_average 417.196  172
 4 capital_run_length_total 340.651  163
 5 word_freq_hp         336.482  104
 6 word_freq_free       180.480   90
 7 word_freq_money      175.816   25
 8 capital_run_length_longest 171.871  158
 9 word_freq_your       158.627  106
10 word_freq_our        151.961  110
11 word_freq_edu        128.443   74
12 word_freq_george     92.992   75
13 word_freq_you        72.602  124
14 word_freq_business   64.007   47
15 word_freq_000        41.320   34
16 word_freq_re         32.453   54
17 char_freq_(          31.598   76
18 word_freq_will       26.902   70
19 word_freq_1999       26.525   35
20 word_freq_hpl        20.583    6
21 word_freq_internet   20.143   23
22 word_freq_meeting    18.177   39
23 word_freq_technology 13.427   16
24 char_freq_#          13.078   27
25 word_freq_over       12.347   44
26 word_freq_font  