# GBDT - LightGBM

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

In [3]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file, 
                                     drop_cols=None, random_forest = False,
                                     num_trees_grid=(100,), learning_rate_grid=(0.05,),
                                     num_leaves_grid=(24, 2**8)):
    """Train a small grid of LightGBM regression models and save every one of them.

    For each (num_trees, learning_rate, num_leaves) combination a model is trained
    on the training split, evaluated on the validation split with the `l2` metric,
    and written to disk. The last column of each split is used as the label and all
    preceding columns as features.

    Parameters
    ----------
    train_file, valid_file, test_file : str
        Paths handed to `load_atk_train_valid_test`.
    output_model_file : str
        Prefix of the saved model files; the hyper-parameters and the best
        validation round are appended to it (`<prefix>_T.._S.._L.._R...model`).
    drop_cols : list of str, optional
        Columns removed from every split before training.
    random_forest : bool
        When True, LightGBM is run in `rf` boosting mode with 0.8 bagging and
        feature fractions.
    num_trees_grid, learning_rate_grid, num_leaves_grid : iterable
        Values to sweep. The defaults reproduce the grid that used to be
        hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        One row per trained model with columns `num_trees`, `learning_rate`,
        `num_leaves`, `best_round`, `metric` (best validation l2) and `filename`.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']

    # load train/valid/test
    # Only the first three returned items are used; slicing keeps this working
    # if the loader also returns extra items after the three splits.
    train, valid, test = load_atk_train_valid_test(train_file, valid_file, test_file)[:3]

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # dropping some of the columns
    if drop_cols is not None:
        print ("Dropping columns:", drop_cols)
        train = train.drop(columns=drop_cols)
        valid = valid.drop(columns=drop_cols)
        # `test` is not used for training; it is still reduced so that a wrong
        # column name fails for all three splits, exactly as before.
        test = test.drop(columns=drop_cols)

    # save_model() does not create missing directories.
    out_dir = os.path.dirname(output_model_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # `log_evaluation` replaced `print_evaluation` in LightGBM 3.3.
    log_evaluation = getattr(lightgbm, "log_evaluation", None) or lightgbm.print_evaluation

    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0.
    records = []

    for num_trees in num_trees_grid:
        for learning_rate in learning_rate_grid:
            for num_leaves in num_leaves_grid:
                # datasets (rebuilt for every run so no state is shared between models)
                lgbm_train = lightgbm.Dataset(data=train.iloc[:,:-1], 
                                              label=train.iloc[:,-1])

                lgbm_valid = lightgbm.Dataset(data=valid.iloc[:,:-1], 
                                              label=valid.iloc[:,-1])

                # run train
                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves,
                                'max_depth': 8,
                                'objective': 'regression'
                              } 
                if random_forest:
                    lgbm_params['boosting'] = 'rf'
                    lgbm_params['bagging_fraction'] = 0.8
                    lgbm_params['feature_fraction' ] = 0.8
                    lgbm_params['bagging_freq'] = 1

                # The evaluation history is recorded through callbacks; the
                # `evals_result=` / `verbose_eval=` keywords were removed from
                # lightgbm.train in LightGBM 4.0.
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                            num_boost_round = num_trees,
                                            valid_sets      = [lgbm_train, lgbm_valid], 
                                            valid_names     = ['train', 'valid'],
                                            callbacks       = [lightgbm.record_evaluation(lgbm_info),
                                                               log_evaluation(period=50)])

                valid_l2 = lgbm_info['valid']['l2']
                best_round = int(np.argmin(valid_l2)) + 1  # rounds are 1-based

                # round() guards against float error in learning_rate*1000
                # (e.g. 70.00000000000001 or 289.99999999999994).
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                            num_trees,
                                                                            int(round(learning_rate*1000)),
                                                                            num_leaves,
                                                                            best_round
                                                                           )

                # update experimental results
                records.append({'num_trees': num_trees, 
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves, 
                                'best_round': best_round, 
                                'metric': valid_l2[best_round - 1],
                                'filename': model_file_name})

                # NOTE(review): no num_iteration is passed, so the file holds every
                # trained round, not only the first `best_round` ones named in the
                # file name -- confirm this is what downstream consumers expect.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=result_columns)

# WINE Dataset

In [4]:
# Dataset identifier: every path below is derived from it.
DATASET_NAME = "wine"

# Input data: the three splits plus the attacks directory.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Output model prefixes: standard, reduced-feature and random-forest variants.
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RF_MODEL_FILENAME = "{}/rf-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [5]:
# FULL GBDT: train on every available feature.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=MODEL_FILENAME)

# Persist the experiment table next to the saved models.
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...
[50]	train's l2: 0.504397	valid's l2: 0.619868
[100]	train's l2: 0.422295	valid's l2: 0.592451
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L24_R100.model
[50]	train's l2: 0.378532	valid's l2: 0.595098
[100]	train's l2: 0.31194	valid's l2: 0.577067
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L256_R97.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.592451   
1       100           0.05        256         97  0.576870   

                                            filename  
0  ../out/models/wine/std-gbdt_wine_T100_S0050_L2...  
1  ../out/models/wine/std-gbdt_wine_T100_S0050_L2...  
best model is: ../out/models/wine/std-gbdt_wine_T100_S0050_L256_R97.model


In [6]:
# REDUCED GBDT: same training, with three features removed from every split.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=RED_MODEL_FILENAME,
                                               drop_cols=["alcohol", "residual_sugar", "volatile_acidity"])

# Persist the experiment table next to the saved models.
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...
Dropping columns: ['alcohol', 'residual_sugar', 'volatile_acidity']
[50]	train's l2: 0.566579	valid's l2: 0.675114
[100]	train's l2: 0.479925	valid's l2: 0.644402
Model saved to ../out/models/wine/red-gbdt_wine_T100_S0050_L24_R99.model
[50]	train's l2: 0.449016	valid's l2: 0.65299
[100]	train's l2: 0.361203	valid's l2: 0.631228
Model saved to ../out/models/wine/red-gbdt_wine_T100_S0050_L256_R100.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24         99  0.644273   
1       100           0.05        256        100  0.631228   

                                            filename  
0  ../out/models/wine/red-gbdt_wine_T100_S0050_L2...  
1  ../out/models/wine/red-gbdt_wine_T100_S0050_L2...  
best model is: ../out/models/wine/red-gbdt_wine_T100_S0050_L256_R100.model


In [7]:
# RANDOM FOREST: same grid, with LightGBM switched to its `rf` boosting mode.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=RF_MODEL_FILENAME,
                                               random_forest=True)

# Persist the experiment table next to the saved models.
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...
[50]	train's l2: 0.589619	valid's l2: 0.656884
[100]	train's l2: 0.589057	valid's l2: 0.654269
Model saved to ../out/models/wine/rf-gbdt_wine_T100_S0050_L24_R100.model
[50]	train's l2: 0.499379	valid's l2: 0.621385
[100]	train's l2: 0.497748	valid's l2: 0.618154
Model saved to ../out/models/wine/rf-gbdt_wine_T100_S0050_L256_R100.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.654269   
1       100           0.05        256        100  0.618154   

                                            filename  
0  ../out/models/wine/rf-gbdt_wine_T100_S0050_L24...  
1  ../out/models/wine/rf-gbdt_wine_T100_S0050_L25...  
best model is: ../out/models/wine/rf-gbdt_wine_T100_S0050_L256_R100.model


# CENSUS Dataset

In [10]:
# Dataset identifier: every path below is derived from it.
DATASET_NAME = "census"

# Input data: the three splits plus the attacks directory.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Output model prefixes: standard, reduced-feature and random-forest variants.
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RF_MODEL_FILENAME = "{}/rf-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [5]:
# FULL GBDT: train on every available feature.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=MODEL_FILENAME)

# Persist the experiment table next to the saved models.
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...




[50]	train's l2: 0.371109	valid's l2: 0.383421
[100]	train's l2: 0.34835	valid's l2: 0.3708
Model saved to ../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model
[50]	train's l2: 0.349592	valid's l2: 0.377621
[100]	train's l2: 0.320288	valid's l2: 0.368512
Model saved to ../out/models/census/std-gbdt_census_T100_S0050_L256_R100.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.370800   
1       100           0.05        256        100  0.368512   

                                            filename  
0  ../out/models/census/std-gbdt_census_T100_S005...  
1  ../out/models/census/std-gbdt_census_T100_S005...  
best model is: ../out/models/census/std-gbdt_census_T100_S0050_L256_R100.model


In [6]:
# REDUCED GBDT: same training, with six features removed from every split.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=RED_MODEL_FILENAME,
                                               drop_cols=['workclass', 'marital_status',
                                                          'occupation', 'education_num',
                                                          'hours_per_week', 'capital_gain'])

# Persist the experiment table next to the saved models.
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...
Dropping columns: ['workclass', 'marital_status', 'occupation', 'education_num', 'hours_per_week', 'capital_gain']




[50]	train's l2: 0.516679	valid's l2: 0.524166
[100]	train's l2: 0.506028	valid's l2: 0.521019
Model saved to ../out/models/census/red-gbdt_census_T100_S0050_L24_R95.model
[50]	train's l2: 0.50455	valid's l2: 0.526173
[100]	train's l2: 0.490079	valid's l2: 0.523517
Model saved to ../out/models/census/red-gbdt_census_T100_S0050_L256_R93.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24         95  0.520961   
1       100           0.05        256         93  0.523221   

                                            filename  
0  ../out/models/census/red-gbdt_census_T100_S005...  
1  ../out/models/census/red-gbdt_census_T100_S005...  
best model is: ../out/models/census/red-gbdt_census_T100_S0050_L24_R95.model


In [15]:
# RANDOM FOREST: same grid, with LightGBM switched to its `rf` boosting mode.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=RF_MODEL_FILENAME,
                                               random_forest=True)

# Persist the experiment table next to the saved models.
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Loading pre-processed files...




[50]	train's l2: 0.394099	valid's l2: 0.398269
[100]	train's l2: 0.393485	valid's l2: 0.397635
Model saved to ../out/models/census/rf-gbdt_census_T100_S0050_L24_R100.model
[50]	train's l2: 0.378073	valid's l2: 0.39184
[100]	train's l2: 0.377624	valid's l2: 0.39113
Model saved to ../out/models/census/rf-gbdt_census_T100_S0050_L256_R92.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.397635   
1       100           0.05        256         92  0.391093   

                                            filename  
0  ../out/models/census/rf-gbdt_census_T100_S0050...  
1  ../out/models/census/rf-gbdt_census_T100_S0050...  
best model is: ../out/models/census/rf-gbdt_census_T100_S0050_L256_R92.model


# SPAM

In [10]:
# Dataset identifier: every path below is derived from it.
DATASET_NAME="spam"
DATASET_DIR="../data/{}".format(DATASET_NAME)
ATK_DIR=DATASET_DIR + "/attacks"
MODELS_DIR="../out/models/{}".format(DATASET_NAME)
TRAINING_FILENAME=DATASET_DIR + "/" + "train.csv.bz2"
VALIDATION_FILENAME=DATASET_DIR + "/" + "valid.csv.bz2"
TEST_FILENAME=DATASET_DIR + "/" + "test.csv.bz2"
MODEL_FILENAME=MODELS_DIR + "/std-gbdt_{}".format(DATASET_NAME)
RED_MODEL_FILENAME=MODELS_DIR + "/red-gbdt_{}".format(DATASET_NAME)
# Defined here as in the other dataset sections; otherwise the value left over
# from a previously configured dataset would silently be reused for spam.
RF_MODEL_FILENAME=MODELS_DIR + "/rf-gbdt_{}".format(DATASET_NAME)

In [11]:
# FULL GBDT: train on every available feature.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=MODEL_FILENAME)

# Persist the experiment table next to the saved models.
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

Pre-processing original files...
Loading: ../data/spam/train.csv.bz2
Loading: ../data/spam/valid.csv.bz2
Loading: ../data/spam/test.csv.bz2


FileNotFoundError: [Errno 2] No such file or directory: '../data/spam/train.csv.bz2'

In [None]:
# REDUCED GBDT: same training, with six features removed from every split.
experiments = train_gradient_boosting_baseline(train_file=TRAINING_FILENAME,
                                               valid_file=VALIDATION_FILENAME,
                                               test_file=TEST_FILENAME,
                                               output_model_file=RED_MODEL_FILENAME,
                                               drop_cols=['char_freq_!', 'word_freq_remove',
                                                          'char_freq_$', 'capital_run_length_average',
                                                          'capital_run_length_total', 'word_freq_hp'])

# Persist the experiment table next to the saved models.
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0] )

In [None]:
# Only the training split is needed here. It is taken by position instead of
# unpacking a fixed number of values: elsewhere in this notebook the same loader
# is unpacked into three values, so unpacking four here would raise ValueError.
TRAIN = load_atk_train_valid_test(TRAINING_FILENAME, 
                                  VALIDATION_FILENAME,
                                  TEST_FILENAME)[0]
print (TRAIN.columns)

In [None]:
# Summary statistics of `word_freq_hp`, restricted to rows where it is at least 1.
TRAIN.loc[TRAIN['word_freq_hp'] >= 1, 'word_freq_hp'].describe()

In [None]:
###------------_####
def print_fx_imp(model, colnames):
    """Print the model's features ranked by decreasing gain importance.

    One line is printed per feature: rank (starting at 0), the name looked up in
    `colnames` by feature index, the `gain` importance with three decimals, and
    the `split` importance (how many times the feature was used).
    """
    split_counts = model.feature_importance(importance_type='split')
    gains = model.feature_importance(importance_type='gain')

    row_template = "{:2d} {:20s} {:.3f} {:4d}"
    # argsort is ascending, so reverse it to list the highest gain first
    ranking = np.argsort(gains)[::-1]

    rank = 0
    for fx in ranking:
        print (row_template.format(rank, colnames[fx], gains[fx], split_counts[fx]))
        rank += 1

        

# Feature importance of the standard spam model, named by the TRAIN columns.
print(" -- GDBT --")
gbdt = lightgbm.Booster(model_file="../out/models/spam/std-gbdt_spam_T100_S0050_L24_R99.model")
print(gbdt.num_trees())
print_fx_imp(gbdt, TRAIN.columns)

# Feature importance of the reduced spam model: the six columns listed below
# are removed from the names so that positions line up with the model's features.
print(" -- Reduced GDBT --")
redf = lightgbm.Booster(model_file="../out/models/spam/red-gbdt_spam_T100_S0050_L24_R92.model")
print(redf.num_trees())
print_fx_imp(redf, TRAIN.columns.drop(['char_freq_!', 'word_freq_remove',
                                       'char_freq_$', 'capital_run_length_average',
                                       'capital_run_length_total', 'word_freq_hp']))


# print(" -- Adv. Boosting --")    
# advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
# print(advb.num_trees())
# print_fx_imp(advb, TRAIN.columns)
