# GBDT - LightGBM

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

In [3]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file, drop_cols=None,
                                     num_trees_grid=(200,), learning_rate_grid=(0.05,),
                                     num_leaves_grid=(24, 2**8)):
    """Train one LightGBM regression model per hyper-parameter combination.

    For every (num_trees, learning_rate, num_leaves) combination a model is
    trained with the 'regression' objective and max_depth 8, evaluated on both
    the training and the validation set, and saved to disk. The round with the
    lowest validation 'l2' is recorded and encoded in the model file name.

    Parameters
    ----------
    train_file, valid_file, test_file : str
        Paths forwarded to ``load_atk_train_valid_test``.
    output_model_file : str
        Prefix of the saved model files; the suffix
        ``_T<trees>_S<rate*1000>_L<leaves>_R<best round>.model`` is appended.
    drop_cols : list of str, optional
        Columns removed from all three splits before training.
    num_trees_grid, learning_rate_grid, num_leaves_grid : iterable
        Values explored by the grid search. The defaults reproduce the grid
        that used to be hard-coded in the loops.

    Returns
    -------
    pandas.DataFrame
        One row per trained model with the columns 'num_trees',
        'learning_rate', 'num_leaves', 'best_round', 'metric' and 'filename'.

    Raises
    ------
    AssertionError
        If the training frame contains an "instance_id" column.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append is quadratic and no longer exists in pandas >= 2.0.
    records = []

    # load train/valid/test
    # NOTE(review): another cell of this notebook unpacks FOUR values from this
    # loader; confirm which arity matches the installed nilib.
    train, valid, test = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # dropping some of the columns (rebinding instead of mutating in place, so
    # the frames handed back by the loader are left untouched)
    if drop_cols is not None:
        print ("Dropping columns:", drop_cols)
        train = train.drop(columns=drop_cols)
        valid = valid.drop(columns=drop_cols)
        # The test split is not used below; it is still dropped so that a
        # misspelled column name fails the same way for all three splits.
        test = test.drop(columns=drop_cols)

    # Make sure the destination directory exists before the first save_model.
    out_dir = os.path.dirname(output_model_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for num_trees in num_trees_grid:
        for learning_rate in learning_rate_grid:
            for num_leaves in num_leaves_grid:
                # datasets: every column but the last is a feature, the last
                # column is the label
                lgbm_train = lightgbm.Dataset(data=train.iloc[:, :-1],
                                              label=train.iloc[:, -1])

                lgbm_valid = lightgbm.Dataset(data=valid.iloc[:, :-1],
                                              label=valid.iloc[:, -1])

                # run train
                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves,
                               'max_depth': 8,
                               'objective': 'regression'}
                lgbm_info = {}
                # NOTE(review): `evals_result` / `verbose_eval` are the legacy
                # keyword form; newer LightGBM releases expect the
                # record_evaluation / log_evaluation callbacks instead.
                # Confirm against the installed version before upgrading.
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round=num_trees,
                                            evals_result=lgbm_info,
                                            valid_sets=[lgbm_train, lgbm_valid],
                                            valid_names=['train', 'valid'],
                                            verbose_eval=50)

                valid_l2 = lgbm_info['valid']['l2']
                best_valid_iter = int(np.argmin(valid_l2))
                best_round = best_valid_iter + 1

                # round() instead of plain truncation: 0.029 * 1000 evaluates
                # to 28.999... and would otherwise be encoded as S0028.
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                            num_trees,
                                                                            int(round(learning_rate * 1000)),
                                                                            num_leaves,
                                                                            best_round)

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_round,
                                'metric': valid_l2[best_valid_iter],
                                'filename': model_file_name})

                # NOTE(review): save_model is called with its default
                # arguments; the R<round> suffix only records the best
                # validation round - confirm whether the stored model is
                # expected to be truncated to that round.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=result_columns)

# WINE Dataset

In [4]:
# Locations of the wine dataset splits and of the model files derived from it.
DATASET_NAME = "wine"

# Input data and attack files.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Output prefixes for the standard ("std") and reduced ("red") models.
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [5]:
# FULL GBDT: grid-train on every feature of the currently configured dataset.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Loading pre-processed files...
[50]	train's l2: 0.504397	valid's l2: 0.619868
[100]	train's l2: 0.422295	valid's l2: 0.592451
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L24_R100.model
[50]	train's l2: 0.378532	valid's l2: 0.595098
[100]	train's l2: 0.31194	valid's l2: 0.577067
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L1024_R97.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           0.05         24        100  0.592451   
1       100           0.05       1024         97  0.576870   

                                            filename  
0  ../out/models/wine/std-gbdt_wine_T100_S0050_L2...  
1  ../out/models/wine/std-gbdt_wine_T100_S0050_L1...  
best model is: ../out/models/wine/std-gbdt_wine_T100_S0050_L1024_R97.model


In [6]:
# REDUCED GBDT: same grid search, with three wine features removed first.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=["alcohol", "residual_sugar", "volatile_acidity"],
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/wine/train.csv.bz2
Loading: ../data/wine/valid.csv.bz2
Loading: ../data/wine/test.csv.bz2
Train/Valid/Test sizes: (4547, 13) (650, 13) (1300, 13)
Train/Valid/Test split: 0.70 0.10 0.20
CatFX: []
Train/Valid/Test sizes: (3898, 13) (1299, 13) (1300, 13)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
Dropping columns: ['alcohol', 'residual_sugar', 'volatile_acidity']
[50]	train's l2: 0.566579	valid's l2: 0.675114
[100]	train's l2: 0.479925	valid's l2: 0.644402
[150]	train's l2: 0.422051	valid's l2: 0.634787
[200]	train's l2: 0.380104	valid's l2: 0.629002
Model saved to ../out/models/wine/red-gbdt_wine_T200_S0050_L24_R198.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       200           0.05         24        198  0.628488   

                                            filename  
0  ../out/models/wine/red-gbdt_wine_T200_S0050_L2...  
best model is: ../out/models/wine/red-gbdt_wine_T200_S005

# CENSUS Dataset

In [6]:
# Locations of the census dataset splits and of the model files derived from it.
DATASET_NAME = "census"

# Input data and attack files.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Output prefixes for the standard ("std") and reduced ("red") models.
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [7]:
# FULL GBDT: grid-train on every feature of the currently configured dataset.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/census/train.csv.bz2
Loading: ../data/census/valid.csv.bz2
Loading: ../data/census/test.csv.bz2
Train/Valid/Test sizes: (27144, 14) (3017, 14) (15059, 14)
Train/Valid/Test split: 0.60 0.07 0.33
CatFX: ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
Train/Valid/Test sizes: (27132, 14) (9044, 14) (9044, 14)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2




[50]	train's l2: 0.371109	valid's l2: 0.383421
[100]	train's l2: 0.34835	valid's l2: 0.3708
[150]	train's l2: 0.338835	valid's l2: 0.368296
[200]	train's l2: 0.33204	valid's l2: 0.367613
Model saved to ../out/models/census/std-gbdt_census_T200_S0050_L24_R195.model
[50]	train's l2: 0.349592	valid's l2: 0.377621
[100]	train's l2: 0.320288	valid's l2: 0.368512
[150]	train's l2: 0.308512	valid's l2: 0.368319
[200]	train's l2: 0.298505	valid's l2: 0.369021
Model saved to ../out/models/census/std-gbdt_census_T200_S0050_L256_R122.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       200           0.05         24        195  0.367547   
1       200           0.05        256        122  0.367793   

                                            filename  
0  ../out/models/census/std-gbdt_census_T200_S005...  
1  ../out/models/census/std-gbdt_census_T200_S005...  
best model is: ../out/models/census/std-gbdt_census_T200_S0050_L24_R195.model


In [7]:
# REDUCED GBDT: same grid search, with six census features removed first.
# NOTE(review): the recorded run of this cell stopped with a ValueError about
# non-numeric dtypes in relationship, race, sex and native_country.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'workclass', 'marital_status',
        'occupation', 'education_num',
        'hours_per_week', 'capital_gain',
    ],
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Loading pre-processed files...
Dropping columns: ['workclass', 'marital_status', 'occupation', 'education_num', 'hours_per_week', 'capital_gain']


ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields relationship, race, sex, native_country

# SPAM Dataset

In [10]:
# Locations of the spam dataset splits and of the model files derived from it.
DATASET_NAME = "spam"

# Input data and attack files.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Output prefixes for the standard ("std") and reduced ("red") models.
MODELS_DIR = "../out/models/" + DATASET_NAME
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [11]:
# FULL GBDT: grid-train on every feature of the currently configured dataset.
# NOTE(review): the recorded run of this cell stopped with FileNotFoundError
# for '../data/spam/train.csv.bz2'.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/spam/train.csv.bz2
Loading: ../data/spam/valid.csv.bz2
Loading: ../data/spam/test.csv.bz2


FileNotFoundError: [Errno 2] No such file or directory: '../data/spam/train.csv.bz2'

In [None]:
# REDUCED GBDT: same grid search, with six spam features removed first.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'char_freq_!', 'word_freq_remove',
        'char_freq_$', 'capital_run_length_average',
        'capital_run_length_total', 'word_freq_hp',
    ],
)

# Store the grid-search summary next to the saved models.
experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

# Show all runs, then the file name of the run with the lowest metric.
print(experiments)
print ('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# Reload the splits of the currently configured dataset and keep only the
# first returned object (the training frame); the other three are discarded.
# NOTE(review): four values are unpacked here, whereas
# train_gradient_boosting_baseline unpacks three from the same loader -
# confirm which arity is current.
TRAIN, _, _, _ = load_atk_train_valid_test(
    TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME
)
print(TRAIN.columns)

In [None]:
# Summary statistics of word_freq_hp over the rows where it is at least 1.
TRAIN.loc[TRAIN['word_freq_hp'] >= 1, 'word_freq_hp'].describe()

In [None]:
###------------_####
def print_fx_imp(model, colnames):
    """Print the features of `model` ordered by decreasing gain importance.

    Each printed row holds the rank, the feature name looked up in `colnames`
    by feature index, the 'gain' importance and the 'split' importance.
    """
    split_counts = model.feature_importance(importance_type='split')
    gains = model.feature_importance(importance_type='gain')

    # argsort is ascending, so the reversed order lists the largest gain first
    ranking = np.argsort(gains)[::-1]
    row_template = "{:2d} {:20s} {:.3f} {:4d}"

    for rank, feat_idx in enumerate(ranking):
        row = row_template.format(rank, colnames[feat_idx], gains[feat_idx], split_counts[feat_idx])
        print(row)

        

# Feature-importance report of the full spam model, named with all columns
# of TRAIN.
print(" -- GDBT --")
gbdt = lightgbm.Booster(model_file="../out/models/spam/std-gbdt_spam_T100_S0050_L24_R99.model")
print(gbdt.num_trees())
print_fx_imp(gbdt, TRAIN.columns)

# Same report for the reduced spam model: the six removed features are taken
# out of the column list first, so indices line up with the model's features.
print(" -- Reduced GDBT --")
redf = lightgbm.Booster(model_file="../out/models/spam/red-gbdt_spam_T100_S0050_L24_R92.model")
print(redf.num_trees())
print_fx_imp(
    redf,
    TRAIN.drop(columns=[
        'char_freq_!', 'word_freq_remove',
        'char_freq_$', 'capital_run_length_average',
        'capital_run_length_total', 'word_freq_hp',
    ]).columns,
)


# print(" -- Adv. Boosting --")    
# advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
# print(advb.num_trees())
# print_fx_imp(advb, TRAIN.columns)
