# GBDT - LightGBM

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Paths to dataset and model files

In [3]:
# Name of the dataset; the data directory, the models directory and the
# model file prefix defined in the next cell are all derived from it.
DATASET_NAME="census"

In [4]:
# Input and output directories, both keyed by the dataset name.
DATASET_DIR = "../data/" + DATASET_NAME
MODELS_DIR = "../out/models/" + DATASET_NAME

# Train / validation / test splits (bz2-compressed CSV files).
TRAINING_FILENAME = "{}/{}".format(DATASET_DIR, "train_ori.csv.bz2")
VALIDATION_FILENAME = "{}/{}".format(DATASET_DIR, "valid_ori.csv.bz2")
TEST_FILENAME = "{}/{}".format(DATASET_DIR, "test_ori.csv.bz2")

# Prefix for the saved model; the training function appends a suffix to it.
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [5]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file):
    """Grid-search a LightGBM GBDT baseline and save the best model to disk.

    Every (num_trees, learning_rate, num_leaves) combination is trained with
    the custom objective ``optimize_log_loss`` and evaluated with
    ``avg_log_loss`` on both the training and the validation split. For each
    value of ``num_trees`` the configuration whose validation loss reaches the
    lowest minimum is saved, reloaded and re-scored on the validation split.

    Parameters
    ----------
    train_file, valid_file, test_file
        Paths forwarded to ``load_atk_train_valid_test``. The test split is
        loaded but not used by this function.
    output_model_file
        Prefix of the saved model path. The number of trees, the learning
        rate (times 1000), the number of leaves and the best round are
        appended, followed by ``.model``.

    Returns
    -------
    pandas.DataFrame
        One row per trained configuration with the columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``best_round`` (1-based) and
        ``avg_binary_log_loss`` (validation loss at the best round).

    Raises
    ------
    RuntimeError
        If no configuration yields a finite validation loss, so that there is
        no model to save.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # Rows are collected here and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    records = []

    # load train/valid/test (the test split is not used below)
    train, valid, _test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # positional indices of the categorical features
    cat_fx = [int(i) for i in np.where(train.columns.isin(cat_fx))[0]]
    print("CatFX:", train.columns.values[cat_fx])

    # The last column is the label, all preceding columns are features.
    # These arrays do not depend on the hyper-parameters, so extract them once.
    train_x, train_y = train.iloc[:, :-1].values, train.iloc[:, -1].values
    valid_x, valid_y = valid.iloc[:, :-1].values, valid.iloc[:, -1].values

    for num_trees in [200]:
        best_model = None
        best_setting = None
        best_loss = np.inf
        for learning_rate in [0.01, 0.05, 0.1]:
            for num_leaves in [8, 16, 24]:
                # datasets (rebuilt for every run, as in the original code)
                lgbm_train = lightgbm.Dataset(data=train_x,
                                              label=train_y,
                                              categorical_feature=cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid_x,
                                              label=valid_y,
                                              categorical_feature=cat_fx)

                # run train
                # NOTE(review): fobj / evals_result / verbose_eval appear to have
                # been removed from lightgbm.train in LightGBM 4.0 -- confirm the
                # pinned lightgbm version before upgrading.
                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves}
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round = num_trees,
                                            fobj            = optimize_log_loss,
                                            feval           = avg_log_loss,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid],
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 5)

                # best boosting round of this run on the validation split
                valid_losses = lgbm_info['valid']['avg_binary_log_loss']
                best_valid_iter = int(np.argmin(valid_losses))
                run_loss = valid_losses[best_valid_iter]

                # strict '<' keeps the first configuration on ties
                if run_loss < best_loss:
                    best_model = lgbm_model
                    best_loss = run_loss
                    best_setting = {'num_trees': num_trees,
                                    'learning_rate': learning_rate,
                                    'num_leaves': num_leaves,
                                    'best_round': best_valid_iter + 1}

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_valid_iter + 1,
                                'avg_binary_log_loss': run_loss})

        if best_model is None:
            # Happens when every validation loss is NaN or infinite.
            raise RuntimeError("No configuration produced a finite validation loss "
                               "for num_trees={}".format(num_trees))

        # save file
        # round() instead of plain truncation: e.g. 0.029 * 1000 is slightly
        # below 29 in floating point and would otherwise be tagged as 28.
        model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                        best_setting['num_trees'],
                                                                        int(round(best_setting['learning_rate'] * 1000)),
                                                                        best_setting['num_leaves'],
                                                                        best_setting['best_round'])

        # make sure the target directory exists before LightGBM writes to it
        model_dir = os.path.dirname(model_file_name)
        if model_dir:
            os.makedirs(model_dir, exist_ok=True)

        # NOTE(review): save_model is called without num_iteration, so the file
        # presumably contains all boosting rounds and not only the first
        # best_round ones -- confirm that consumers of the R<n> tag expect this.
        best_model.save_model(model_file_name)
        print("Model saved to", model_file_name)

        # reload from disk and re-score as a sanity check of the saved file
        best_model = lightgbm.Booster(model_file=model_file_name)
        print("Check valid score:", avg_log_loss(preds=best_model.predict(valid_x),
                                                  train_data=lgbm_valid))

    return pd.DataFrame(records, columns=result_columns)

In [6]:
# Run the hyper-parameter search; the function saves the best model itself
# and returns one result row per trained configuration.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Store the result table next to the model files.
experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

print(experiments)

Pre-processing original files...
Loading: ../data/census/train_ori.csv.bz2
Loading: ../data/census/valid_ori.csv.bz2
Loading: ../data/census/test_ori.csv.bz2
Train/Valid/Test sizes: (27145, 14) (3017, 14) (15060, 14)
Train/Valid/Test split: 0.60 0.07 0.33
CatFX: ['workclass', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']
Train/Valid/Test sizes: (27133, 14) (9044, 14) (9045, 14)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.cat.bz2
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']
[5]	train's avg_binary_log_loss: 0.669909	valid's avg_binary_log_loss: 0.670078
[10]	train's avg_binary_log_loss: 0.648818	valid's avg_binary_log_loss: 0.649129
[15]	train's avg_binary_log_loss: 0.629588	valid's avg_binary_log_loss: 0.630009
[20]	train's avg_binary_log_loss: 0.611985	valid's avg_binary_log_loss: 0.612517
[25]	train's avg_binary_log_loss: 0.595845	valid's avg_binary_log_loss: 0.59644
[30]	train's



[65]	train's avg_binary_log_loss: 0.503831	valid's avg_binary_log_loss: 0.504231
[70]	train's avg_binary_log_loss: 0.49565	valid's avg_binary_log_loss: 0.495911
[75]	train's avg_binary_log_loss: 0.488029	valid's avg_binary_log_loss: 0.488241
[80]	train's avg_binary_log_loss: 0.480881	valid's avg_binary_log_loss: 0.481055
[85]	train's avg_binary_log_loss: 0.474249	valid's avg_binary_log_loss: 0.474406
[90]	train's avg_binary_log_loss: 0.467996	valid's avg_binary_log_loss: 0.468079
[95]	train's avg_binary_log_loss: 0.462203	valid's avg_binary_log_loss: 0.46215
[100]	train's avg_binary_log_loss: 0.456843	valid's avg_binary_log_loss: 0.456639
[105]	train's avg_binary_log_loss: 0.451805	valid's avg_binary_log_loss: 0.451524
[110]	train's avg_binary_log_loss: 0.447153	valid's avg_binary_log_loss: 0.446731
[115]	train's avg_binary_log_loss: 0.442751	valid's avg_binary_log_loss: 0.442272
[120]	train's avg_binary_log_loss: 0.438642	valid's avg_binary_log_loss: 0.438092
[125]	train's avg_binary_

[185]	train's avg_binary_log_loss: 0.386033	valid's avg_binary_log_loss: 0.387742
[190]	train's avg_binary_log_loss: 0.384237	valid's avg_binary_log_loss: 0.385993
[195]	train's avg_binary_log_loss: 0.382548	valid's avg_binary_log_loss: 0.384354
[200]	train's avg_binary_log_loss: 0.380945	valid's avg_binary_log_loss: 0.382803
[5]	train's avg_binary_log_loss: 0.594472	valid's avg_binary_log_loss: 0.595068
[10]	train's avg_binary_log_loss: 0.53067	valid's avg_binary_log_loss: 0.531334
[15]	train's avg_binary_log_loss: 0.486288	valid's avg_binary_log_loss: 0.486618
[20]	train's avg_binary_log_loss: 0.455294	valid's avg_binary_log_loss: 0.455416
[25]	train's avg_binary_log_loss: 0.433344	valid's avg_binary_log_loss: 0.432877
[30]	train's avg_binary_log_loss: 0.417288	valid's avg_binary_log_loss: 0.4166
[35]	train's avg_binary_log_loss: 0.404798	valid's avg_binary_log_loss: 0.404028
[40]	train's avg_binary_log_loss: 0.39562	valid's avg_binary_log_loss: 0.3947
[45]	train's avg_binary_log_los

[130]	train's avg_binary_log_loss: 0.30654	valid's avg_binary_log_loss: 0.31739
[135]	train's avg_binary_log_loss: 0.303939	valid's avg_binary_log_loss: 0.315276
[140]	train's avg_binary_log_loss: 0.302275	valid's avg_binary_log_loss: 0.314031
[145]	train's avg_binary_log_loss: 0.300751	valid's avg_binary_log_loss: 0.312939
[150]	train's avg_binary_log_loss: 0.299319	valid's avg_binary_log_loss: 0.311932
[155]	train's avg_binary_log_loss: 0.298426	valid's avg_binary_log_loss: 0.311407
[160]	train's avg_binary_log_loss: 0.296751	valid's avg_binary_log_loss: 0.310081
[165]	train's avg_binary_log_loss: 0.29597	valid's avg_binary_log_loss: 0.30964
[170]	train's avg_binary_log_loss: 0.294823	valid's avg_binary_log_loss: 0.308877
[175]	train's avg_binary_log_loss: 0.294053	valid's avg_binary_log_loss: 0.308364
[180]	train's avg_binary_log_loss: 0.293023	valid's avg_binary_log_loss: 0.307557
[185]	train's avg_binary_log_loss: 0.292079	valid's avg_binary_log_loss: 0.306936
[190]	train's avg_bi

[55]	train's avg_binary_log_loss: 0.314703	valid's avg_binary_log_loss: 0.32338
[60]	train's avg_binary_log_loss: 0.309001	valid's avg_binary_log_loss: 0.31893
[65]	train's avg_binary_log_loss: 0.306029	valid's avg_binary_log_loss: 0.316995
[70]	train's avg_binary_log_loss: 0.303376	valid's avg_binary_log_loss: 0.3151
[75]	train's avg_binary_log_loss: 0.300064	valid's avg_binary_log_loss: 0.312694
[80]	train's avg_binary_log_loss: 0.297271	valid's avg_binary_log_loss: 0.310638
[85]	train's avg_binary_log_loss: 0.294936	valid's avg_binary_log_loss: 0.308804
[90]	train's avg_binary_log_loss: 0.292655	valid's avg_binary_log_loss: 0.30741
[95]	train's avg_binary_log_loss: 0.290052	valid's avg_binary_log_loss: 0.305533
[100]	train's avg_binary_log_loss: 0.289211	valid's avg_binary_log_loss: 0.305254
[105]	train's avg_binary_log_loss: 0.28759	valid's avg_binary_log_loss: 0.304225
[110]	train's avg_binary_log_loss: 0.286925	valid's avg_binary_log_loss: 0.304185
[115]	train's avg_binary_log_lo