# GBDT - LightGBM

In [None]:
%load_ext autoreload
%autoreload 2

In [11]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Paths to dataset and model files

In [12]:
# Dataset identifier; it is substituted into the data and model paths below.
DATASET_NAME = "wine"

In [13]:
# Input and output directories, both derived from DATASET_NAME.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)

# Train / validation / test split files inside the dataset directory.
TRAINING_FILENAME = "/".join((DATASET_DIR, "train_ori.csv.bz2"))
VALIDATION_FILENAME = "/".join((DATASET_DIR, "valid_ori.csv.bz2"))
TEST_FILENAME = "/".join((DATASET_DIR, "test_ori.csv.bz2"))

# Path prefix shared by every saved model and by the results CSV.
MODEL_FILENAME = MODELS_DIR + "/std-gbdt_{}".format(DATASET_NAME)

In [16]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file):
    """Grid-search LightGBM baselines and save one model file per configuration.

    For every (num_trees, learning_rate, num_leaves) combination of the
    hard-coded grid, a booster is trained with the custom objective
    ``optimize_log_loss`` and evaluated with ``avg_log_loss`` on the training
    and validation sets. The round with the lowest validation
    ``avg_binary_log_loss`` is recorded and the model is written to disk.

    Parameters
    ----------
    train_file, valid_file, test_file
        Forwarded unchanged to ``load_atk_train_valid_test``. The test split
        is loaded but not used by this function.
    output_model_file : str
        Path prefix of the saved models. Each file is named
        ``<prefix>_T<trees>_S<lr*1000>_L<leaves>_R<best_round>.model``.
        The parent directory is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per trained configuration with the columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``best_round`` (1-based),
        ``metric`` (validation loss at ``best_round``) and ``filename``.

    Raises
    ------
    AssertionError
        If the training frame contains an ``instance_id`` column.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    valid_metric = 'avg_binary_log_loss'

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # Turn the categorical feature names into positional column indices.
    cat_fx = [int(idx) for idx in np.where(train.columns.isin(cat_fx))[0]]
    print("CatFX:", train.columns.values[cat_fx])

    # Make sure the destination directory exists before spending time training.
    out_dir = os.path.dirname(output_model_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The last column is the label and everything before it is a feature.
    # These arrays do not depend on the hyper-parameters, so extract them once.
    train_x, train_y = train.iloc[:, :-1].values, train.iloc[:, -1].values
    valid_x, valid_y = valid.iloc[:, :-1].values, valid.iloc[:, -1].values

    # Collect plain dict rows and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []

    for num_trees in [200]:
        for learning_rate in [0.01, 0.05, 0.1]:
            for num_leaves in [8, 16, 24]:
                # A fresh pair of Datasets is built for every configuration.
                lgbm_train = lightgbm.Dataset(data=train_x,
                                              label=train_y,
                                              categorical_feature=cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid_x,
                                              label=valid_y,
                                              categorical_feature=cat_fx)

                # run train
                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves}
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round=num_trees,
                                            fobj=optimize_log_loss,
                                            feval=avg_log_loss,
                                            evals_result=lgbm_info,
                                            valid_sets=[lgbm_train, lgbm_valid],
                                            valid_names=['train', 'valid'],
                                            verbose_eval=50)

                valid_losses = lgbm_info['valid'][valid_metric]
                best_valid_iter = int(np.argmin(valid_losses))  # 0-based index
                best_round = best_valid_iter + 1                # 1-based round

                # round() rather than plain int(): float products such as
                # lr * 1000 can land just below the intended integer.
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(
                    output_model_file,
                    num_trees,
                    int(round(learning_rate * 1000)),
                    num_leaves,
                    best_round)

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_round,
                                'metric': valid_losses[best_valid_iter],
                                'filename': model_file_name})

                # NOTE(review): save_model is called without a num_iteration
                # argument, so the file presumably holds every trained round
                # while the R suffix only names the best validation round.
                # Confirm that consumers truncate accordingly.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=result_columns)

In [17]:
# Run the hyper-parameter sweep over the configured train/valid/test files.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Persist the summary table under the same path prefix as the models.
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)

# The row with the lowest validation metric identifies the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Loading pre-processed files...
CatFX: []
[50]	train's avg_binary_log_loss: 0.592468	valid's avg_binary_log_loss: 0.597697
[100]	train's avg_binary_log_loss: 0.543457	valid's avg_binary_log_loss: 0.551507
[150]	train's avg_binary_log_loss: 0.516441	valid's avg_binary_log_loss: 0.527646
[200]	train's avg_binary_log_loss: 0.498774	valid's avg_binary_log_loss: 0.513444
Model saved to ../out/models/wine/std-gbdt_wine_T200_S0010_L8_R200.model
[50]	train's avg_binary_log_loss: 0.578771	valid's avg_binary_log_loss: 0.590698
[100]	train's avg_binary_log_loss: 0.522085	valid's avg_binary_log_loss: 0.53955
[150]	train's avg_binary_log_loss: 0.489238	valid's avg_binary_log_loss: 0.51091
[200]	train's avg_binary_log_loss: 0.467173	valid's avg_binary_log_loss: 0.494643
Model saved to ../out/models/wine/std-gbdt_wine_T200_S0010_L16_R200.model
[50]	train's avg_binary_log_loss: 0.569699	valid's avg_binary_log_loss: 0.583678
[100]	train's avg_binary_log_loss: 0.506591	valid's avg_binary_log_loss: 0.5293