# GBDT - LightGBM

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Dataset name and paths to dataset / model files

In [3]:
# Name of the dataset; it selects the sub-folder used for data and models.
# NOTE(review): "wine" appears to be the alternative value -- confirm.
DATASET_NAME = "wine2"

In [17]:
# Input data lives under ../data/<dataset>, trained models go to ../out/models/<dataset>.
DATASET_DIR = "/".join(["../data", DATASET_NAME])
MODELS_DIR = "/".join(["../out/models", DATASET_NAME])

# Compressed CSV files holding the train / validation / test splits.
TRAINING_FILENAME = "{}/train_ori.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid_ori.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test_ori.csv.bz2".format(DATASET_DIR)

# File-name prefixes for the standard ("std") and reduced ("red") GBDT runs.
MODEL_FILENAME = "{}/std-gbdt_{}".format(MODELS_DIR, DATASET_NAME)
RED_MODEL_FILENAME = "{}/red-gbdt_{}".format(MODELS_DIR, DATASET_NAME)

In [15]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file, drop_cols=None):
    """Train LightGBM models over a hyper-parameter grid, save them, and report results.

    For every (num_trees, learning_rate, num_leaves) combination of the grid
    defined below, a model is trained with the custom objective
    ``optimize_log_loss`` and the custom evaluation function ``avg_log_loss``.
    The boosting round with the lowest validation ``avg_binary_log_loss`` is
    located and the model is written to disk.

    Parameters
    ----------
    train_file, valid_file, test_file
        Paths forwarded unchanged to ``load_atk_train_valid_test``. In the
        training and validation frames the last column is used as the label
        and all other columns as features. The test split is loaded (and has
        ``drop_cols`` removed) but is not used for training or evaluation here.
    output_model_file : str
        Prefix of the saved model files; the hyper-parameters and the best
        round are appended to it to build each file name.
    drop_cols : list of str, optional
        Columns removed from all three splits before training.

    Returns
    -------
    pandas.DataFrame
        One row per trained configuration with the columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``best_round``, ``metric`` and
        ``filename``.

    Raises
    ------
    AssertionError
        If the training frame contains an ``instance_id`` column.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are collected here and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic in a loop.
    records = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # dropping some of the columns (non-mutating, so the frames handed back by
    # the loader are never modified in place)
    if drop_cols is not None:
        print ("Dropping columns:", drop_cols)
        train = train.drop(columns=drop_cols)
        valid = valid.drop(columns=drop_cols)
        test = test.drop(columns=drop_cols)

    # get positional index of the categorical features; the label is the last
    # column, so positions in `train.columns` match the feature matrix
    cat_fx = np.where(train.columns.isin(cat_fx))[0]
    cat_fx = [int(x) for x in cat_fx]
    print("CatFX:", train.columns.values[cat_fx])

    # Make sure the destination folder exists before spending time on
    # training, so that saving the model cannot fail on a missing directory.
    output_dir = os.path.dirname(output_model_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for num_trees in [500]:
        for learning_rate in [0.05]: #[0.01, 0.05]:
            for num_leaves in [24]: #[16, 24]:
                # datasets
                lgbm_train = lightgbm.Dataset(data=train.iloc[:,:-1].values,
                                              label=train.iloc[:,-1].values,
                                              categorical_feature = cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid.iloc[:,:-1].values,
                                              label=valid.iloc[:,-1].values,
                                              categorical_feature = cat_fx)

                # run train
                # NOTE(review): `fobj`, `evals_result` and `verbose_eval` look like
                # the pre-4.0 LightGBM keyword API -- confirm the installed version.
                lgbm_params = { 'learning_rate': learning_rate,
                                'num_leaves': num_leaves}
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round = num_trees,
                                            fobj            = optimize_log_loss,
                                            feval           = avg_log_loss,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid],
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 50)

                # 0-based index of the round with the lowest validation loss
                valid_losses = lgbm_info['valid']['avg_binary_log_loss']
                best_valid_iter = int(np.argmin(valid_losses))
                best_round = best_valid_iter + 1

                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                            num_trees,
                                                                            int(learning_rate*1000),
                                                                            num_leaves,
                                                                            best_round
                                                                           )

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_round,
                                'metric': valid_losses[best_valid_iter],
                                'filename': model_file_name})

                # save_model is called without num_iteration, so the file is not
                # explicitly truncated to the best round; that round is only
                # encoded in the file name.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=result_columns)

# GBDT: standard model trained on all features

In [8]:
# Train the standard model on all features; one result row per configuration.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)

# Store the result table next to the saved models.
experiments.to_csv("{}.csv".format(MODEL_FILENAME), index=False)

print(experiments)

# The configuration with the lowest validation metric comes first after sorting.
ranked = experiments.sort_values('metric')
print ('best model is:', ranked.iloc[0]['filename'] )

Loading pre-processed files...
CatFX: []
[50]	train's avg_binary_log_loss: 0.412939	valid's avg_binary_log_loss: 0.476692
[100]	train's avg_binary_log_loss: 0.342071	valid's avg_binary_log_loss: 0.454152
[150]	train's avg_binary_log_loss: 0.294123	valid's avg_binary_log_loss: 0.442476
[200]	train's avg_binary_log_loss: 0.257883	valid's avg_binary_log_loss: 0.438046
[250]	train's avg_binary_log_loss: 0.228122	valid's avg_binary_log_loss: 0.438026
[300]	train's avg_binary_log_loss: 0.202267	valid's avg_binary_log_loss: 0.435789
[350]	train's avg_binary_log_loss: 0.180632	valid's avg_binary_log_loss: 0.438017
[400]	train's avg_binary_log_loss: 0.162684	valid's avg_binary_log_loss: 0.440851
[450]	train's avg_binary_log_loss: 0.147385	valid's avg_binary_log_loss: 0.444448
[500]	train's avg_binary_log_loss: 0.132888	valid's avg_binary_log_loss: 0.447687
Model saved to ../out/models/wine2/std-gbdt_wine2_T500_S0050_L24_R295.model
  num_trees  learning_rate num_leaves best_round    metric  \
0 

# Reduced version: GBDT trained without `alcohol`, `residual_sugar` and `volatile_acidity`

In [18]:
# Train the reduced model: same pipeline, but three features are removed first.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=["alcohol", "residual_sugar", "volatile_acidity"],
)

# Store the result table next to the saved models.
experiments.to_csv("{}.csv".format(RED_MODEL_FILENAME), index=False)

print(experiments)

# The configuration with the lowest validation metric comes first after sorting.
ranked = experiments.sort_values('metric')
print ('best model is:', ranked.iloc[0]['filename'] )

Loading pre-processed files...
Dropping columns: ['alcohol', 'residual_sugar', 'volatile_acidity']
CatFX: []
[50]	train's avg_binary_log_loss: 0.452733	valid's avg_binary_log_loss: 0.513191
[100]	train's avg_binary_log_loss: 0.385163	valid's avg_binary_log_loss: 0.487611
[150]	train's avg_binary_log_loss: 0.340032	valid's avg_binary_log_loss: 0.480356
[200]	train's avg_binary_log_loss: 0.303349	valid's avg_binary_log_loss: 0.475684
[250]	train's avg_binary_log_loss: 0.272697	valid's avg_binary_log_loss: 0.474043
[300]	train's avg_binary_log_loss: 0.246855	valid's avg_binary_log_loss: 0.47314
[350]	train's avg_binary_log_loss: 0.224221	valid's avg_binary_log_loss: 0.473417
[400]	train's avg_binary_log_loss: 0.204525	valid's avg_binary_log_loss: 0.475837
[450]	train's avg_binary_log_loss: 0.187791	valid's avg_binary_log_loss: 0.47717
[500]	train's avg_binary_log_loss: 0.173028	valid's avg_binary_log_loss: 0.477335
Model saved to ../out/models/wine2/red-gbdt_wine2_T500_S0050_L24_R281.mode