# GBDT - LightGBM

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

In [3]:
def train_gradient_boosting_baseline(train_file, valid_file, test_file, output_model_file, 
                                     drop_cols=None, random_forest = False):
    """Train LightGBM regression baselines and save one model file per configuration.

    The three splits are loaded with ``load_atk_train_valid_test``; in each split
    the last column is used as the label and all preceding columns as features.

    Parameters
    ----------
    train_file, valid_file, test_file : str
        Paths forwarded to ``load_atk_train_valid_test``.
    output_model_file : str
        Prefix of the saved model path. The number of trees, learning rate
        (times 1000), number of leaves and best validation round are appended,
        followed by ``.model``.
    drop_cols : list of str, optional
        Columns removed from all three splits before training. ``None`` keeps
        every column.
    random_forest : bool, optional
        When True, LightGBM runs with ``boosting='rf'`` and bagging / feature
        sub-sampling of 0.8.

    Returns
    -------
    pandas.DataFrame
        One row per trained configuration with the columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``best_round``, ``metric`` (validation
        l2 at the best round) and ``filename``.
    """
    result_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are collected in a plain list and converted once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []

    # load train/valid/test
    train, valid, test = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # dropping some of the columns (without mutating the frames returned by the loader)
    if drop_cols is not None:
        print ("Dropping columns:", drop_cols)
        train = train.drop(columns=drop_cols)
        valid = valid.drop(columns=drop_cols)
        # test is not used for training below; it is dropped only to keep the splits consistent
        test = test.drop(columns=drop_cols)

    for num_trees in [100]:
        for learning_rate in [0.05]: #[0.01, 0.05]:
            for num_leaves in [2**8]: #[16, 24]:
                # datasets: last column is the label, everything before it is a feature
                lgbm_train = lightgbm.Dataset(data=train.iloc[:, :-1],
                                              label=train.iloc[:, -1])

                lgbm_valid = lightgbm.Dataset(data=valid.iloc[:, :-1],
                                              label=valid.iloc[:, -1])

                # run train
                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves,
                               'max_depth': 8,
                               'objective': 'regression'}
                if random_forest:
                    lgbm_params['boosting'] = 'rf'
                    lgbm_params['bagging_fraction'] = 0.8
                    lgbm_params['feature_fraction'] = 0.8
                    lgbm_params['bagging_freq'] = 1

                # NOTE(review): `evals_result=` and `verbose_eval=` are only accepted by
                # LightGBM < 4.0; newer releases expect the `record_evaluation` /
                # `log_evaluation` callbacks instead.
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train,
                                            num_boost_round = num_trees,
#                                             fobj            = optimize_log_loss, 
#                                             feval           = avg_log_loss,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid],
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 10)

                # 0-based index of the round with the lowest validation l2
                valid_l2 = lgbm_info['valid']['l2']
                best_valid_iter = int(np.argmin(valid_l2))
                best_round = best_valid_iter + 1

                # round() instead of a bare int(): learning_rate*1000 can land just
                # below the intended integer because of floating-point error.
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                            num_trees,
                                                                            int(round(learning_rate * 1000)),
                                                                            num_leaves,
                                                                            best_round)

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_round,
                                'metric': valid_l2[best_valid_iter],
                                'filename': model_file_name})

                # NOTE(review): save_model is called without num_iteration, so the file
                # appears to keep every trained round; the best round is only recorded
                # in the file name and in the returned table.
                lgbm_model.save_model(model_file_name)
                print("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=result_columns)

# WINE Dataset

In [4]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "wine"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [5]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/wine/train.csv.bz2
Loading: ../data/wine/valid.csv.bz2
Loading: ../data/wine/test.csv.bz2
Train/Valid/Test sizes: (4547, 13) (650, 13) (1300, 13)
Train/Valid/Test split: 0.70 0.10 0.20
CatFX: []
Train/Valid/Test sizes: (3898, 13) (1299, 13) (1300, 13)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
[10]	train's l2: 0.658938	valid's l2: 0.73916
[20]	train's l2: 0.527671	valid's l2: 0.660676
[30]	train's l2: 0.451371	valid's l2: 0.624959
[40]	train's l2: 0.407846	valid's l2: 0.609611
[50]	train's l2: 0.371556	valid's l2: 0.598689
[60]	train's l2: 0.34594	valid's l2: 0.593508
[70]	train's l2: 0.331191	valid's l2: 0.58887
[80]	train's l2: 0.317604	valid's l2: 0.585445
[90]	train's l2: 0.306344	valid's l2: 0.583606
[100]	train's l2: 0.292753	valid's l2: 0.58101
Model saved to ../out/models/wine/std-gbdt_wine_T100_S0050_L256_R100.model
  num_trees  learning_rate num_leaves best_round   metric  \
0       100           

In [None]:
# REDUCED GBDT: same training run with the listed columns removed from every split.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=["alcohol", "residual_sugar", "volatile_acidity"],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [6]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/wine/train.csv.bz2
Loading: ../data/wine/valid.csv.bz2
Loading: ../data/wine/test.csv.bz2
Train/Valid/Test sizes: (4547, 13) (650, 13) (1300, 13)
Train/Valid/Test split: 0.70 0.10 0.20
CatFX: []
Train/Valid/Test sizes: (3898, 13) (1299, 13) (1300, 13)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
[10]	train's l2: 0.520077	valid's l2: 0.62858
[20]	train's l2: 0.512319	valid's l2: 0.626062
[30]	train's l2: 0.507472	valid's l2: 0.621204
[40]	train's l2: 0.506059	valid's l2: 0.62197
[50]	train's l2: 0.503661	valid's l2: 0.621407
[60]	train's l2: 0.50284	valid's l2: 0.620475
[70]	train's l2: 0.503065	valid's l2: 0.620394
[80]	train's l2: 0.503063	valid's l2: 0.620651
[90]	train's l2: 0.502976	valid's l2: 0.620229
[100]	train's l2: 0.502296	valid's l2: 0.620133
Model saved to ../out/models/wine/rf-gbdt_wine_T100_S0050_L256_R68.model
  num_trees  learning_rate num_leaves best_round    metric  \
0       100           

# CENSUS Dataset

In [None]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "census"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [None]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# REDUCED GBDT: same training run with the listed columns removed from every split.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'workclass', 'marital_status',
        'occupation', 'education_num',
        'hours_per_week', 'capital_gain',
    ],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

# SPAM Dataset

In [None]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "spam"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [None]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# REDUCED GBDT: same training run with the listed columns removed from every split.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    drop_cols=[
        'char_freq_!', 'word_freq_remove',
        'char_freq_$', 'capital_run_length_average',
        'capital_run_length_total',
    ],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

# debug


In [None]:
###------------_####
def print_fx_imp(model, colnames=None):
    """Print the model's features ranked by gain, highest first.

    Each output line holds the rank, the feature label, its gain importance
    (three decimals) and the number of splits that use it.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance(importance_type=...)`` for
        ``'split'`` and ``'gain'`` (e.g. a ``lightgbm.Booster``).
    colnames : sequence of str, optional
        Label for each feature index. When omitted, the names stored in the
        model itself (``model.feature_name()``) are used, which avoids pairing
        the importances with an unrelated column list.
    """
    fx_uses = model.feature_importance(importance_type='split')
    fx_gain = model.feature_importance(importance_type='gain')
    if colnames is None:
        colnames = model.feature_name()

    # argsort is ascending, so reverse it to list the largest gain first
    for i,f in enumerate(np.argsort(fx_gain)[::-1]):
        print ("{:2d} {:20s} {:.3f} {:4d}".format(i, colnames[f], fx_gain[f], fx_uses[f]))

        




In [None]:
# Load the raw wine training CSV into `df`.
df = pd.read_csv(filepath_or_buffer="../data/wine/raw/train.csv.bz2")

In [None]:
# Inspect the saved wine models: tree count and feature importance.
# NOTE(review): the model paths are hard-coded, including the best-round suffix
# (e.g. R97); they must match files actually written by the training cells.
print(" -- GDBT --")    
gbdt = lightgbm.Booster(model_file="../out/models/wine/std-gbdt_wine_T100_S0050_L256_R97.model")
print(gbdt.num_trees())
# Label features with the names stored in the model itself rather than the
# columns of a separately loaded raw CSV, whose order/count may differ from
# the frame the model was trained on.
print_fx_imp(gbdt, gbdt.feature_name())

print(" -- Reduced GDBT --")    
redf = lightgbm.Booster(model_file="../out/models/wine/red-gbdt_wine_T100_S0050_L256_R100.model")
print(redf.num_trees())
#print_fx_imp(redf,train.drop(columns=['char_freq_!', 'word_freq_remove',
#                                                          'char_freq_$', 'capital_run_length_average',
#                                                         'capital_run_length_total', 'word_freq_hp' ]).columns
#            )


# print(" -- Adv. Boosting --")    
# advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
# print(advb.num_trees())
# print_fx_imp(advb, TRAIN.columns)

# CREDIT Dataset

In [None]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "credit"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [None]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# REDUCED GBDT: same training run with the attacked features removed from every split.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    # set to the list of attacked features
    drop_cols=['PAY_0', 'BILL_AMT1', 'PAY_2', 'LIMIT_BAL'],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

# WEBSITES Dataset

In [14]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "websites"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [15]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/websites/train.csv.bz2
Loading: ../data/websites/valid.csv.bz2
Loading: ../data/websites/test.csv.bz2
Train/Valid/Test sizes: (1068, 14) (356, 14) (356, 14)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (1068, 14) (356, 14) (356, 14)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
[10]	train's l2: 0.242588	valid's l2: 0.104841
[20]	train's l2: 0.169659	valid's l2: 0.119277
[30]	train's l2: 0.128631	valid's l2: 0.120082
[40]	train's l2: 0.103857	valid's l2: 0.120464
[50]	train's l2: 0.0867307	valid's l2: 0.127315
[60]	train's l2: 0.0753108	valid's l2: 0.126461
[70]	train's l2: 0.0674525	valid's l2: 0.127927
[80]	train's l2: 0.0619007	valid's l2: 0.12823
[90]	train's l2: 0.0572583	valid's l2: 0.128215
[100]	train's l2: 0.0537401	valid's l2: 0.129009
Model saved to ../out/models/websites/std-gbdt_websites_T100_S0050_L256_R5.model
  num_trees  learning_rate num_leaves best_round    metric 

In [None]:
# REDUCED GBDT: intended to retrain without the attacked features.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    # TODO: set to the list of attacked features; while the list is empty no column is dropped
    drop_cols=[],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [None]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

# Financial Distress 

In [11]:
# Dataset selection and every path derived from it (inputs, attacks, model prefixes).
DATASET_NAME = "financial"
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Compressed CSV splits sit directly under the dataset directory.
TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME = [
    "{}/{}.csv.bz2".format(DATASET_DIR, split) for split in ("train", "valid", "test")
]

# Model-file prefixes: standard, reduced-feature and random-forest runs.
MODEL_FILENAME, RED_MODEL_FILENAME, RF_MODEL_FILENAME = [
    "{}/{}-gbdt_{}".format(MODELS_DIR, tag, DATASET_NAME) for tag in ("std", "red", "rf")
]

In [12]:
# FULL GBDT: train on every feature and write the experiment table beside the model prefix.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=MODEL_FILENAME,
)
experiments.to_csv(MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/financial/train.csv.bz2
Loading: ../data/financial/valid.csv.bz2
Loading: ../data/financial/test.csv.bz2
Train/Valid/Test sizes: (2202, 84) (734, 84) (736, 84)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (2203, 84) (734, 84) (735, 84)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
[10]	train's l2: 0.13062	valid's l2: 0.135682
[20]	train's l2: 0.120916	valid's l2: 0.130935
[30]	train's l2: 0.112371	valid's l2: 0.126817
[40]	train's l2: 0.105103	valid's l2: 0.123654
[50]	train's l2: 0.0991287	valid's l2: 0.121445
[60]	train's l2: 0.0939611	valid's l2: 0.119651
[70]	train's l2: 0.0892309	valid's l2: 0.11822
[80]	train's l2: 0.085182	valid's l2: 0.117024
[90]	train's l2: 0.081519	valid's l2: 0.115942
[100]	train's l2: 0.0781675	valid's l2: 0.115107
Model saved to ../out/models/financial/std-gbdt_financial_T100_S0010_L256_R100.model
  num_trees  learning_rate num_leaves best_round    met

In [None]:
# REDUCED GBDT: intended to retrain without the attacked features.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RED_MODEL_FILENAME,
    # TODO: set to the list of attacked features; while the list is empty no column is dropped
    drop_cols=[],
)
experiments.to_csv(RED_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

In [9]:
# RANDOM FOREST: reuse the baseline trainer with random_forest=True.
experiments = train_gradient_boosting_baseline(
    train_file=TRAINING_FILENAME,
    valid_file=VALIDATION_FILENAME,
    test_file=TEST_FILENAME,
    output_model_file=RF_MODEL_FILENAME,
    random_forest=True,
)
experiments.to_csv(RF_MODEL_FILENAME + ".csv", index=False)

print(experiments)
# The row with the lowest validation metric names the best model file.
print('best model is:', experiments.sort_values(by='metric')['filename'].iloc[0])

Pre-processing original files...
Loading: ../data/financial/train.csv.bz2
Loading: ../data/financial/valid.csv.bz2
Loading: ../data/financial/test.csv.bz2
Train/Valid/Test sizes: (2202, 84) (734, 84) (736, 84)
Train/Valid/Test split: 0.60 0.20 0.20
CatFX: []
Train/Valid/Test sizes: (2203, 84) (734, 84) (735, 84)
Train/Valid/Test split: 0.60 0.20 0.20
Saving processed files *.atks.bz2
[10]	train's l2: 0.0741182	valid's l2: 0.111513
[20]	train's l2: 0.0743626	valid's l2: 0.11311
[30]	train's l2: 0.0745657	valid's l2: 0.112359
[40]	train's l2: 0.0748962	valid's l2: 0.112413
[50]	train's l2: 0.0743016	valid's l2: 0.112821
[60]	train's l2: 0.0745339	valid's l2: 0.112573
[70]	train's l2: 0.0749791	valid's l2: 0.112432
[80]	train's l2: 0.0750759	valid's l2: 0.112305
[90]	train's l2: 0.0751625	valid's l2: 0.11246
[100]	train's l2: 0.0752458	valid's l2: 0.112652
Model saved to ../out/models/financial/rf-gbdt_financial_T100_S0050_L256_R7.model
  num_trees  learning_rate num_leaves best_round    

In [7]:
###------------_####
def print_fx_imp(model, colnames=None):
    """Print the model's features ranked by gain, highest first.

    Each output line holds the rank, the feature label, its gain importance
    (three decimals) and the number of splits that use it.

    Parameters
    ----------
    model : object
        Anything exposing ``feature_importance(importance_type=...)`` for
        ``'split'`` and ``'gain'`` (e.g. a ``lightgbm.Booster``).
    colnames : sequence of str, optional
        Label for each feature index. When omitted, the names stored in the
        model itself (``model.feature_name()``) are used, which avoids pairing
        the importances with an unrelated column list.
    """
    fx_uses = model.feature_importance(importance_type='split')
    fx_gain = model.feature_importance(importance_type='gain')
    if colnames is None:
        colnames = model.feature_name()

    # argsort is ascending, so reverse it to list the largest gain first
    for i,f in enumerate(np.argsort(fx_gain)[::-1]):
        print ("{:2d} {:20s} {:.3f} {:4d}".format(i, colnames[f], fx_gain[f], fx_uses[f]))

        




In [10]:
# Load the raw financial dataset CSV into `df`.
df = pd.read_csv(filepath_or_buffer="../data/financial/raw/dataset.csv")

In [12]:
# Inspect the saved financial model: tree count and feature importance.
# NOTE(review): the model path is hard-coded, including the best-round suffix
# (R37); it must match a file actually written by the training cells.
print(" -- GDBT --")    
gbdt = lightgbm.Booster(model_file="../out/models/financial/std-gbdt_financial_T100_S0050_L256_R37.model")
print(gbdt.num_trees())
# Label features with the names stored in the model itself rather than the
# columns of the raw dataset CSV, whose order/count may differ from the
# frame the model was trained on.
print_fx_imp(gbdt, gbdt.feature_name())

#print(" -- Reduced GDBT --")    
#redf = lightgbm.Booster(model_file="../out/models/wine/red-gbdt_wine_T100_S0050_L256_R100.model")
#print(redf.num_trees())
#print_fx_imp(redf,train.drop(columns=['char_freq_!', 'word_freq_remove',
#                                                          'char_freq_$', 'capital_run_length_average',
#                                                         'capital_run_length_total', 'word_freq_hp' ]).columns
#            )


# print(" -- Adv. Boosting --")    
# advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
# print(advb.num_trees())
# print_fx_imp(advb, TRAIN.columns)

 -- GDBT --
100
 0 x82                  1450191.221  320
 1 Company              397806.522  693
 2 x80                  214362.329  311
 3 x30                  70387.155   98
 4 x33                  64393.008   76
 5 x56                  55164.777   99
 6 x54                  51505.891  120
 7 x26                  48233.233  137
 8 x19                  47749.092   97
 9 x31                  46238.511  101
10 x52                  45855.731   69
11 x51                  43187.993  130
12 x22                  42519.931   85
13 x41                  38283.714   72
14 x40                  37769.541   80
15 x43                  36778.803   81
16 x60                  35857.685   95
17 x34                  33937.908   88
18 x29                  31576.337   69
19 x15                  31082.085   66
20 x11                  30978.140   66
21 x48                  28640.856   69
22 x68                  28603.746   33
23 x27                  26807.277   60
24 x81                  26134.594   53
25 x7