# Adversarial Boosting

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Paths to dataset and model files

In [3]:
DATASET_NAME = "census"  # name of the dataset folder; the original note mentions "wine" as another value

In [4]:
# Root folders for the selected dataset and for the models trained on it.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)

# Train / validation / test splits. The "*_ATT" templates keep a "{}" placeholder
# that is filled in later with the budget B via str.format.
TRAINING_FILENAME = "/".join([DATASET_DIR, "train_ori.csv.bz2"])
TRAINING_FILENAME_ATT = "/".join([DATASET_DIR, "train_B{}.csv.bz2"])
VALIDATION_FILENAME = "/".join([DATASET_DIR, "valid_ori.csv.bz2"])
VALIDATION_FILENAME_ATT = "/".join([DATASET_DIR, "valid_B{}.csv.bz2"])
TEST_FILENAME = "/".join([DATASET_DIR, "test_ori.csv.bz2"])
TEST_FILENAME_ATT = "/".join([DATASET_DIR, "test_B{}.csv.bz2"])

# Model file prefix template; the two placeholders are filled with
# (dataset name, budget) by the cells that use it.
MODEL_FILENAME = MODELS_DIR + "/adv-boosting_{}_B{}"

In [5]:
# Budgets B to run; other lists left by the author: [5, 15, 150, 300] and [20, 30, 40].
TRAINING_BUDGETS = [15]

# Adversarial Boosting

In [6]:
def gen_adv_boosting_data(model, data, groups, num_atks=1):
    '''
    Keep, for every instance, its original row plus its lowest-scoring attacks.

    model    : object exposing predict(X) (e.g. the LightGBM model)
    data     : 2-D data matrix with all valid attacks (last column is label).
               Rows of one instance must be contiguous, original row first.
    groups   : number of rows of each instance, in the same order as the rows
    num_atks : maximum number of attacked rows kept per instance
               (None keeps all of them)

    returns the new data matrix and the new groups (list of ints)

    raises ValueError if a group size is not positive, if the group sizes do
    not add up to the number of rows of data, or if num_atks is negative.

    The score of a row is label * prediction, so the smallest value is the
    most misclassified row.
    NOTE(review): this presumably expects labels in {-1, +1} and raw-margin
    predictions -- confirm against the objective defined in nilib.

    WARNING: currently works only for binary classification
    '''
    if num_atks is not None and num_atks < 0:
        # a negative bound would silently drop attacks through the slice below
        raise ValueError("num_atks must be >= 0, got {}".format(num_atks))

    sizes = [int(g) for g in groups]
    if any(g <= 0 for g in sizes):
        raise ValueError("every group must contain at least the original instance")
    if sum(sizes) != data.shape[0]:
        raise ValueError("groups cover {} rows but data has {} rows".format(sum(sizes),
                                                                              data.shape[0]))

    # score the dataset
    labels = data[:, -1]
    predictions = model.predict(data[:, :-1])  # exclude labels
    matchings = labels * predictions

    new_selected = []  # row ids of the selected instances
    new_groups = []

    offset = 0
    for g in sizes:
        # the original instance is always kept
        new_selected.append(offset)

        if g == 1:
            # there are no attacks, just the original
            new_groups.append(1)
        else:
            # scores of the attacked rows only (the original is skipped)
            atk_matchings = matchings[offset + 1:offset + g]

            # most misclassified first (smallest score)
            worst = np.argsort(atk_matchings)[:num_atks]

            new_selected.extend(int(i) + offset + 1 for i in worst)
            new_groups.append(1 + len(worst))

        offset += g

    # explicit integer index so that an empty selection is also valid
    new_dataset = data[np.asarray(new_selected, dtype=np.intp), :]

    return new_dataset, new_groups

In [7]:
def extend_adv_boosting_model(train, valid, cat_fx, input_model=None, num_trees=1, params=None):
    '''
    Train num_trees boosting rounds with LightGBM, optionally continuing from
    an existing model.

    train       : training data matrix (last column is label)
    valid       : validation data matrix (last column is label), same number
                  of columns as train
    cat_fx      : categorical features passed to lightgbm.Dataset;
                  None or empty means "auto"
    input_model : forwarded to lightgbm.train as init_model
                  (None starts a new model)
    num_trees   : number of boosting rounds to run
    params      : LightGBM parameters dict (None is treated as an empty dict)

    returns the trained model and the dictionary of evaluation results that
    lightgbm.train fills for the 'train' and 'valid' sets.

    The objective and the metric are optimize_log_loss and avg_log_loss,
    which are not defined in this notebook (they come from the star import).
    '''

    if cat_fx is None or len(cat_fx) == 0:
        cat_fx = "auto"

    if params is None:
        # lightgbm.train needs a dictionary; the default None would fail there
        params = {}

    assert train.shape[1] == valid.shape[1], "Train/Valid Mismatch!"

    lgbm_train = lightgbm.Dataset(data=train[:, :-1],
                                  label=train[:, -1],
                                  categorical_feature=cat_fx)

    lgbm_valid = lightgbm.Dataset(data=valid[:, :-1],
                                  label=valid[:, -1],
                                  categorical_feature=cat_fx)

    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train,
                                num_boost_round=num_trees,
                                init_model=input_model,
                                fobj=optimize_log_loss,
                                feval=avg_log_loss,
                                evals_result=lgbm_info,
                                valid_sets=[lgbm_train, lgbm_valid],
                                valid_names=['train', 'valid'],
                                verbose_eval=5)

    return lgbm_model, lgbm_info

In [8]:
def AdvBoosting(atk_train, atk_valid, trees, 
                 cat_fx,
                 params,
                 output_model_file,
                 partial_save=1000, 
                 adv_rounds=1):
    ''' 
    Train a model by alternating boosting rounds with the re-selection of the
    most harmful attacked instances.

    atk_train: DataFrame with the full training set including all valid
               attacks; the first column is dropped before training and the
               'instance_id' column gives the attack set of every row
    atk_valid: same layout, validation set
    trees: total number of trees to be produced
    cat_fx: names of the categorical columns
    params: LightGBM parameters
    output_model_file: prefix of the temporary file (output_model_file + ".tmp")
                       used to hand the current model to the next round;
                       the file is removed before returning
    partial_save: currently unused
    adv_rounds: adversarial instance injecting frequency (trees per round)

    returns best_model, best_info, best_loss, best_round of the round with
    the lowest validation 'avg_binary_log_loss'.

    NOTE(review): group sizes are taken in sorted instance_id order, so the
    rows are assumed to be sorted by instance_id with the original instance
    first in each group -- confirm against the data files.
    NOTE(review): best_round is the index of the first tree of the best
    round; it equals the number of trees only when adv_rounds == 1.
    '''
    # temp lgbm file
    temp = output_model_file + ".tmp"

    # length of each attack set
    train_groups = atk_train['instance_id'].value_counts().sort_index().values
    valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # positions of the categorical features, shifted by one because the
    # first column is removed from the matrices below
    cat_ids = np.where(atk_train.columns.isin(cat_fx))[0]
    cat_ids = [int(x) - 1 for x in cat_ids]

    # prepare data (avoiding pandas)
    train_data = atk_train.iloc[:, 1:].values
    valid_data = atk_valid.iloc[:, 1:].values

    # row of the first (original) instance of every group
    train_original_ids = np.insert(np.cumsum(train_groups[:-1]), 0, 0)
    valid_original_ids = np.insert(np.cumsum(valid_groups[:-1]), 0, 0)

    try:
        # train first trees on the original instances only
        model, model_info = extend_adv_boosting_model(train_data[train_original_ids, :],
                                                      valid_data[valid_original_ids, :],
                                                      cat_fx=cat_ids,
                                                      input_model=None,
                                                      num_trees=adv_rounds,
                                                      params=params)

        best_model = model
        best_info = model_info
        best_loss = np.min(model_info['valid']['avg_binary_log_loss'])
        best_round = 1

        # train remaining trees
        for t in range(adv_rounds + 1, trees + 1, adv_rounds):
            # attack dataset with the current model
            adv_data, _ = gen_adv_boosting_data(model, train_data, train_groups)
            adv_valid_data, _ = gen_adv_boosting_data(model, valid_data, valid_groups)

            # train additional trees starting from the saved current model
            model.save_model(temp)
            model, model_info = extend_adv_boosting_model(adv_data,
                                                          adv_valid_data,
                                                          cat_fx=cat_ids,
                                                          input_model=temp,
                                                          num_trees=adv_rounds,
                                                          params=params)

            round_loss = np.min(model_info['valid']['avg_binary_log_loss'])
            if round_loss < best_loss:
                best_model = model
                best_info = model_info
                best_loss = round_loss
                best_round = t
    finally:
        # the temporary checkpoint is only needed between rounds
        if os.path.exists(temp):
            os.remove(temp)

    return best_model, best_info, best_loss, best_round

In [11]:
def train_adversarial_boosting(train_file, valid_file, test_file, output_model_file,
                               num_trees_grid=(50,),
                               learning_rates=(0.01, 0.05, 0.1),
                               num_leaves_grid=(8, 16, 24)):
    '''
    Grid-search adversarial boosting models and save every trained model.

    train_file, valid_file, test_file : files handed to load_atk_train_valid_test
    output_model_file : prefix of the saved model files
    num_trees_grid    : numbers of trees to try (ints)
    learning_rates    : learning rates to try
    num_leaves_grid   : numbers of leaves to try (ints)

    For each combination a model is trained with AdvBoosting and saved to
    "<output_model_file>_T<trees>_S<learning_rate*1000>_L<leaves>_R<best_round>.model".

    returns a DataFrame with one row per combination and the columns
    num_trees, learning_rate, num_leaves, best_round, avg_binary_log_loss, filename.
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss', 'filename']

    # load train/valid/test (the test set is loaded but not used here)
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    assert "instance_id" in train.columns.values, "Wrong training set file for GBDT"

    # rows are collected here and turned into a DataFrame once at the end;
    # DataFrame.append was removed in pandas 2.0
    records = []

    for num_trees in num_trees_grid:
        for learning_rate in learning_rates:
            for num_leaves in num_leaves_grid:

                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves}

                lgbm_model, lgbm_info, best_loss, best_valid_iter = AdvBoosting(train,
                                                    valid,
                                                    trees=num_trees,
                                                    cat_fx=cat_fx,
                                                    output_model_file=output_model_file,
                                                    adv_rounds=1,
                                                    params=lgbm_params)

                # file name encoding the hyper-parameters of this run
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                                        num_trees,
                                                                                        int(learning_rate*1000),
                                                                                        num_leaves,
                                                                                        best_valid_iter
                                                                                       )

                # update experimental results
                records.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_valid_iter,
                                'avg_binary_log_loss': best_loss,
                                'filename': model_file_name})

                lgbm_model.save_model(model_file_name)
                print ("Model saved to", model_file_name)

    return pd.DataFrame(records, columns=columns)

In [12]:
# Train the grid of models for every budget and store the summary as CSV.
for B in TRAINING_BUDGETS:
    # prefix shared by the saved models and the CSV summary of this budget
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_adversarial_boosting(
        TRAINING_FILENAME_ATT.format(B),
        VALIDATION_FILENAME_ATT.format(B),
        TEST_FILENAME_ATT.format(B),
        model_prefix,
    )

    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)

Loading pre-processed files...
[5]	train's avg_binary_log_loss: 0.662905	valid's avg_binary_log_loss: 0.662915
[10]	train's avg_binary_log_loss: 0.63464	valid's avg_binary_log_loss: 0.634613
[15]	train's avg_binary_log_loss: 0.608925	valid's avg_binary_log_loss: 0.608931
[20]	train's avg_binary_log_loss: 0.585436	valid's avg_binary_log_loss: 0.585458
[25]	train's avg_binary_log_loss: 0.563903	valid's avg_binary_log_loss: 0.563936
[30]	train's avg_binary_log_loss: 0.544151	valid's avg_binary_log_loss: 0.544211
[35]	train's avg_binary_log_loss: 0.525977	valid's avg_binary_log_loss: 0.526066
[40]	train's avg_binary_log_loss: 0.509242	valid's avg_binary_log_loss: 0.509347
[45]	train's avg_binary_log_loss: 0.493795	valid's avg_binary_log_loss: 0.493919
[50]	train's avg_binary_log_loss: 0.479516	valid's avg_binary_log_loss: 0.479618
Model saved to ../out/models/census/adv-boosting_census_B15_T50_S0010_L8_R50.model
[5]	train's avg_binary_log_loss: 0.662213	valid's avg_binary_log_loss: 0.66216

In [15]:
# Runs ordered from the lowest to the highest validation loss.
experiments.sort_values(by='avg_binary_log_loss')

Unnamed: 0,num_trees,learning_rate,num_leaves,best_round,avg_binary_log_loss,filename
7,50,0.1,16,50,0.262118,../out/models/census/adv-boosting_census_B15_T...
8,50,0.1,24,47,0.262908,../out/models/census/adv-boosting_census_B15_T...
6,50,0.1,8,50,0.269215,../out/models/census/adv-boosting_census_B15_T...
5,50,0.05,24,50,0.28539,../out/models/census/adv-boosting_census_B15_T...
4,50,0.05,16,50,0.286376,../out/models/census/adv-boosting_census_B15_T...
3,50,0.05,8,50,0.291347,../out/models/census/adv-boosting_census_B15_T...
2,50,0.01,24,50,0.473948,../out/models/census/adv-boosting_census_B15_T...
1,50,0.01,16,50,0.474682,../out/models/census/adv-boosting_census_B15_T...
0,50,0.01,8,50,0.479618,../out/models/census/adv-boosting_census_B15_T...


In [18]:
# File of the run with the lowest validation loss.
print('best model is:',
      experiments.sort_values(by='avg_binary_log_loss')['filename'].iloc[0])

best model is: ../out/models/census/adv-boosting_census_B15_T50_S0100_L16_R50.model
