# Adversarial Boosting

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Adversarial Boosting

In [23]:
def gen_adv_boosting_data(model, data, groups, num_atks=1):
    '''
    Keep, for every group, the original instance plus its strongest attacks.

    model    : object exposing predict(features) (the LightGBM model)
    data     : data matrix with all valid attacks (last column is label);
               the rows of a group are contiguous and the first row of each
               group is the original (un-attacked) instance
    groups   : grouping of same attacked instance (number of rows per group,
               in row order)
    num_atks : maximum number of attacked rows kept per group
    returns the new data matrix and new groups

    WARNING: currently works only for binary classification
    (the margin is label * prediction; presumably labels are in {-1, +1}
    -- TODO confirm against the data loader)
    '''
    # Work on plain NumPy arrays: the selection below is purely positional,
    # so it must not depend on the DataFrame index nor on the way pandas'
    # Series.argsort encodes NaN (-1 on older versions, which would select
    # the original row a second time instead of an attack).
    labels = np.asarray(data.iloc[:, -1])
    predictions = np.asarray(model.predict(data.iloc[:, :-1]))  # exclude labels

    # margin of every row: the smaller, the worse the prediction
    margins = labels * predictions

    new_selected = []  # positional ids of the selected rows
    new_groups = []

    offset = 0  # position of the first row (the original) of the current group
    for g in groups:
        if g == 0:
            # an empty group is reported and contributes no rows
            print ("Error !!!!")
        elif g == 1:
            # there are no attacks, just add original
            new_selected.append(offset)
            new_groups.append(1)
        else:
            # margins of the attacked rows only (skip the original)
            atk_margins = margins[offset + 1:offset + g]

            # most misclassified first (smallest margin); NaN margins sort last
            ranked = np.argsort(atk_margins)[:num_atks] + (offset + 1)

            # add original and adversarial
            new_selected.append(offset)
            new_selected.extend(ranked.tolist())
            new_groups.append(1 + len(ranked))

        offset += g

    new_dataset = data.iloc[new_selected, :]

    return new_dataset, new_groups

In [24]:
def extend_adv_boosting_model(train, valid, input_model=None, num_trees=1, params=None):
    '''
    Run num_trees boosting rounds with LightGBM, optionally continuing a model.

    train, valid : data matrices whose last column is the label
    input_model  : forwarded to lightgbm.train as init_model
                   (None starts a new model)
    num_trees    : number of boosting rounds run by this call
    params       : LightGBM parameters, forwarded to lightgbm.train
    returns the trained model and the evaluation results recorded on the
    two sets, reported under the names 'train' and 'valid'
    '''
    assert train.shape[1]==valid.shape[1], "Train/Valid Mismatch!"

    def to_lgbm_dataset(frame):
        # every column but the last is a feature, the last one is the label
        return lightgbm.Dataset(data=frame.iloc[:,:-1],
                                label=frame.iloc[:,-1])

    train_set = to_lgbm_dataset(train)
    valid_set = to_lgbm_dataset(valid)

    # NOTE: the custom objective / metric (optimize_log_loss, avg_log_loss)
    # are not passed; the objective comes from params.
    eval_history = {}  # filled in by lightgbm.train
    booster = lightgbm.train(params, train_set,
                             num_boost_round=num_trees,
                             init_model=input_model,
                             evals_result=eval_history,
                             valid_sets=[train_set, valid_set],
                             valid_names=['train', 'valid'],
                             verbose_eval=1)

    return booster, eval_history

In [25]:
def AdvBoosting(atk_train, atk_valid, trees,
                 params,
                 output_model_file,
                 partial_save=1000,
                 adv_rounds=1):
    '''
    Train a boosted model, re-selecting the adversarial instances every
    adv_rounds trees.

    atk_train, atk_valid: full datasets including all valid attacks; both need
                an 'instance_id' column and have the label as last column
    trees: total number of trees to be produced
    params: LightGBM parameters, forwarded to extend_adv_boosting_model
    output_model_file: path prefix; the current model is written to
                output_model_file + ".tmp" between rounds
    partial_save: currently unused
    adv_rounds: adversarial instance injecting frequency (trees per round)

    returns best_model, best_info, best_loss, best_round, where "best" is the
    round with the smallest validation value of the adopted metric and
    best_round is the 1-based index of the first tree added in that round
    '''
    # temp lgbm file
    temp = output_model_file + ".tmp"

    # get groups (rows per instance) ...
    # NOTE(review): using the sorted counts as consecutive group lengths
    # assumes the rows are ordered by instance_id, with the original instance
    # first in each group -- TODO confirm against the attack generator.
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    atk_valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # ... and remove instance ids (dropped as the first column)
    atk_data = atk_train.iloc[:, 1:]
    atk_valid = atk_valid.iloc[:, 1:]

    # position of the first row of every group, i.e. the original instances
    original_ids = np.insert(np.cumsum(atk_groups[:-1]), 0, 0)
    original_valid_ids = np.insert(np.cumsum(atk_valid_groups[:-1]), 0, 0)

    # train first trees on the original instances only; never build more than
    # `trees` trees (at least one, so that the evaluation history is not empty)
    model, model_info = extend_adv_boosting_model(atk_data.iloc[original_ids, :],
                                                  atk_valid.iloc[original_valid_ids, :],
                                                  input_model=None,
                                                  num_trees=max(1, min(adv_rounds, trees)),
                                                  params=params)

    # first metric reported on the validation set, e.g. 'l2'
    adopted_metric = next(iter(model_info['valid']))
    best_model = model
    best_info = model_info
    best_loss = np.min(model_info['valid'][adopted_metric])
    best_round = 1

    # train remaining trees; t is the 1-based index of the first tree of the round
    for t in range(adv_rounds + 1, trees + 1, adv_rounds):
        # attack dataset: originals plus the strongest attack against `model`
        adv_data, _ = gen_adv_boosting_data(model, atk_data, atk_groups)
        adv_valid_data, _ = gen_adv_boosting_data(model, atk_valid, atk_valid_groups)

        # train additional trees on top of the saved model; the last round is
        # shortened so that the total never exceeds `trees`
        model.save_model(temp)
        model, model_info = extend_adv_boosting_model(adv_data,
                                                      adv_valid_data,
                                                      input_model=temp,
                                                      num_trees=min(adv_rounds, trees - t + 1),
                                                      params=params)

        round_loss = np.min(model_info['valid'][adopted_metric])
        if round_loss < best_loss:
            best_model = model
            best_info = model_info
            best_loss = round_loss
            best_round = t

    return best_model, best_info, best_loss, best_round

In [26]:
def train_adversarial_boosting(train_file, valid_file, test_file, output_model_file):
    '''
    Train one adversarial boosting model per hyper-parameter configuration
    and save each of them to its own file.

    train_file, valid_file, test_file : files handed to load_atk_train_valid_test
                        (the test split is loaded but not used here)
    output_model_file : path prefix of the saved model files
    returns a DataFrame with one row per configuration (columns: num_trees,
    learning_rate, num_leaves, best_round, metric, filename)
    '''
    exp_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0.
    exp_rows = []

    # load train/valid/test
    train, valid, test = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" in train.columns.values, "Wrong training set file for GBDT"

    for num_trees in [100]:
        for learning_rate in [0.05]: #[0.01, 0.05]:
            for num_leaves in [24, 2**8]: #[16, 24]:

                lgbm_params = { 'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'max_depth': 8,
                                'objective': 'regression'
                              }

                lgbm_model, lgbm_info, best_loss, best_valid_iter = AdvBoosting(train,
                                                    valid,
                                                    trees=num_trees,
                                                    output_model_file=output_model_file,
                                                    adv_rounds=1,
                                                    params=lgbm_params)

                # file name encodes: trees, learning rate (x1000), leaves, best round
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                                        num_trees,
                                                                                        int(learning_rate*1000),
                                                                                        num_leaves,
                                                                                        best_valid_iter
                                                                                       )

                # update experimental results
                exp_rows.append({'num_trees': num_trees,
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves,
                                 'best_round': best_valid_iter,
                                 'metric': best_loss,
                                 'filename': model_file_name})

                # save file
                lgbm_model.save_model(model_file_name)
                print ("Model saved to", model_file_name)

    return pd.DataFrame(exp_rows, columns=exp_columns)

# WINE

In [32]:
# Experiment configuration for the wine dataset.
DATASET_NAME = "wine"
TRAINING_BUDGETS = [30, 60]

# Base directories.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = DATASET_DIR + "/attacks"
MODELS_DIR = "../out/models/" + DATASET_NAME

# Plain train / validation / test splits.
TRAINING_FILENAME = DATASET_DIR + "/train.csv.bz2"
VALIDATION_FILENAME = DATASET_DIR + "/valid.csv.bz2"
TEST_FILENAME = DATASET_DIR + "/test.csv.bz2"

# Attack files; the "{}" placeholder is filled in with the budget B.
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

# Model path prefix; placeholders are the dataset name and the budget B.
MODEL_FILENAME = MODELS_DIR + "/adv-boosting_{}_B{}"

In [None]:
# Train the models for every attack budget, store the result table next to
# the models and report the configuration with the smallest metric.
for B in TRAINING_BUDGETS:
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_adversarial_boosting(TRAINING_FILENAME_ATT.format(B),
                                             VALIDATION_FILENAME_ATT.format(B),
                                             TEST_FILENAME_ATT.format(B),
                                             model_prefix)

    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)
    best_row = experiments.sort_values('metric').iloc[0]
    print ('best model is:', best_row['filename'] )

Loading pre-processed files...
[1]	train's l2: 0.89492	valid's l2: 0.920141
[2]	train's l2: 0.934941	valid's l2: 0.966258
[3]	train's l2: 0.915879	valid's l2: 0.948257
[4]	train's l2: 0.901983	valid's l2: 0.936847
[5]	train's l2: 0.891029	valid's l2: 0.92598
[6]	train's l2: 0.878488	valid's l2: 0.91579
[7]	train's l2: 0.867778	valid's l2: 0.906862
[8]	train's l2: 0.855731	valid's l2: 0.896369
[9]	train's l2: 0.843081	valid's l2: 0.885302
[10]	train's l2: 0.835	valid's l2: 0.879808
[11]	train's l2: 0.823186	valid's l2: 0.87135
[12]	train's l2: 0.810999	valid's l2: 0.861731
[13]	train's l2: 0.798075	valid's l2: 0.851189
[14]	train's l2: 0.787624	valid's l2: 0.841863
[15]	train's l2: 0.777763	valid's l2: 0.834817
[16]	train's l2: 0.769694	valid's l2: 0.828913
[17]	train's l2: 0.761553	valid's l2: 0.82225


# Census

In [27]:
# Experiment configuration for the census dataset.
DATASET_NAME = "census"
TRAINING_BUDGETS = [30, 60]

# Base directories.
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = DATASET_DIR + "/attacks"
MODELS_DIR = "../out/models/" + DATASET_NAME

# Plain train / validation / test splits.
TRAINING_FILENAME = DATASET_DIR + "/train.csv.bz2"
VALIDATION_FILENAME = DATASET_DIR + "/valid.csv.bz2"
TEST_FILENAME = DATASET_DIR + "/test.csv.bz2"

# Attack files; the "{}" placeholder is filled in with the budget B.
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

# Model path prefix; placeholders are the dataset name and the budget B.
MODEL_FILENAME = MODELS_DIR + "/adv-boosting_{}_B{}"

In [None]:
# Train the models for every attack budget, store the result table next to
# the models and report the configuration with the smallest metric.
for B in TRAINING_BUDGETS:
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_adversarial_boosting(TRAINING_FILENAME_ATT.format(B),
                                             VALIDATION_FILENAME_ATT.format(B),
                                             TEST_FILENAME_ATT.format(B),
                                             model_prefix)

    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)
    best_row = experiments.sort_values('metric').iloc[0]
    print ('best model is:', best_row['filename'] )

Loading pre-processed files...




[1]	train's l2: 0.714488	valid's l2: 0.711535
[2]	train's l2: 0.704827	valid's l2: 0.702457
