# Adversarial Boosting

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Adversarial Boosting

In [3]:
def gen_adv_boosting_data(model, data, groups, num_atks=1):
    '''
    Keep, for every attacked instance, the original row plus the attacks
    that hurt the current model the most.

    model    : object exposing predict(features), e.g. the LightGBM model
    data     : data matrix (DataFrame) with all valid attacks, laid out
               group by group: the first row of a group is the original
               instance, the following rows are its attacks.
               The last column is the label.
    groups   : size of each consecutive group of rows in data
    num_atks : maximum number of attacks kept per group
    returns the new data matrix and new groups

    WARNING: currently works only for binary classification
    (label * prediction is used as a margin, which presumably expects
    labels in {-1, +1} -- TODO confirm against the data loader)
    '''
    # Work on plain NumPy arrays: every offset below is a row *position*,
    # and array slicing is positional whatever index the DataFrame carries.
    labels = data.iloc[:, -1].values

    # score the dataset (labels excluded) and compute the margins:
    # the smaller the margin, the worse the prediction
    predictions = np.asarray(model.predict(data.iloc[:, :-1]))
    matchings = labels * predictions

    new_selected = []  # row positions of the selected instances
    new_groups = []

    offset = 0
    for g in groups:
        if g == 0:
            print ("Error !!!!")
        elif g == 1:
            # there are no attacks, just add the original
            new_selected.append(offset)
            new_groups.append(1)
        else:
            # margins of the attacks only (position 0 of the group is the
            # original instance and is always kept)
            atk_matchings = matchings[offset + 1:offset + g]

            # most misclassified first (smallest margin), then map the
            # within-group ranks back to row positions in data
            adv_instances = np.argsort(atk_matchings)[:num_atks] + offset + 1

            # add original and adversarial
            new_selected.append(offset)
            new_selected.extend(adv_instances.tolist())
            new_groups.append(1 + len(adv_instances))

        offset += g

    new_dataset = data.iloc[new_selected, :]

    return new_dataset, new_groups

In [4]:
def extend_adv_boosting_model(train, valid, input_model=None, num_trees=1, params=None):
    '''
    Run num_trees boosting rounds on train, optionally continuing from an
    existing model, while evaluating on both train and valid.

    train, valid : data matrices with the label in the last column
    input_model  : passed to lightgbm.train as init_model
    num_trees    : number of boosting rounds to run
    params       : LightGBM parameters, passed to lightgbm.train unchanged
    returns the trained model and the dict filled through evals_result

    NOTE(review): evals_result / verbose_eval are keyword arguments of older
    LightGBM releases -- confirm the installed version still accepts them.
    '''
    assert train.shape[1]==valid.shape[1], "Train/Valid Mismatch!"

    def as_lgbm_dataset(frame):
        # features are all columns but the last, which holds the label
        return lightgbm.Dataset(data=frame.iloc[:, :-1],
                                label=frame.iloc[:, -1])

    train_set = as_lgbm_dataset(train)
    valid_set = as_lgbm_dataset(valid)

    # custom objective / metric hooks (fobj = optimize_log_loss,
    # feval = avg_log_loss) are intentionally left disabled
    history = {}
    booster = lightgbm.train(
        params,
        train_set,
        num_boost_round=num_trees,
        init_model=input_model,
        evals_result=history,
        valid_sets=[train_set, valid_set],
        valid_names=['train', 'valid'],
        verbose_eval=1,
    )

    return booster, history

In [5]:
def AdvBoosting(atk_train, atk_valid, trees,
                params,
                output_model_file,
                partial_save=1000,
                adv_rounds=1):
    '''
    Train a LightGBM model, re-selecting the adversarial instances with the
    current model before every new batch of adv_rounds trees.

    atk_train, atk_valid : full datasets including all valid attacks.
                           Both need an 'instance_id' column; the first
                           column is dropped before training (presumably
                           it is instance_id -- TODO confirm) and the last
                           column is the label.
    trees             : total number of trees to be produced
    params            : LightGBM parameters
    output_model_file : path prefix; the intermediate model is written to
                        output_model_file + ".tmp" and removed at the end
    partial_save      : currently unused, kept for interface compatibility
    adv_rounds        : adversarial instance injecting frequency
                        (number of trees trained per batch)
    returns (best_model, best_info, best_loss, best_round), selected on the
    first metric reported for the validation set. best_round is 1 for the
    initial model, otherwise the index of the first tree of the best batch.

    NOTE(review): group sizes come from value_counts().sort_index(), so the
    rows are assumed to be stored contiguously in increasing instance_id
    order with the original instance first -- confirm against the loader.
    '''
    # temp lgbm file used to hand the current model over to the next batch
    temp = output_model_file + ".tmp"
    temp_written = False

    # length of each attack set (one group per instance id)
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    atk_valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # drop the first column (instance ids)
    atk_data = atk_train.iloc[:, 1:]
    atk_valid_data = atk_valid.iloc[:, 1:]

    # row position of the first row (original instance) of every group
    original_ids = np.insert(np.cumsum(atk_groups[:-1]), 0, 0)
    original_valid_ids = np.insert(np.cumsum(atk_valid_groups[:-1]), 0, 0)

    # train first trees on the original instances only
    model, model_info = extend_adv_boosting_model(atk_data.iloc[original_ids, :],
                                                  atk_valid_data.iloc[original_valid_ids, :],
                                                  input_model=None,
                                                  num_trees=adv_rounds,
                                                  params=params)

    adopted_metric = list(model_info['valid'].keys())[0]  # e.g. 'avg_binary_log_loss', 'l2'
    best_model = model
    best_info = model_info
    best_loss = np.min(model_info['valid'][adopted_metric])
    best_round = 1

    try:
        # train remaining trees
        for t in range(adv_rounds + 1, trees + 1, adv_rounds):
            # attack dataset: originals + worst attack under the current model
            adv_data, _ = gen_adv_boosting_data(model, atk_data, atk_groups)
            adv_valid_data, _ = gen_adv_boosting_data(model, atk_valid_data, atk_valid_groups)

            # train additional trees on top of the current model
            model.save_model(temp)
            temp_written = True
            model, model_info = extend_adv_boosting_model(adv_data,
                                                          adv_valid_data,
                                                          input_model=temp,
                                                          num_trees=adv_rounds,
                                                          params=params)

            round_loss = np.min(model_info['valid'][adopted_metric])
            if round_loss < best_loss:
                best_model = model
                best_info = model_info
                best_loss = round_loss
                best_round = t
    finally:
        # do not leave the intermediate model behind; only delete a file
        # this call created
        if temp_written and os.path.isfile(temp):
            os.remove(temp)

    return best_model, best_info, best_loss, best_round

In [6]:
def train_adversarial_boosting(train_file, valid_file, test_file, output_model_file):
    '''
    Train one adversarial-boosting model per hyper-parameter combination,
    save each model to disk and report the results.

    train_file, valid_file, test_file : attack files handed to
                                        load_atk_train_valid_test
    output_model_file : path prefix of the saved models; the grid values
                        and the best round are appended to it
    returns a DataFrame with one row per trained model and the columns
    num_trees, learning_rate, num_leaves, best_round, metric, filename
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'metric', 'filename']
    # Rows are gathered in a list and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in pandas >= 2.0.
    results = []

    # load train/valid/test (test is loaded but not used here)
    train, valid, test = load_atk_train_valid_test(train_file, valid_file, test_file)

    assert "instance_id" in train.columns.values, "Wrong training set file for GBDT"

    for num_trees in [100]:
        for learning_rate in [0.05]:  # also considered: [0.01, 0.05]
            for num_leaves in [24, 2**8]:  # also considered: [16, 24]

                lgbm_params = {'learning_rate': learning_rate,
                               'num_leaves': num_leaves,
                               'max_depth': 8,
                               'objective': 'regression'}

                lgbm_model, lgbm_info, best_loss, best_valid_iter = AdvBoosting(
                    train,
                    valid,
                    trees=num_trees,
                    output_model_file=output_model_file,
                    adv_rounds=1,
                    params=lgbm_params)

                # file name encodes trees, learning rate (x1000), leaves, best round
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(
                    output_model_file,
                    num_trees,
                    int(learning_rate*1000),
                    num_leaves,
                    best_valid_iter)

                # update experimental results
                results.append({'num_trees': num_trees,
                                'learning_rate': learning_rate,
                                'num_leaves': num_leaves,
                                'best_round': best_valid_iter,
                                'metric': best_loss,
                                'filename': model_file_name})

                lgbm_model.save_model(model_file_name)
                print ("Model saved to", model_file_name)

    return pd.DataFrame(results, columns=columns)

# WINE

In [7]:
# Dataset under study and the attacker budgets to train against
DATASET_NAME = "wine"
TRAINING_BUDGETS = [10, 20, 40]  # 30, 60 currently disabled

# Directory layout
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Plain and attacked files; the "{}" left in the *_ATT templates is filled
# with the budget B by the training loop
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

# Output prefix template: filled with (dataset name, budget)
MODEL_FILENAME = MODELS_DIR + "/adv-boosting_{}_B{}"

In [8]:
# Train, save and report one batch of models per attacker budget
for B in TRAINING_BUDGETS:
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_adversarial_boosting(
        TRAINING_FILENAME_ATT.format(B),
        VALIDATION_FILENAME_ATT.format(B),
        TEST_FILENAME_ATT.format(B),
        model_prefix,
    )

    # keep the result table next to the saved models
    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)
    # lowest validation metric wins
    best_row = experiments.sort_values('metric').iloc[0]
    print('best model is:', best_row['filename'])

Pre-processing original files...
Loading: ../data/wine/attacks/train_B10.atks.bz2
Loading: ../data/wine/attacks/valid_B10.atks.bz2
Loading: ../data/wine/attacks/test_B10.atks.bz2
Train/Valid/Test sizes: (78494, 14) (7042, 14) (15641, 14)
Train/Valid/Test split: 0.78 0.07 0.15
   ... with instance ids
CatFX: []
Train/Valid/Test sizes: (66428, 14) (19108, 14) (15641, 14)
Train/Valid/Test split: 0.66 0.19 0.15
Saving processed files *.atks.bz2
[1]	train's l2: 0.89492	valid's l2: 0.920141
[2]	train's l2: 0.928658	valid's l2: 0.961194
[3]	train's l2: 0.907845	valid's l2: 0.942824
[4]	train's l2: 0.886699	valid's l2: 0.923684
[5]	train's l2: 0.865813	valid's l2: 0.904137
[6]	train's l2: 0.846282	valid's l2: 0.888428
[7]	train's l2: 0.827173	valid's l2: 0.872421
[8]	train's l2: 0.809373	valid's l2: 0.857892
[9]	train's l2: 0.79318	valid's l2: 0.845318
[10]	train's l2: 0.779445	valid's l2: 0.833873
[11]	train's l2: 0.765486	valid's l2: 0.822104
[12]	train's l2: 0.752646	valid's l2: 0.813103
[1

[66]	train's l2: 0.349089	valid's l2: 0.6461
[67]	train's l2: 0.346476	valid's l2: 0.645193
[68]	train's l2: 0.34438	valid's l2: 0.644985
[69]	train's l2: 0.341867	valid's l2: 0.644159
[70]	train's l2: 0.339984	valid's l2: 0.644726
[71]	train's l2: 0.337901	valid's l2: 0.644567
[72]	train's l2: 0.334381	valid's l2: 0.644345
[73]	train's l2: 0.333408	valid's l2: 0.64411
[74]	train's l2: 0.332156	valid's l2: 0.643588
[75]	train's l2: 0.331743	valid's l2: 0.643585
[76]	train's l2: 0.329864	valid's l2: 0.64332
[77]	train's l2: 0.327467	valid's l2: 0.643075
[78]	train's l2: 0.32386	valid's l2: 0.642201
[79]	train's l2: 0.322727	valid's l2: 0.642357
[80]	train's l2: 0.321006	valid's l2: 0.641474
[81]	train's l2: 0.318631	valid's l2: 0.641812
[82]	train's l2: 0.317549	valid's l2: 0.642072
[83]	train's l2: 0.316607	valid's l2: 0.642212
[84]	train's l2: 0.314915	valid's l2: 0.642316
[85]	train's l2: 0.312559	valid's l2: 0.64127
[86]	train's l2: 0.310828	valid's l2: 0.6416
[87]	train's l2: 0.309

[19]	train's l2: 0.610856	valid's l2: 0.763244
[20]	train's l2: 0.59842	valid's l2: 0.757955
[21]	train's l2: 0.587662	valid's l2: 0.752719
[22]	train's l2: 0.574428	valid's l2: 0.747934
[23]	train's l2: 0.564405	valid's l2: 0.745307
[24]	train's l2: 0.554743	valid's l2: 0.740155
[25]	train's l2: 0.548114	valid's l2: 0.738986
[26]	train's l2: 0.539405	valid's l2: 0.734961
[27]	train's l2: 0.532314	valid's l2: 0.731369
[28]	train's l2: 0.524669	valid's l2: 0.72853
[29]	train's l2: 0.518087	valid's l2: 0.725117
[30]	train's l2: 0.51094	valid's l2: 0.721837
[31]	train's l2: 0.50564	valid's l2: 0.719449
[32]	train's l2: 0.500871	valid's l2: 0.716912
[33]	train's l2: 0.497818	valid's l2: 0.716191
[34]	train's l2: 0.493538	valid's l2: 0.714281
[35]	train's l2: 0.488788	valid's l2: 0.711996
[36]	train's l2: 0.482259	valid's l2: 0.709273
[37]	train's l2: 0.478344	valid's l2: 0.707929
[38]	train's l2: 0.474002	valid's l2: 0.706022
[39]	train's l2: 0.467525	valid's l2: 0.703906
[40]	train's l2: 

[74]	train's l2: 0.537951	valid's l2: 0.697403
[75]	train's l2: 0.53771	valid's l2: 0.698238
[76]	train's l2: 0.536463	valid's l2: 0.697508
[77]	train's l2: 0.536338	valid's l2: 0.697002
[78]	train's l2: 0.53523	valid's l2: 0.696566
[79]	train's l2: 0.535551	valid's l2: 0.696673
[80]	train's l2: 0.536207	valid's l2: 0.698159
[81]	train's l2: 0.53509	valid's l2: 0.697902
[82]	train's l2: 0.534719	valid's l2: 0.697906
[83]	train's l2: 0.534222	valid's l2: 0.697377
[84]	train's l2: 0.533826	valid's l2: 0.69826
[85]	train's l2: 0.53247	valid's l2: 0.698975
[86]	train's l2: 0.529953	valid's l2: 0.697806
[87]	train's l2: 0.52757	valid's l2: 0.697594
[88]	train's l2: 0.525465	valid's l2: 0.696221
[89]	train's l2: 0.523233	valid's l2: 0.696669
[90]	train's l2: 0.520592	valid's l2: 0.696593
[91]	train's l2: 0.518454	valid's l2: 0.696531
[92]	train's l2: 0.51609	valid's l2: 0.695657
[93]	train's l2: 0.514157	valid's l2: 0.695239
[94]	train's l2: 0.512081	valid's l2: 0.694592
[95]	train's l2: 0.5

# Census

In [None]:
# Dataset under study and the attacker budgets to train against
DATASET_NAME = "census"
TRAINING_BUDGETS = [30, 60]

# Directory layout
DATASET_DIR = "../data/" + DATASET_NAME
ATK_DIR = "{}/attacks".format(DATASET_DIR)
MODELS_DIR = "../out/models/" + DATASET_NAME

# Plain and attacked files; the "{}" left in the *_ATT templates is filled
# with the budget B by the training loop
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

# Output prefix template: filled with (dataset name, budget)
MODEL_FILENAME = MODELS_DIR + "/adv-boosting_{}_B{}"

In [None]:
# Train, save and report one batch of models per attacker budget
for B in TRAINING_BUDGETS:
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_adversarial_boosting(
        TRAINING_FILENAME_ATT.format(B),
        VALIDATION_FILENAME_ATT.format(B),
        TEST_FILENAME_ATT.format(B),
        model_prefix,
    )

    # keep the result table next to the saved models
    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)
    # lowest validation metric wins
    best_row = experiments.sort_values('metric').iloc[0]
    print('best model is:', best_row['filename'])