** validation split is a bit strange ... **

In [8]:
import os, sys
import numpy as np
import pandas as pd
import lightgbm

## Prepare Data

In [9]:
# def infer_categorical_features(dataset):
#     categorical_features = set([])
#     for column in dataset.columns:
#         if dataset[column].dtype == 'object':
#             categorical_features.add(column)
#     return categorical_features
            
def label_encode(dataset, categorical_features):
    dataset_le = dataset.copy()
    for column in dataset_le.columns:
        if column in categorical_features:
            dataset_le[column] = dataset_le[column].astype('category')
            dataset_le[column] = dataset_le[column].cat.codes.astype(np.int32)
    return dataset_le

def load_atk_train_valid_test( atk_train_file, atk_valid_file, atk_test_file, 
                               train_split=0.6, valid_split=0.2,
                               force=False):
    
    
    if  ( force or 
          not os.path.exists(atk_train_file+".cat.bz2") or
          not os.path.exists(atk_valid_file+".cat.bz2") or
          not os.path.exists(atk_test_file+".cat.bz2") ):
    
        print ("Pre-processing original files...")

        print ("Loading:", atk_train_file)
        print ("Loading:", atk_valid_file)
        print ("Loading:", atk_test_file)

        train = pd.read_csv(atk_train_file)
        valid = pd.read_csv(atk_valid_file)
        test  = pd.read_csv(atk_test_file)
        
        print ("Train/Valid/Test sizes:", train.shape, valid.shape, test.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            valid.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            test.shape[0] /(train.shape[0]+valid.shape[0]+test.shape[0]) ) )

        # concat to process correctly label encoding
        full = pd.concat( [train, valid, test] )

        # removed because LGBM must handle this in itw own way
        #fx = infer_categorical_features(full)
        #print("List of categorical features: [{}]".format(", ".join([cf for cf in fx])))
        #full = label_encode(full, fx)

        # split-back into train valid test
        train_size = int( full.shape[0]*train_split )
        valid_size = int( full.shape[0]*valid_split )
        train_cat = full.iloc[0:train_size,:]
        valid_cat = full.iloc[train_size:train_size+valid_size,:]
        test_cat  = full.iloc[train_size+valid_size:,:]    

        print ("Train/Valid/Test sizes:", train_cat.shape, valid_cat.shape, test_cat.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            valid_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            test_cat.shape[0] /(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]) ) )

        # save to file
        print ("Saving processed files *.cat.bz2")
        train_cat.to_csv(atk_train_file+".cat.bz2", compression="bz2", index=False)
        valid_cat.to_csv(atk_valid_file+".cat.bz2", compression="bz2", index=False)
        test_cat.to_csv (atk_test_file+".cat.bz2",  compression="bz2", index=False)
        
    else:
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(atk_train_file+".cat.bz2")
        valid_cat = pd.read_csv(atk_valid_file+".cat.bz2")
        test_cat  = pd.read_csv(atk_test_file+".cat.bz2")
    
    # return data
    return train_cat, valid_cat, test_cat


## Adversarial Boosting

In [10]:
def AdvBoosting_gen_data(model, data, groups):
    ''' 
    model  : is the LightGBM Model
    data   : data matrix with all valid attacks (last column is label)
    groups : grouping of same attacked instance 
    returns the new data matrix and new groups
    
    WARNING: currently works only for binary classification
    '''
    # score the datataset
    labels = data[:,-1]
    
    predictions = model.predict(data[:,:-1]) # exclude labels
    # binarize
    predictions = (predictions>0).astype(np.float)
    predictions = 2*predictions - 1
    
    # check mispredictions
    matchings = labels * predictions
    
    # select original data + attacked instances
    new_selected = [] # id of selected instances
    new_groups   = []
    
    offset = 0
    for g in groups:
        if g==0:
            print ("Error !!!!")
        elif g==1:
            # there are no attacks, just add original
            new_selected += [offset]
            new_groups   += [1]
        else:
            # get a slice of the matching scores
            g_matchings = matchings[offset:offset+g]

            # most misclassified (smallest margin)
            # skip original
            adv_instance = np.argmin(g_matchings[1:])+1

            # add original and adversarial
            new_selected += [offset, adv_instance]
            new_groups   += [2]
        
        offset += g
    
    new_dataset = data[new_selected,:]
    
    return new_dataset, new_groups

In [11]:
# Our custom metrics
def binary_log_loss(pred, true_label):

    return np.log(1.0 + np.exp(-pred * true_label))

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    
    labels = train_data.get_label()
    losses = np.log(1.0 + np.exp(-preds*labels))
    avg_loss = np.mean(losses)
    
    return 'avg_binary_log_loss', avg_loss, False


def optimize_log_loss(preds, train_data):
    labels = train_data.get_label()
    exp_pl = np.exp(preds * labels)
    # http://www.wolframalpha.com/input/?i=differentiate+log(1+%2B+exp(-kx)+)
    grads = -labels / (1.0 +  exp_pl)  
    # http://www.wolframalpha.com/input/?i=d%5E2%2Fdx%5E2+log(1+%2B+exp(-kx)+)
    hess = labels**2 * exp_pl / (1.0 + exp_pl)**2 

    # this is to optimize average logloss
    norm = 1.0/len(preds)
    grads *= norm
    hess *= norm
    
    return grads, hess


# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    labels = train_data.get_label()
    attack_lens = train_data.get_group()
    
    offset = 0
    max_logloss = []
    avg_max_logloss = 0.0
    
    if attack_lens is not None:
    
        for atk in attack_lens:
            losses = [binary_log_loss(h,t) for h,t in zip(preds[offset:offset+atk], labels[offset:offset+atk])]
            max_logloss.append(max(losses))

            offset += atk
        
        avg_max_logloss = np.mean(max_logloss)  

    return 'avg_binary_log_loss_under_max_attack', avg_max_logloss, False

def avg_non_interferent_log_loss(preds, train_data, alpha=1.0):
    
    # binary logloss under maximal attack
    _, loss_uma, _    = avg_log_loss_uma(preds, train_data)
    
    # binary logloss (plain)
    _, loss_plain, _  = avg_log_loss(preds, train_data)
    
    # combine the above two losses together
    weighted_loss = alpha*loss_uma + (1.0-alpha)*loss_plain

    return 'avg_non_interferent_log_loss [alpha={}]'.format(alpha), weighted_loss, False

def optimize_log_loss_uma(preds, train_data):
    labels = train_data.get_label()
    attack_lens = train_data.get_group()
    
    grads = np.zeros_like(labels, dtype=np.float64)
    hess = np.zeros_like(grads)
    
    if attack_lens is not None:

        norm = 1.0 / float(len(attack_lens))

        offset = 0
        for atk in attack_lens:
            exp_pl = np.exp(- preds[offset:offset+atk] * labels[offset:offset+atk])

            inv_sum = 1.0 / np.sum(1.0 + exp_pl)

            x_grad = inv_sum * exp_pl

            grads[offset:offset+atk] = norm * x_grad * (- labels[offset:offset+atk])
            hess[offset:offset+atk]  = norm * x_grad * (1.0 - x_grad)

            offset += atk    
    
    return grads, hess

def optimize_non_interferent_log_loss(preds, train_data, alpha=1.0):
    # binary logloss under maximal attack
    grads_uma, hess_uma = optimize_log_loss_uma(preds, train_data)
    
    # binary logloss (plain)
    grads_plain, hess_plain = optimize_log_loss(preds, train_data)
    
    # combine the above two losses together
    grads = alpha*grads_uma + (1.0-alpha)*grads_plain
    hess  = alpha*hess_uma  + (1.0-alpha)*hess_plain
    
    return grads, hess



def AdvBoosting_extend_model(data, cat_fx, input_model=None, num_trees=1, params=None):
    ''' 
    model  : is the LightGBM Model
    data   : data matrix with all valid attacks (last column is label)
    returns the new model (is model modified inplace?)
    '''
    
    if cat_fx is None:
        cat_fx = "auto"
    
    if params is None:
        params = {
            'learning_rate': 0.1,
            'num_leaves': 16,
            'min_data_in_leaf': 20, #[1, 20]
            'verbose': 0
        }  
    
    lgbm_train = lightgbm.Dataset(data=data[:,:-1], 
                                  label=data[:,-1])
    
    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train, 
                                num_boost_round = num_trees, 
                                init_model = input_model,
                                categorical_feature=cat_fx,
                                fobj = optimize_log_loss, 
                                feval = avg_log_loss,
                                evals_result = lgbm_info,
                                valid_sets   = [lgbm_train], 
                                valid_names  = ['adv-train'],
                                verbose_eval=10)

    return lgbm_model, lgbm_info

In [12]:
# happens to generate core dump. why?
def AdvBoosting_in_memory( atk_train, trees, 
                 cat_fx,
                 output_model_file,
                 partial_save=10, 
                 adv_rounds=1,
                 params=None):
    ''' 
    atk_data: full dataset including all valid attacks
    atk_groups: lenght of each attack set
    trees: total number of trees to be produced
    adv_rounds: adversarial instance injecting frequency
    '''
    # get groups and remove instance ids
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    atk_data   = atk_train.iloc[:,1:].values
    
    # train first trees
    original_ids = np.cumsum(atk_groups[:-1])
    original_ids = np.insert(original_ids, 0, 0)
    
    model, model_info = AdvBoosting_extend_model( atk_data[original_ids, :],
                                                  cat_fx=cat_fx,
                                                  input_model=None, 
                                                  num_trees=adv_rounds, 
                                                  params=params )
    
    # train remaining trees
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # attack dataset
        adv_data, adv_offsets = AdvBoosting_gen_data( model, atk_data, atk_groups )
        
        # train additional trees
        model, model_info = AdvBoosting_extend_model( adv_data, 
                                                      cat_fx=cat_fx,
                                                      input_model=model, 
                                                      num_trees=adv_rounds, 
                                                      params=params)
        # save partial model
        if t%partial_save==0 and t!=trees:
            partial_filename = "{}.T{:03d}.lgbm".format(output_model_file, t)
            model.save_model( filename=partial_filename )
            
    model.save_model(filename=output_model_file)

In [13]:
def AdvBoosting( atk_train, trees, 
                 output_model_file,
                 partial_save=10, 
                 adv_rounds=1,
                 params=None):
    ''' 
    atk_data: full dataset including all valid attacks
    atk_groups: lenght of each attack set
    trees: total number of trees to be produced
    adv_rounds: adversarial instance injecting frequency
    '''
    # temp lgbm file
    temp = output_model_file+".tmp"
    
    # get index of categorical features (-1 because of instance_id)
    cat_fx = np.where(atk_train.dtypes=='object')[0] -1
    print ("CatFX:", atk_train.columns.values[cat_fx+1])
    
    # get groups and remove instance ids
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    atk_train = label_encode(atk_train, atk_train.columns.values[cat_fx+1])
    
    #fx = infer_categorical_features(full)
        #print("List of categorical features: [{}]".format(", ".join([cf for cf in fx])))
        #full = label_encode(full, fx)
    
    
    # prepare data (avoiding pandas)
    cat_fx = list([int(x) for x in cat_fx])    
    atk_data   = atk_train.iloc[:,1:].values
    
    # train first trees
    original_ids = np.cumsum(atk_groups[:-1])
    original_ids = np.insert(original_ids, 0, 0)
    
    model, model_info = AdvBoosting_extend_model( atk_data[original_ids, :], 
                                                  cat_fx=cat_fx,
                                                  input_model=None, 
                                                  num_trees=adv_rounds, 
                                                  params=params )
    
    
    # train remaining trees
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # attack dataset
        adv_data, adv_offsets = AdvBoosting_gen_data( model, atk_data, atk_groups )
        
        # train additional trees
        model.save_model(temp)
        model, model_info = AdvBoosting_extend_model( adv_data, 
                                                      cat_fx=cat_fx,
                                                      input_model=temp, 
                                                      num_trees=adv_rounds, 
                                                      params=params)
        # save partial model
        if t%partial_save==0 and t!=trees:
            partial_filename = "{}.T{:03d}.lgbm".format(output_model_file, t)
            model.save_model( filename=partial_filename )
            
    model.save_model(filename=output_model_file)

# Scripting wrapper

In [14]:
def run_training( atk_train_file, atk_valid_file, atk_test_file,
                  output_model_file,
                  num_trees,
                  learning_rate,
                  partial_save,
                  adversarial_rounds):

    # process train/valid/test
    train, valid, test = load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file)
    
    # train
    lgbm_params = { 'learning_rate': float(learning_rate), 'num_leaves': 16,
                    'min_data_in_leaf': 20, #[1, 20]
                    'verbose': 0 } 

    AdvBoosting( train, 
                 trees=num_trees, 
                 output_model_file=output_model_file, 
                 adv_rounds=adversarial_rounds, 
                 partial_save=partial_save, 
                 params=lgbm_params)

# Start the Training

In [15]:
run_training( "../data/census/train_B5.csv.bz2", 
              "../data/census/valid_B5.csv.bz2",
              "../data/census/test_B5.csv.bz2",
              "../out/models/adv_boosting.lgbm",
              num_trees=50,
              learning_rate=0.1,
              partial_save=10,
              adversarial_rounds=1)



Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']




[10]	adv-train's avg_binary_log_loss: 0.321584
[20]	adv-train's avg_binary_log_loss: 0.226903
[30]	adv-train's avg_binary_log_loss: 0.195902
[40]	adv-train's avg_binary_log_loss: 0.184389
[50]	adv-train's avg_binary_log_loss: 0.179951


In [102]:
# run_training( "../data/census/train_B5.csv.bz2", 
#               "../data/census/valid_B5.csv.bz2",
#               "../data/census/test_B5.csv.bz2",
#               "../out/models/debug.lgbm",
#               num_trees=10,
#               learning_rate=0.1,
#               partial_save=1000,
#               adversarial_rounds=10)

Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']
[10]	adv-train's avg_binary_log_loss: 0.442759


In [10]:
#!ls -lht ../out/models

In [11]:
# debug running time
#%load_ext line_profiler
# %lprun -f AdvBoosting  AdvBoosting(train, trees=3)
# %lprun -f AdvBoosting_gen_data  AdvBoosting(train, trees=3)

In [12]:
#!rm ../out/models/*.lgbm