In [1]:
import os, sys
import numpy as np
import pandas as pd
import lightgbm
import json

## Prepare Data

In [2]:
# def infer_categorical_features(dataset):
#     categorical_features = set([])
#     for column in dataset.columns:
#         if dataset[column].dtype == 'object':
#             categorical_features.add(column)
#     return categorical_features
            
def label_encode(dataset, categorical_features):
    """Return a copy of ``dataset`` with the categorical columns integer-coded.

    dataset              : pandas DataFrame; it is copied, never modified
    categorical_features : container of column names to encode

    Each listed column is converted to the pandas ``category`` dtype and then
    replaced by its category codes cast to ``np.int32``. Columns that are not
    listed are returned unchanged.
    """
    encoded = dataset.copy()
    selected = [name for name in encoded.columns if name in categorical_features]
    for name in selected:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes.astype(np.int32)
    return encoded

def load_atk_train_valid_test( atk_train_file, atk_valid_file, atk_test_file, 
                               train_split=0.6, valid_split=0.2,
                               force=False):
    """Load the train/valid/test frames with label-encoded categorical columns.

    atk_train_file, atk_valid_file, atk_test_file : paths of the original CSVs
    train_split, valid_split : fractions of the concatenated rows assigned to
                               train and valid when re-splitting (test gets
                               the remainder)
    force                    : rebuild the cached files even if they exist

    Cached artefacts live next to the inputs: ``<file>.cat.bz2`` for each of
    the three frames plus ``<train file>.cat.json`` holding the names of the
    categorical columns. When all four exist (and ``force`` is false) they are
    simply read back. Otherwise the three originals are read, concatenated,
    label-encoded together, re-split and written out.

    Returns (train_cat, valid_cat, test_cat, cat_fx) where ``cat_fx`` is the
    list of categorical column names.
    """
    train_cache = atk_train_file + ".cat.bz2"
    valid_cache = atk_valid_file + ".cat.bz2"
    test_cache  = atk_test_file + ".cat.bz2"
    names_cache = atk_train_file + ".cat.json"

    def _print_split(first, second, third):
        # report the shapes and the relative share of rows of three frames
        counts = (first.shape[0], second.shape[0], third.shape[0])
        total = counts[0] + counts[1] + counts[2]
        print ("Train/Valid/Test sizes:", first.shape, second.shape, third.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( *(count/total for count in counts) ) )

    cache_files = (train_cache, valid_cache, test_cache, names_cache)
    if not force and all(os.path.exists(path) for path in cache_files):
        # fast path: everything was pre-processed by an earlier call
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(train_cache)
        valid_cat = pd.read_csv(valid_cache)
        test_cat  = pd.read_csv(test_cache)

        with open(names_cache, 'r') as fp:
            cat_fx = json.load(fp)

        return train_cat, valid_cat, test_cat, cat_fx

    print ("Pre-processing original files...")

    sources = (atk_train_file, atk_valid_file, atk_test_file)
    for source in sources:
        print ("Loading:", source)
    train, valid, test = (pd.read_csv(source) for source in sources)

    _print_split(train, valid, test)

    # encode on the concatenation so the three frames share one code mapping
    full = pd.concat( [train, valid, test] )

    # categorical features are the columns whose dtype is 'object'
    is_object = np.where(full.dtypes=='object')[0]
    cat_fx = list(full.columns.values[is_object])
    full = label_encode(full, cat_fx)
    with open(names_cache, 'w') as fp:
        json.dump(cat_fx, fp)
    print ("CatFX:", cat_fx)

    # re-split by position: boundaries come from train_split / valid_split,
    # not from the sizes of the three input files
    n_rows = full.shape[0]
    train_end = int( n_rows*train_split )
    valid_end = train_end + int( n_rows*valid_split )
    train_cat = full.iloc[:train_end, :]
    valid_cat = full.iloc[train_end:valid_end, :]
    test_cat  = full.iloc[valid_end:, :]

    _print_split(train_cat, valid_cat, test_cat)

    # persist so that later calls take the fast path
    print ("Saving processed files *.cat.bz2")
    for frame, target in ( (train_cat, train_cache),
                           (valid_cat, valid_cache),
                           (test_cat,  test_cache) ):
        frame.to_csv(target, compression="bz2", index=False)

    return train_cat, valid_cat, test_cat, cat_fx


## Adversarial Boosting

In [3]:
def AdvBoosting_gen_data(model, data, groups):
    ''' 
    Select, for every attacked instance, the original row plus one
    adversarial row, according to the current model.

    model  : object exposing predict(features) (the LightGBM model)
    data   : data matrix with all valid attacks (last column is label);
             rows of the same instance are contiguous and the original
             instance is the first row of its group
    groups : number of rows of each group, in row order
    returns the new data matrix and the new group sizes (1 for instances
    without attacks, 2 otherwise)
    
    WARNING: currently works only for binary classification
             (labels are expected to be -1/+1)
    '''
    labels = data[:,-1]
    
    # score the dataset, excluding the label column
    raw_scores = model.predict(data[:,:-1])
    # binarize to -1/+1 (np.float was removed from NumPy, use a where instead)
    predictions = np.where(raw_scores > 0, 1.0, -1.0)
    
    # +1 where the prediction agrees with the label, -1 where it does not
    matchings = labels * predictions
    
    new_selected = [] # global row ids of the selected instances
    new_groups   = []
    
    offset = 0        # global row id of the first row of the current group
    for g in groups:
        if g==0:
            print ("Error !!!!")
        elif g==1:
            # there are no attacks, just add original
            new_selected.append(offset)
            new_groups.append(1)
        else:
            # matching scores of the attacks only (row 0 is the original)
            attack_matchings = matchings[offset+1:offset+g]

            # first attack with the smallest matching, i.e. a misclassified
            # one if any exists; argmin is relative to the attack slice, so
            # shift it back to a global row id
            adv_instance = offset + 1 + int(np.argmin(attack_matchings))

            # add original and adversarial
            new_selected.extend([offset, adv_instance])
            new_groups.append(2)
        
        offset += g
    
    new_dataset = data[new_selected,:]
    
    return new_dataset, new_groups

In [4]:
# Our custom metrics
def binary_log_loss(pred, true_label):
    """Logistic loss log(1 + exp(-pred * true_label)) for -1/+1 labels.

    pred       : raw score (scalar or array)
    true_label : label(s) with the same shape as ``pred``

    Computed as logaddexp(0, -margin), which equals the textbook formula but
    does not overflow to inf when the margin is a large negative number.
    """
    margin = pred * true_label
    return np.logaddexp(0.0, -margin)

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    """Evaluation metric: mean logistic loss over all rows.

    preds      : array of raw scores
    train_data : object exposing get_label() (the LightGBM Dataset)

    Returns the (name, value, is_higher_better) triple expected from a custom
    eval function. The per-row loss log(1 + exp(-pred*label)) is evaluated
    with logaddexp so that a single badly mis-scored row cannot overflow and
    turn the whole average into inf.
    """
    labels = train_data.get_label()
    losses = np.logaddexp(0.0, -preds*labels)
    avg_loss = np.mean(losses)
    
    return 'avg_binary_log_loss', avg_loss, False


def optimize_log_loss(preds, train_data):
    """Custom objective: gradient and hessian of the average logistic loss.

    preds      : array of raw scores
    train_data : object exposing get_label() (the LightGBM Dataset)

    For the per-row loss log(1 + exp(-m)) with margin m = pred*label:
        d/dpred   = -label / (1 + exp(m))              = -label * sigmoid(-m)
        d2/dpred2 = label^2 * exp(m) / (1 + exp(m))^2  = label^2 * sigmoid(m) * sigmoid(-m)
    Both are scaled by 1/len(preds) because the optimized quantity is the
    average loss. The sigmoids are evaluated through logaddexp: the direct
    formula yields inf/inf = NaN in the hessian once exp(m) overflows.

    Returns (grads, hess), two arrays with one entry per row.
    """
    labels = train_data.get_label()
    margins = preds * labels

    # sigmoid(-m) and sigmoid(m), each computed without overflowing exp
    prob_wrong = np.exp(-np.logaddexp(0.0, margins))
    prob_right = np.exp(-np.logaddexp(0.0, -margins))

    grads = -labels * prob_wrong
    hess = labels**2 * prob_right * prob_wrong

    # this is to optimize average logloss
    norm = 1.0/len(preds)
    grads *= norm
    hess *= norm
    
    return grads, hess


# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    """Evaluation metric: average over groups of the worst per-row log loss.

    preds      : array of raw scores
    train_data : object exposing get_label() and get_group()

    Rows are consumed group by group using the sizes from get_group(); for
    each group the largest binary_log_loss is kept and the metric is the mean
    of those maxima. When no group information is available the value is 0.0.

    Returns the (name, value, is_higher_better) triple.
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    avg_max_logloss = 0.0

    if attack_lens is not None:
        worst_per_group = []
        start = 0
        for size in attack_lens:
            stop = start + size
            group_losses = [binary_log_loss(score, target)
                            for score, target in zip(preds[start:stop], labels[start:stop])]
            worst_per_group.append(max(group_losses))
            start = stop

        avg_max_logloss = np.mean(worst_per_group)

    return 'avg_binary_log_loss_under_max_attack', avg_max_logloss, False

def avg_non_interferent_log_loss(preds, train_data, alpha=1.0):
    """Evaluation metric blending the under-attack and the plain log loss.

    preds      : array of raw scores
    train_data : dataset object forwarded to the two underlying metrics
    alpha      : weight of the under-max-attack loss; the plain loss gets
                 weight (1 - alpha)

    Returns the (name, value, is_higher_better) triple; the name embeds alpha.
    """
    # binary logloss under maximal attack (second item of the metric triple)
    loss_uma = avg_log_loss_uma(preds, train_data)[1]

    # binary logloss (plain)
    loss_plain = avg_log_loss(preds, train_data)[1]

    metric_name = 'avg_non_interferent_log_loss [alpha={}]'.format(alpha)
    weighted_loss = alpha*loss_uma + (1.0-alpha)*loss_plain

    return metric_name, weighted_loss, False

def optimize_log_loss_uma(preds, train_data):
    """Custom objective for the log loss under maximal attack.

    preds      : array of raw scores
    train_data : object exposing get_label() and get_group()

    For every group, exp(-pred*label) of each row is divided by the group sum
    of (1 + exp(-pred*label)); that per-row weight w gives
        grad = norm * w * (-label)
        hess = norm * w * (1 - w)
    with norm = 1 / number of groups. Without group information both arrays
    stay at zero.

    Returns (grads, hess) as float64 arrays with one entry per row.
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    grads = np.zeros_like(labels, dtype=np.float64)
    hess = np.zeros_like(grads)

    if attack_lens is None:
        return grads, hess

    norm = 1.0 / float(len(attack_lens))

    start = 0
    for size in attack_lens:
        rows = slice(start, start + size)
        group_labels = labels[rows]

        exp_neg_margin = np.exp(- preds[rows] * group_labels)
        group_scale = 1.0 / np.sum(1.0 + exp_neg_margin)
        weights = group_scale * exp_neg_margin

        grads[rows] = norm * weights * (- group_labels)
        hess[rows]  = norm * weights * (1.0 - weights)

        start += size

    return grads, hess

def optimize_non_interferent_log_loss(preds, train_data, alpha=1.0):
    """Custom objective blending the under-attack and the plain log loss.

    preds      : array of raw scores
    train_data : dataset object forwarded to the two underlying objectives
    alpha      : weight of the under-max-attack terms; the plain terms get
                 weight (1 - alpha)

    Returns (grads, hess), the alpha-weighted sums of the two objectives.
    """
    # binary logloss under maximal attack
    uma_terms = optimize_log_loss_uma(preds, train_data)

    # binary logloss (plain)
    plain_terms = optimize_log_loss(preds, train_data)

    # same convex combination for the gradient and for the hessian
    grads, hess = (alpha*uma + (1.0-alpha)*plain
                   for uma, plain in zip(uma_terms, plain_terms))

    return grads, hess



def AdvBoosting_extend_model(data, cat_fx, input_model=None, num_trees=1, params=None):
    ''' 
    Train ``num_trees`` boosting rounds on ``data``, optionally continuing
    from an existing model.

    data        : data matrix (last column is label)
    cat_fx      : categorical feature spec handed to lightgbm.Dataset;
                  None or an empty sequence falls back to "auto"
    input_model : passed to lightgbm.train as init_model (None = start fresh)
    num_trees   : number of boosting rounds to run
    params      : LightGBM parameter dict
    returns the trained model and the dict of recorded evaluation results

    NOTE(review): fobj / evals_result / verbose_eval appear to have been
    removed from lightgbm.train in LightGBM 4.x -- confirm the installed
    version before upgrading.
    '''
    categorical = cat_fx
    if categorical is None or len(categorical)==0:
        categorical = "auto"

    features, targets = data[:,:-1], data[:,-1]
    lgbm_train = lightgbm.Dataset(data=features,
                                  label=targets,
                                  categorical_feature=categorical)

    lgbm_info = {}
    # the training set doubles as the only evaluation set
    train_options = dict(num_boost_round=num_trees,
                         init_model=input_model,
                         fobj=optimize_log_loss,
                         feval=avg_log_loss,
                         evals_result=lgbm_info,
                         valid_sets=[lgbm_train],
                         valid_names=['adv-train'],
                         verbose_eval=10)
    lgbm_model = lightgbm.train(params, lgbm_train, **train_options)

    return lgbm_model, lgbm_info

In [5]:
def AdvBoosting( atk_train, trees, 
                 cat_fx,
                 params,
                 output_model_file,
                 partial_save=10, 
                 adv_rounds=1):
    ''' 
    Adversarial boosting: alternate between training ``adv_rounds`` trees and
    re-selecting one adversarial row per attacked instance.

    atk_train: DataFrame with all valid attacks; must contain an
               'instance_id' column and have the label as last column.
               It is NOT modified (a copy without 'instance_id' is used).
    trees: total number of trees to be produced
    cat_fx: names of the categorical columns
    params: LightGBM parameter dict
    output_model_file: path of the final model; "<path>.tmp" is used to hand
                       the current model over between rounds
    partial_save: intermediate models are saved when the loop counter is a
                  multiple of this value
    adv_rounds: adversarial instance injecting frequency
    returns the final model and the evaluation results of the last round
    '''
    # temp lgbm file
    temp = output_model_file+".tmp"
    
    # group sizes, ordered by instance id
    # NOTE(review): assumes rows are stored grouped by ascending instance_id
    # with the original instance first in each group -- verify against data
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    # non-inplace drop: the caller's frame keeps its 'instance_id' column, so
    # the function can be called again on the same frame
    features = atk_train.drop('instance_id', axis=1)
    
    # positional index of the categorical features 
    cat_fx = np.where(features.columns.isin(cat_fx) )[0]
    cat_fx = [int(x) for x in cat_fx]

    # prepare data (avoiding pandas)
    atk_data   = features.values

    # first row of each group = original (un-attacked) instance
    original_ids = np.cumsum(atk_groups[:-1])
    original_ids = np.insert(original_ids, 0, 0)
    
    # train first trees on the original instances only
    model, model_info = AdvBoosting_extend_model( atk_data[original_ids, :], 
                                                  cat_fx=cat_fx,
                                                  input_model=None, 
                                                  num_trees=adv_rounds, 
                                                  params=params )
    
    
    # train remaining trees
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # attack dataset with the current model (group sizes are not needed)
        adv_data, _ = AdvBoosting_gen_data( model, atk_data, atk_groups )
        
        # train additional trees, continuing from the model saved on disk
        model.save_model(temp)
        model, model_info = AdvBoosting_extend_model( adv_data, 
                                                      cat_fx=cat_fx,
                                                      input_model=temp, 
                                                      num_trees=adv_rounds, 
                                                      params=params)
        # save partial model
        if t%partial_save==0 and t!=trees:
            partial_filename = "{}.T{:03d}.lgbm".format(output_model_file, t)
            model.save_model( filename=partial_filename )
            
    model.save_model(filename=output_model_file)
    
    return model, model_info

# Scripting wrapper

In [6]:
def run_training( atk_train_file, atk_valid_file, atk_test_file,
                  output_model_file,
                  num_trees,
                  learning_rate,
                  num_leaves,
                  partial_save,
                  adversarial_rounds):
    """Scripting entry point: load the data and run AdvBoosting on the train split.

    atk_train_file, atk_valid_file, atk_test_file : input files handed to
                                                    load_atk_train_valid_test
    output_model_file  : where the final model is written
    num_trees          : total number of trees
    learning_rate      : converted with float() before use
    num_leaves         : converted with int() before use
    partial_save       : forwarded to AdvBoosting
    adversarial_rounds : forwarded to AdvBoosting as adv_rounds

    Returns whatever AdvBoosting returns. The valid and test splits are
    loaded but not used here.
    """
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(atk_train_file,
                                                           atk_valid_file,
                                                           atk_test_file)

    # train params (cast so that string inputs are accepted)
    lgbm_params = {}
    lgbm_params['learning_rate'] = float(learning_rate)
    lgbm_params['num_leaves'] = int(num_leaves)

    # start training
    boosting_options = dict(trees=num_trees,
                            cat_fx=cat_fx,
                            output_model_file=output_model_file,
                            adv_rounds=adversarial_rounds,
                            partial_save=partial_save,
                            params=lgbm_params)
    return AdvBoosting(train, **boosting_options)

# Gradient Boosting Baseline

Some results:

       learning rate  num leaves  num trees  avg_binary_log_loss
    0            0.1        16.0      196.0             0.297940
    1            0.1        32.0      197.0             0.301215
    2            0.5        16.0       81.0             0.297984
    3            0.5        32.0       22.0             0.305828
    
I chose the following setting:

        learning rate  num leaves  num trees  avg_binary_log_loss
    2            0.5        16.0       81.0             0.297984

The final model is saved here: `../out/models/lgbm_census_T81_L16_S050.model`

**No need to run this again**

In [31]:
def train_gradient_boosting_baseline( train_file, valid_file, test_file,
                                      output_model_file):
    """Grid-search a plain LightGBM baseline and save the best model per setting.

    train_file, valid_file, test_file : input files handed to
                                        load_atk_train_valid_test
    output_model_file : prefix of the saved models; the number of trees, the
                        number of leaves and the learning rate are appended

    For every (learning_rate, num_leaves) pair a model with up to 200 trees is
    trained, the iteration with the lowest validation 'avg_binary_log_loss' is
    located, and the model truncated at that iteration is saved.

    Returns a DataFrame with one row per setting and the columns
    'learning rate', 'num leaves', 'num trees', 'avg_binary_log_loss'.
    Integer settings are stored as integers.
    """
    columns = ['learning rate', 'num leaves', 'num trees', 'avg_binary_log_loss']
    # rows are collected here and turned into a frame once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0)
    records = []
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # positional index of the categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx) )[0]
    cat_fx = [int(x) for x in cat_fx]
    print ("CatFX:", train.columns.values[cat_fx])
    
    # materialize the matrices once; the last column is the label
    train_values = train.values
    valid_values = valid.values

    num_trees     = 200
    for learning_rate in [0.1, 0.5]:
        for num_leaves in [16, 32]:
    
            # datasets
            lgbm_train = lightgbm.Dataset(data=train_values[:,:-1], 
                                          label=train_values[:,-1],
                                          categorical_feature = cat_fx)

            lgbm_valid = lightgbm.Dataset(data=valid_values[:,:-1], 
                                          label=valid_values[:,-1],
                                          categorical_feature = cat_fx)

            # run train
            lgbm_params = { 'learning_rate': learning_rate, 
                            'num_leaves': num_leaves} 
            lgbm_info = {}
            lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                        num_boost_round = num_trees,
                                        fobj            = optimize_log_loss, 
                                        feval           = avg_log_loss,
                                        evals_result    = lgbm_info,
                                        valid_sets      = [lgbm_train, lgbm_valid], 
                                        valid_names     = ['train', 'valid'],
                                        verbose_eval    = 10)

            # best iteration on the validation set (0-based position)
            valid_losses = lgbm_info['valid']['avg_binary_log_loss']
            best_valid_iter = int(np.argmin(valid_losses))
            best_num_trees = best_valid_iter + 1

            # save file
            model_file_name = "{:s}_T{:d}_L{:d}_S{:03d}.model".format(output_model_file,
                                                               best_num_trees,
                                                               num_leaves,
                                                               int(learning_rate*100) )
            lgbm_model.save_model(model_file_name, num_iteration=best_num_trees)
            print ("Model saved to", model_file_name)

            # update experimental results
            records.append( {  'learning rate':learning_rate, 
                               'num leaves':num_leaves, 
                               'num trees':best_num_trees, 
                               'avg_binary_log_loss':valid_losses[best_valid_iter] } )
    
    return pd.DataFrame(records, columns=columns)

# enable/disable LGBM Baseline
# (kept disabled: flip the condition to re-run the baseline grid search)
if False:
    experiments = train_gradient_boosting_baseline(
        train_file="../data/census/train_ori.csv.bz2",
        valid_file="../data/census/valid_ori.csv.bz2",
        test_file="../data/census/test_ori.csv.bz2",
        output_model_file="../out/models/lgbm_census",
    )

    # persist the summary table, then show it
    experiments.to_csv('LGBM_Census_Baseline.csv')

    print (experiments)

# Adversarial Boosting Training

**Nota**: Adversarial Boosting mi pare a volte più veloce della baseline. È corretto?!

In [47]:
def train_adversarial_boosting( train_file, valid_file, test_file,
                                output_model_file,
                                learning_rate=0.5,
                                num_leaves=16):
    """Train a LightGBM model on the given files and save its best iteration.

    train_file, valid_file, test_file : input files handed to
                                        load_atk_train_valid_test
    output_model_file : prefix of the saved model; the number of trees, the
                        number of leaves and the learning rate are appended
    learning_rate, num_leaves : LightGBM parameters

    A model with up to 500 trees is trained, the iteration with the lowest
    validation 'avg_binary_log_loss' is located, and the model truncated at
    that iteration is saved.

    Returns a DataFrame with the columns 'learning rate', 'num leaves',
    'num trees', 'avg_binary_log_loss'. Integer settings are stored as
    integers.

    NOTE(review): despite its name this function never calls AdvBoosting; it
    runs lightgbm.train directly on every column except the last one -- confirm
    this is the intended training procedure.
    """
    columns = ['learning rate', 'num leaves', 'num trees', 'avg_binary_log_loss']
    # rows are collected here and turned into a frame once at the end
    # (DataFrame.append no longer exists in pandas >= 2.0)
    records = []
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # positional index of the categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx) )[0]
    cat_fx = [int(x) for x in cat_fx]
    print ("CatFX:", train.columns.values[cat_fx])
    
    # materialize the matrices once; the last column is the label
    train_values = train.values
    valid_values = valid.values

    for num_trees in [500]:
        # datasets
        lgbm_train = lightgbm.Dataset(data=train_values[:,:-1], 
                                      label=train_values[:,-1],
                                      categorical_feature = cat_fx)

        lgbm_valid = lightgbm.Dataset(data=valid_values[:,:-1], 
                                      label=valid_values[:,-1],
                                      categorical_feature = cat_fx)

        # run train
        lgbm_params = { 'learning_rate': learning_rate, 
                        'num_leaves': num_leaves} 
        lgbm_info = {}
        lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                    num_boost_round = num_trees,
                                    fobj            = optimize_log_loss, 
                                    feval           = avg_log_loss,
                                    evals_result    = lgbm_info,
                                    valid_sets      = [lgbm_train, lgbm_valid], 
                                    valid_names     = ['train', 'valid'],
                                    verbose_eval    = 10)
        
        # best iteration on the validation set (0-based position)
        valid_losses = lgbm_info['valid']['avg_binary_log_loss']
        best_valid_iter = int(np.argmin(valid_losses))
        best_num_trees = best_valid_iter + 1

        # save file
        model_file_name = "{:s}_T{:d}_L{:d}_S{:03d}.model".format(output_model_file,
                                                           best_num_trees,
                                                           num_leaves,
                                                           int(learning_rate*100) )
        lgbm_model.save_model(model_file_name, num_iteration=best_num_trees)
        print ("Model saved to", model_file_name)

        # update experimental results
        records.append( {  'learning rate':learning_rate, 
                           'num leaves':num_leaves, 
                           'num trees':best_num_trees, 
                           'avg_binary_log_loss':valid_losses[best_valid_iter] } )
    
    return pd.DataFrame(records, columns=columns)

In [38]:
# enable/disable
# (kept disabled: flip the condition to train on the un-attacked census files)
if False:
    experiments = train_adversarial_boosting(
        train_file="../data/census/train_ori.csv.bz2",
        valid_file="../data/census/valid_ori.csv.bz2",
        test_file="../data/census/test_ori.csv.bz2",
        output_model_file="../out/models/AdvBoost_census",
    )

    # persist the summary table, then show it
    experiments.to_csv('AdvBoosting_Census_Baseline.csv')

    print (experiments)

# Under Attacks

In [48]:
# one training run per value of B; B selects the *_B<value> input files and
# is embedded in the names of the saved model and of the results CSV
for B in [5, 15, 150, 300]:

    experiments = train_adversarial_boosting(
        train_file="../data/census/train_B{:d}.csv.bz2".format(B),
        valid_file="../data/census/valid_B{:d}.csv.bz2".format(B),
        test_file="../data/census/test_B{:d}.csv.bz2".format(B),
        output_model_file="../out/models/AdvBoost_census_B{:d}".format(B),
    )

    # persist the summary table of this run, then show it
    experiments.to_csv('AdvBoosting_Census_B{:d}.csv'.format(B))

    print (experiments)

Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']




[10]	train's avg_binary_log_loss: 0.0881606	valid's avg_binary_log_loss: 0.088576
[20]	train's avg_binary_log_loss: 0.0845659	valid's avg_binary_log_loss: 0.0859332
[30]	train's avg_binary_log_loss: 0.0763656	valid's avg_binary_log_loss: 0.0786738
[40]	train's avg_binary_log_loss: 0.0753332	valid's avg_binary_log_loss: 0.078013
[50]	train's avg_binary_log_loss: 0.0746124	valid's avg_binary_log_loss: 0.0777822
[60]	train's avg_binary_log_loss: 0.0685185	valid's avg_binary_log_loss: 0.0721167
[70]	train's avg_binary_log_loss: 0.0679041	valid's avg_binary_log_loss: 0.0720017
[80]	train's avg_binary_log_loss: 0.0673566	valid's avg_binary_log_loss: 0.0719959
[90]	train's avg_binary_log_loss: 0.0668565	valid's avg_binary_log_loss: 0.0719737
[100]	train's avg_binary_log_loss: 0.0663494	valid's avg_binary_log_loss: 0.0719345
[110]	train's avg_binary_log_loss: 0.063515	valid's avg_binary_log_loss: 0.0692359
[120]	train's avg_binary_log_loss: 0.0631388	valid's avg_binary_log_loss: 0.0691593
[130

[400]	train's avg_binary_log_loss: 0.0331553	valid's avg_binary_log_loss: 0.0362885
[410]	train's avg_binary_log_loss: 0.0331064	valid's avg_binary_log_loss: 0.0363127
[420]	train's avg_binary_log_loss: 0.0330635	valid's avg_binary_log_loss: 0.0363284
[430]	train's avg_binary_log_loss: 0.0330199	valid's avg_binary_log_loss: 0.0363315
[440]	train's avg_binary_log_loss: 0.0329757	valid's avg_binary_log_loss: 0.0363178
[450]	train's avg_binary_log_loss: 0.0329426	valid's avg_binary_log_loss: 0.0363124
[460]	train's avg_binary_log_loss: 0.0324043	valid's avg_binary_log_loss: 0.0357645
[470]	train's avg_binary_log_loss: 0.0323734	valid's avg_binary_log_loss: 0.035769
[480]	train's avg_binary_log_loss: 0.0317113	valid's avg_binary_log_loss: 0.0350373
[490]	train's avg_binary_log_loss: 0.0316763	valid's avg_binary_log_loss: 0.0350377
[500]	train's avg_binary_log_loss: 0.0312486	valid's avg_binary_log_loss: 0.034587
Model saved to ../out/models/AdvBoost_census_B15_T500_L16_S050.model
   learni

[200]	train's avg_binary_log_loss: 0.0118413	valid's avg_binary_log_loss: 0.0118646
[210]	train's avg_binary_log_loss: 0.011839	valid's avg_binary_log_loss: 0.0118617
[220]	train's avg_binary_log_loss: 0.011837	valid's avg_binary_log_loss: 0.0118598
[230]	train's avg_binary_log_loss: 0.0118353	valid's avg_binary_log_loss: 0.0118583
[240]	train's avg_binary_log_loss: 0.0118337	valid's avg_binary_log_loss: 0.0118566
[250]	train's avg_binary_log_loss: 0.0118322	valid's avg_binary_log_loss: 0.0118554
[260]	train's avg_binary_log_loss: 0.0118306	valid's avg_binary_log_loss: 0.011854
[270]	train's avg_binary_log_loss: 0.0118289	valid's avg_binary_log_loss: 0.0118524
[280]	train's avg_binary_log_loss: 0.0118272	valid's avg_binary_log_loss: 0.0118507
[290]	train's avg_binary_log_loss: 0.0118255	valid's avg_binary_log_loss: 0.0118491
[300]	train's avg_binary_log_loss: 0.0118239	valid's avg_binary_log_loss: 0.0118474
[310]	train's avg_binary_log_loss: 0.0118222	valid's avg_binary_log_loss: 0.011

In [50]:
!ls -lht ../out/models/

total 2.3M
-rw-rw-r-- 1 lucchese lucchese 155K Feb 19 11:39 AdvBoost_census_B300_T500_L16_S050.model
-rw-rw-r-- 1 lucchese lucchese 212K Feb 19 11:36 AdvBoost_census_B150_T499_L16_S050.model
-rw-rw-r-- 1 lucchese lucchese 407K Feb 19 11:34 AdvBoost_census_B15_T500_L16_S050.model
-rw-rw-r-- 1 lucchese lucchese 490K Feb 19 11:33 AdvBoost_census_B5_T365_L16_S050.model
-rw-rw-r-- 1 lucchese lucchese  60K Feb 19 11:24 lgbm_census_T22_L32_S050.model
-rw-rw-r-- 1 lucchese lucchese 115K Feb 19 11:24 lgbm_census_T81_L16_S050.model
-rw-rw-r-- 1 lucchese lucchese 535K Feb 19 11:24 lgbm_census_T197_L32_S010.model
-rw-rw-r-- 1 lucchese lucchese 278K Feb 19 11:24 lgbm_census_T196_L16_S010.model


In [None]:
#!ls -lht ../data/census/

In [None]:
#!ls -lht ../out/models

In [None]:
# debug running time
#%load_ext line_profiler
# %lprun -f AdvBoosting  AdvBoosting(train, trees=3)
# %lprun -f AdvBoosting_gen_data  AdvBoosting(train, trees=3)

In [None]:
#!rm ../out/models/*.lgbm