# Training Models

This notebook contains the code used for training the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.metrics import f1_score

# Data Preparation

In [2]:
def label_encode(dataset, categorical_features):
    """Return a copy of ``dataset`` with its categorical columns integer-encoded.

    Every column whose name appears in ``categorical_features`` is cast to the
    pandas ``category`` dtype and then replaced by its category codes stored
    as ``int32``. The remaining columns and the input frame itself are left
    untouched.
    """
    encoded = dataset.copy()
    selected = [name for name in encoded.columns if name in categorical_features]
    for name in selected:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes.astype(np.int32)
    return encoded

def _print_split_summary(first, second, third):
    """Print the shapes of three frames and the fraction of rows held by each."""
    print ("Train/Valid/Test sizes:", first.shape, second.shape, third.shape)
    total_rows = first.shape[0] + second.shape[0] + third.shape[0]
    print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
               .format( first.shape[0]/total_rows,
                        second.shape[0]/total_rows,
                        third.shape[0]/total_rows ) )


def load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file, 
                              train_split=0.6, valid_split=0.2, force=False):
    """Load the train/valid/test CSV files, label-encoded and re-split.

    The three files are read, concatenated, label-encoded together (so that
    every split shares the same category codes) and cut again into
    train/valid/test according to ``train_split`` and ``valid_split``; the
    test split receives whatever is left. The result is cached next to the
    inputs as ``<file>.cat.bz2`` plus ``<train file>.cat.json`` (the list of
    categorical column names). The cache is reused on later calls unless
    ``force`` is true or one of the cached files is missing.

    When the training file has an ``instance_id`` column, the ids of the
    valid and test files are shifted so that they follow the previous file,
    and the split is made on whole instance groups rather than on rows.
    NOTE(review): the shift is read with ``iloc[-1, 0]``, i.e. it assumes
    ``instance_id`` is the first column and that the last row carries the
    largest id - confirm against the data files.

    Returns ``(train, valid, test, cat_fx)``: three DataFrames and the list
    of categorical column names.
    """
    train_cache = atk_train_file + ".cat.bz2"
    valid_cache = atk_valid_file + ".cat.bz2"
    test_cache = atk_test_file + ".cat.bz2"
    cat_fx_cache = atk_train_file + ".cat.json"

    cache_complete = all(os.path.exists(path) for path in
                         (train_cache, valid_cache, test_cache, cat_fx_cache))

    # fast path: everything was pre-processed by an earlier call
    if cache_complete and not force:
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(train_cache)
        valid_cat = pd.read_csv(valid_cache)
        test_cat = pd.read_csv(test_cache)

        with open(cat_fx_cache, 'r') as fp:
            cat_fx = json.load(fp)

        return train_cat, valid_cat, test_cat, cat_fx

    print ("Pre-processing original files...")

    for source in (atk_train_file, atk_valid_file, atk_test_file):
        print ("Loading:", source)

    train = pd.read_csv(atk_train_file)
    valid = pd.read_csv(atk_valid_file)
    test = pd.read_csv(atk_test_file)

    _print_split_summary(train, valid, test)

    # decide how many rows go into the new train and valid splits
    if 'instance_id' in train.columns.values:
        print ('   ... with instance ids')
        # make the instance ids increasing across the three files
        valid['instance_id'] += train.iloc[-1,0]
        test['instance_id'] += valid.iloc[-1,0]
        assert max(train['instance_id'])<min(valid['instance_id']), "Instance ID mismatch"
        assert max(valid['instance_id'])<min(test['instance_id']), "Instance ID mismatch"

        # number of rows of every instance group, in file order
        per_frame_counts = [frame['instance_id'].value_counts().sort_index().values
                            for frame in (train, valid, test)]
        groups = np.concatenate(per_frame_counts)

        num_train_groups = int( len(groups)*train_split )
        num_valid_groups = int( len(groups)*valid_split )
        train_size = sum(groups[:num_train_groups])
        valid_size = sum(groups[num_train_groups:num_train_groups+num_valid_groups])
    else:
        full_size = len(train) + len(valid) + len(test)
        train_size = int( full_size*train_split )
        valid_size = int( full_size*valid_split )

    # encode on the concatenation so all splits share the same category codes
    full = pd.concat( [train, valid, test] )

    # categorical features are the columns pandas loaded with object dtype
    object_columns = np.where(full.dtypes=='object')[0]
    cat_fx = list(full.columns.values[object_columns])
    full = label_encode(full, cat_fx)
    with open(cat_fx_cache, 'w') as fp:
        json.dump(cat_fx, fp)
    print ("CatFX:", cat_fx)

    valid_end = train_size + valid_size
    train_cat = full.iloc[0:train_size,:]
    valid_cat = full.iloc[train_size:valid_end,:]
    test_cat = full.iloc[valid_end:,:]

    assert len(train_cat)+len(valid_cat)+len(test_cat)==len(full), "Split sizes mismatch"

    _print_split_summary(train_cat, valid_cat, test_cat)

    # save to file
    print ("Saving processed files *.cat.bz2")
    for frame, target in ((train_cat, train_cache),
                          (valid_cat, valid_cache),
                          (test_cat, test_cache)):
        frame.to_csv(target, compression="bz2", index=False)

    return train_cat, valid_cat, test_cat, cat_fx


# Objective Functions

## Standard

The following function, called <code>optimize_log_loss</code>, is the one that should be optimized (i.e., minimized) for learning _standard_ and _baseline_ approaches. More specifically, this is the standard binary log loss which is used to train any _standard_ or _baseline_ model.

# $L$ = <code>optimize_log_loss</code>

$$
L = \frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}}\ell(h(\mathbf{x}), y)
$$

where:

$$
\ell(h(\mathbf{x}), y) = \log\left(1+e^{-y\,h(\mathbf{x})}\right)
$$

In [3]:
def optimize_log_loss(preds, train_data):
    """Gradient and Hessian of the average binary log loss (LightGBM ``fobj``).

    The per-instance loss is ``log(1 + exp(-y * h))`` with ``h = preds`` and
    ``y = train_data.get_label()``.
    NOTE(review): the formula is the one for labels in {-1, +1}; confirm the
    data files use that encoding.

    Parameters
    ----------
    preds : array of raw scores, one per instance.
    train_data : object exposing ``get_label()`` (a LightGBM ``Dataset``).

    Returns
    -------
    (grads, hess) : first and second derivative of the loss with respect to
        each score, both scaled by ``1 / len(preds)`` so that the optimized
        quantity is the *average* loss.
    """
    labels = train_data.get_label()
    margins = preds * labels

    # s = 1 / (1 + e^m). Going through logaddexp keeps the computation finite
    # for large |m|; the naive e^m / (1 + e^m)^2 evaluates to inf/inf = NaN
    # as soon as e^m overflows, and a NaN Hessian poisons the whole tree.
    neg_sigmoid = np.exp(-np.logaddexp(0.0, margins))

    # this is to optimize average logloss
    norm = 1.0/len(preds)

    # d/dh   log(1 + e^{-y h}) = -y / (1 + e^{y h})
    grads = -labels * neg_sigmoid * norm
    # d2/dh2 log(1 + e^{-y h}) = y^2 e^{y h} / (1 + e^{y h})^2 = y^2 s (1 - s)
    hess = labels**2 * neg_sigmoid * (1.0 - neg_sigmoid) * norm

    return grads, hess

## Custom

In addition to the standard binary log loss used to train a model, we introduce our custom <code>optimize_non_interferent_log_loss</code>, which is computed as the weighted combination of two objective functions, as follows:

-  $L$ = <code>optimize_log_loss</code> (standard, already seen above);
-  $L^A$ = <code>optimize_log_loss_uma</code> (custom, defined below).

# $L^A$ = <code>optimize_log_loss_uma</code>

This function is used to train a **full** _non-interferent_ model; in other words, full non-interferent models are learned by optimizing (i.e., minimizing) the function which measures the binary log loss **under the maximal attack** possible.

$$
L^A = \frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \log  \left( \sum_{\mathbf{x}' \in \mathit{MaxAtk}({\mathbf{x}},{A})} e^{\ell(h(\mathbf{x}'), y)} \right).
$$

where still:

$$
\ell(h(\mathbf{x}), y) = \log\left(1+e^{-y\,h(\mathbf{x})}\right)
$$

In [4]:
def optimize_log_loss_uma(preds, train_data):
    """Gradient and Hessian of the log loss under maximal attack (group-based).

    Rows are processed in consecutive groups whose sizes come from
    ``train_data.get_group()``. Within a group, each row's exponential term
    ``exp(-y * h)`` is divided by the group total ``sum(1 + exp(-y * h))``;
    gradient and Hessian are derived from that share and scaled by
    ``1 / number_of_groups``.

    Returns ``(grads, hess)`` as float64 arrays shaped like the labels. Both
    are all zeros when the dataset carries no group information.
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    grads = np.zeros_like(labels, dtype=np.float64)
    hess = np.zeros_like(grads)

    if attack_lens is None:
        return grads, hess

    # average over the attacked groups, not over the rows
    scale = 1.0 / float(len(attack_lens))

    start = 0
    for size in attack_lens:
        rows = slice(start, start + size)
        group_labels = labels[rows]

        neg_margin_exp = np.exp(- preds[rows] * group_labels)
        inv_total = 1.0 / np.sum(1.0 + neg_margin_exp)
        share = inv_total * neg_margin_exp

        grads[rows] = scale * share * (- group_labels)
        hess[rows] = scale * share * (1.0 - share)

        start += size

    return grads, hess

In [5]:
def optimize_log_loss_uma_ext(preds, train_data):
    """Weight-based variant of the log loss under maximal attack.

    Instead of normalising inside attack groups, each row's ``exp(-y * h)``
    is multiplied by the per-row weight from ``train_data.get_weight()``.
    NOTE(review): the weights presumably hold a precomputed per-group
    normaliser - confirm against the code that builds the dataset.

    Returns ``(grads, hess)``, both scaled by ``1 / len(labels)``.
    """
    labels = train_data.get_label()
    weights = train_data.get_weight()

    scale = 1.0 / float(len(labels))
    weighted_exp = weights * np.exp(- preds * labels)

    grads = scale * weighted_exp * (- labels)
    hess = scale * weighted_exp * (1.0 - weighted_exp)
    return grads, hess

# <code>optimize_non_interferent_log_loss</code>

$$
\alpha\cdot L^A + (1-\alpha)\cdot L
$$

$$
\alpha \cdot \underbrace{\Bigg[\frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \log  \left( \sum_{\mathbf{x}' \in \mathit{MaxAtk}({\mathbf{x}},{A})} e^{\ell(h(\mathbf{x}'), y)} \right)\Bigg]}_{L^A} + (1-\alpha) \cdot \underbrace{\Bigg[\frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \ell(h(\mathbf{x}), y)\Bigg]}_{L}
$$

In [39]:
def optimize_non_interferent_log_loss(preds, train_data, alpha=1.0):
    """Blend the attack-aware and the plain log-loss objectives.

    Returns ``(grads, hess)`` where each is
    ``alpha * <under-max-attack term> + (1 - alpha) * <plain term>``.
    The attack-aware term comes from the weight-based
    ``optimize_log_loss_uma_ext`` (not the group-based
    ``optimize_log_loss_uma``); the plain term comes from
    ``optimize_log_loss``.
    """
    # binary logloss under maximal attack (weight-based variant)
    uma_grads, uma_hess = optimize_log_loss_uma_ext(preds, train_data)

    # binary logloss (plain)
    plain_grads, plain_hess = optimize_log_loss(preds, train_data)

    # convex combination of the two objectives
    grads = alpha*uma_grads + (1.0-alpha)*plain_grads
    hess = alpha*uma_hess + (1.0-alpha)*plain_hess

    return grads, hess

In [36]:
def optimize_non_interferent_log_loss_claudio(preds, train_data, alpha=1.0):
    """Blend the group-based attack-aware objective with the plain log loss.

    Same combination as ``optimize_non_interferent_log_loss``, but the
    attack-aware term is taken from ``optimize_log_loss_uma`` (which uses the
    dataset groups). Returns ``(grads, hess)`` with each equal to
    ``alpha * <under-max-attack term> + (1 - alpha) * <plain term>``.
    """
    uma_terms = optimize_log_loss_uma(preds, train_data)
    plain_terms = optimize_log_loss(preds, train_data)

    # combine gradient with gradient and Hessian with Hessian
    grads, hess = (alpha*uma + (1.0-alpha)*plain
                   for uma, plain in zip(uma_terms, plain_terms))

    return grads, hess

## Using one objective function for both _standard_ and _non-interferent_ learning

The advantage of the <code>optimize_non_interferent_log_loss</code> function defined above is that we can wrap it so that we can use it as the only objective function (<code>fobj</code>) passed in to LightGBM. 

In other words, if we call <code>fobj=optimize_non_interferent_log_loss</code> with <code>alpha=0.0</code>, this will end up optimizing (i.e., minimizing) the "vanilla" objective function (i.e., the standard binary log loss, defined by the function <code>optimize_log_loss</code> above).

Conversely, calling <code>fobj=optimize_non_interferent_log_loss</code> with <code>alpha=1.0</code> turns into optimizing (i.e., minimizing) the full non-interferent objective function (i.e., the custom binary log loss under max attack, defined by the function <code>optimize_log_loss_uma</code> above).

Anything that sits in between (i.e., <code>0 < alpha < 1</code>) optimizes an objective function that trades off between the standard and the full non-interferent term.

# Evaluation Metrics

## Standard

The following function is the one used for evaluating the quality of the learned model (either _standard_, _adversarial-boosting_, or _non-interferent_). This is the standard <code>avg_log_loss</code>.

In [8]:
def logistic(x):
    """Logistic (sigmoid) function ``1 / (1 + exp(-x))``, element-wise."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [9]:
def logit(p):
    """Log-odds ``log(p / (1 - p))`` of a probability ``p``, element-wise."""
    odds = p / (1 - p)
    return np.log(odds)

# <code>avg_log_loss</code>

In [10]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    """Average binary log loss ``mean(log(1 + exp(-y * h)))`` (LightGBM ``feval``).

    Parameters
    ----------
    preds : array of raw scores, one per instance.
    train_data : object exposing ``get_label()`` (a LightGBM ``Dataset``).

    Returns
    -------
    ('avg_binary_log_loss', value, False) : metric name, metric value and the
        "higher is better" flag (``False``: lower is better).
    """
    labels = train_data.get_label()

    # logaddexp(0, -m) == log(1 + exp(-m)), but it does not overflow to inf
    # when a row is badly misclassified (very negative margin m)
    losses = np.logaddexp(0.0, -preds*labels)
    avg_loss = np.mean(losses)

    return 'avg_binary_log_loss', avg_loss, False

## Custom

Similarly to what we have done for <code>fobj</code>, <code>feval</code> can be computed from a weighted combination of two evaluation metrics:

-  <code>avg_log_loss</code> (standard, defined above);
-  <code>avg_log_loss_uma</code> (custom, defined below).

# <code>avg_log_loss_uma</code>

This is the binary log loss yet modified to operate on groups of perturbed instances.

In [11]:
# Our custom metrics
def binary_log_loss(pred, true_label):
    """Binary log loss ``log(1 + exp(-pred * true_label))``, element-wise.

    Accepts scalars or arrays. Evaluated with ``np.logaddexp`` so that a
    strongly misclassified instance (very negative ``pred * true_label``)
    yields its finite loss instead of overflowing to ``inf``.
    """
    return np.logaddexp(0.0, -pred * true_label)

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    """Average, over attack groups, of the worst binary log loss in each group.

    Rows are read in consecutive groups whose sizes come from
    ``train_data.get_group()``. For every group the largest
    ``binary_log_loss`` among its rows is kept, and the metric is the mean of
    those maxima. Without group information the metric value is ``0.0``.

    Returns ``('avg_binary_log_loss_under_max_attack', value, False)``.
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    if attack_lens is None:
        return 'avg_binary_log_loss_under_max_attack', 0.0, False

    worst_losses = []
    start = 0
    for size in attack_lens:
        stop = start + size
        group_losses = [binary_log_loss(score, target)
                        for score, target in zip(preds[start:stop], labels[start:stop])]
        worst_losses.append(max(group_losses))
        start = stop

    return 'avg_binary_log_loss_under_max_attack', np.mean(worst_losses), False

# <code>feval=avg_non_interferent_log_loss</code>

Used for measuring the validity of any model (either _standard_, _baseline_, or _non-interferent_). More precisely, <code>avg_non_interferent_log_loss</code> is the weighted sum of the binary log loss and the binary log loss under maximal attack.

In [12]:
def avg_non_interferent_log_loss(preds, train_data, alpha=1.0):
    """Weighted sum of the log loss under max attack and the plain log loss.

    ``value = alpha * loss_uma + (1 - alpha) * loss_plain`` where

    * ``loss_uma`` is the value returned by ``avg_log_loss_uma``;
    * ``loss_plain`` is the mean ``binary_log_loss`` over the *first* row of
      every attack group (the group sizes come from
      ``train_data.get_group()``).

    When the dataset has no group information every row is scored for
    ``loss_plain``. The previous code built an empty float index array in
    that case and failed with an IndexError.
    NOTE(review): treating each row as its own instance is an assumption;
    ``avg_log_loss_uma`` reports 0.0 for such a dataset, so only the plain
    term contributes.

    Returns ``(name, value, False)`` with the alpha embedded in the name.
    """
    # binary logloss under maximal attack
    _, loss_uma, _ = avg_log_loss_uma(preds, train_data)

    # binary logloss (plain), measured on the first row of each group only
    labels = train_data.get_label()
    attack_lens = train_data.get_group()
    if attack_lens is None:
        losses = binary_log_loss(pred=preds, true_label=labels)
    else:
        sizes = np.asarray(attack_lens, dtype=np.int64)
        # start offset of each group = rows consumed by the groups before it
        first_rows = np.cumsum(sizes) - sizes
        losses = binary_log_loss(pred=preds[first_rows], true_label=labels[first_rows])
    loss_plain = np.mean(losses)

    # combine the above two losses together
    weighted_loss = alpha*loss_uma + (1.0-alpha)*loss_plain

    return 'avg_non_interferent_log_loss [alpha={:.2f}]'.format(alpha), weighted_loss, False

# Adversarial Boosting

In [13]:
def gen_adv_boosting_data(model, data, groups, num_atks=1):
    ''' 
    Select, for every instance, its first row plus its most damaging attacks.

    model    : object with a predict(features) method (the LightGBM model)
    data     : data matrix with all valid attacks (last column is label);
               the first row of each group is kept as the original instance
    groups   : number of consecutive rows belonging to each instance
    num_atks : how many attacked rows to keep per instance
    returns the new data matrix and the new group sizes (a list)

    WARNING: currently works only for binary classification
    '''
    labels = data[:, -1]

    # margin = label * score; the smaller it is, the worse the prediction
    margins = labels * model.predict(data[:, :-1])

    kept_rows = []    # row ids of the selected instances
    kept_groups = []

    start = 0
    for size in groups:
        if size == 0:
            print ("Error !!!!")
        elif size == 1:
            # there are no attacks, just keep the original
            kept_rows.append(start)
            kept_groups.append(1)
        else:
            # rank the attacked rows only (the first row is the original)
            attacked_margins = margins[start + 1:start + size]
            worst = np.argsort(attacked_margins)[:num_atks]
            worst += start + 1

            kept_rows.append(start)
            kept_rows.extend(worst)
            kept_groups.append(1 + len(worst))

        start += size

    return data[kept_rows, :], kept_groups

In [14]:
def extend_adv_boosting_model(train, valid, cat_fx, input_model=None, num_trees=1, params=None):
    ''' 
    Train num_trees boosting rounds with the plain log-loss objective.

    train, valid : data matrices (last column is label)
    cat_fx       : indices of the categorical features; None or empty
                   falls back to "auto"
    input_model  : passed to lightgbm.train as init_model (None starts a
                   new model)
    num_trees    : number of boosting rounds to run
    params       : LightGBM parameters
    returns the trained model and the evaluation history recorded on the
    'train' and 'valid' sets
    '''
    categorical = cat_fx if (cat_fx is not None and len(cat_fx) != 0) else "auto"

    assert train.shape[1]==valid.shape[1], "Train/Valid Mismatch!"

    # same construction for both matrices: features are all columns but the last
    lgbm_train, lgbm_valid = (
        lightgbm.Dataset(data=matrix[:,:-1],
                         label=matrix[:,-1],
                         categorical_feature=categorical)
        for matrix in (train, valid)
    )

    history = {}
    booster = lightgbm.train(params, lgbm_train,
                             num_boost_round=num_trees,
                             init_model=input_model,
                             fobj=optimize_log_loss,
                             feval=avg_log_loss,
                             evals_result=history,
                             valid_sets=[lgbm_train, lgbm_valid],
                             valid_names=['train', 'valid'],
                             verbose_eval=5)

    return booster, history

In [15]:
def AdvBoosting(atk_train, atk_valid, trees, 
                 cat_fx,
                 params,
                 output_model_file,
                 partial_save=1000, 
                 adv_rounds=1):
    ''' 
    Adversarial boosting: grow the model in batches of adv_rounds trees,
    re-selecting the most damaging attacked instances before every batch.

    atk_train, atk_valid : DataFrames with all valid attacks; the first
                           column is dropped as the instance id and the last
                           column is the label
    trees             : total number of trees to be produced
    cat_fx            : names of the categorical columns
    params            : LightGBM parameters ('learning_rate' and
                        'num_leaves' are also used to name partial models)
    output_model_file : prefix for the temporary and partial model files
    partial_save      : a partial model is saved whenever the tree counter
                        is a multiple of this value
    adv_rounds        : adversarial instance injecting frequency (number of
                        trees trained between two re-selections)

    returns (model, model_info, best_loss, best_round): the FINAL model and
    the evaluation history of its last batch, plus the best validation loss
    seen and the 1-based boosting round where it occurred.
    '''
    # temp lgbm file
    temp = output_model_file+".tmp"
    
    # get groups and remove instance ids
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values
    atk_valid_groups = atk_valid['instance_id'].value_counts().sort_index().values
    
    # positional index of the categorical features, shifted by one because
    # the instance id column is dropped just below
    cat_fx = np.where(atk_train.columns.isin(cat_fx))[0]
    cat_fx = [int(x) - 1 for x in cat_fx]

    # prepare data (avoiding pandas)
    atk_data   = atk_train.iloc[:,1:].values
    atk_valid  = atk_valid.iloc[:,1:].values

    # the first row of every group is the original (unattacked) instance
    original_ids = np.insert(np.cumsum(atk_groups[:-1]), 0, 0)
    original_valid_ids = np.insert(np.cumsum(atk_valid_groups[:-1]), 0, 0)
    
    # train first trees on the original instances only
    model, model_info = extend_adv_boosting_model(atk_data[original_ids, :], 
                                                  atk_valid[original_valid_ids, :],
                                                  cat_fx=cat_fx,
                                                  input_model=None, 
                                                  num_trees=adv_rounds, 
                                                  params=params)
    
    # the history holds one entry per round of the batch just trained, so the
    # best round is located inside the batch instead of assuming one tree
    valid_curve = model_info['valid']['avg_binary_log_loss']
    best_loss = np.min(valid_curve)
    best_round = int(np.argmin(valid_curve)) + 1
        
    # train remaining trees
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # attack dataset
        adv_data, _       = gen_adv_boosting_data(model, atk_data, atk_groups)
        adv_valid_data, _ = gen_adv_boosting_data(model, atk_valid, atk_valid_groups)
        
        # train additional trees, restarting from the model saved on disk
        model.save_model(temp)
        model, model_info = extend_adv_boosting_model(adv_data, 
                                                      adv_valid_data,
                                                      cat_fx=cat_fx,
                                                      input_model=temp, 
                                                      num_trees=adv_rounds, 
                                                      params=params)

        valid_curve = model_info['valid']['avg_binary_log_loss']
        batch_loss = np.min(valid_curve)
        if batch_loss < best_loss:
            best_loss  = batch_loss
            # t-1 trees existed before this batch; equals t when adv_rounds == 1
            best_round = (t - 1) + int(np.argmin(valid_curve)) + 1
        
        # save partial model
        if t % partial_save == 0 and t != trees:
            partial_filename = "{:s}_T{:d}-of-{:d}_S{:04d}_L{:d}.model.tmp".format(output_model_file, 
                                                                                   t, 
                                                                                   trees, 
                                                                                   int(params['learning_rate'] * 1000),
                                                                                   params['num_leaves']
                                                                                  )
            
            print("Save partial model to {}".format(partial_filename))
            model.save_model(filename=partial_filename)
            
    
    return model, model_info, best_loss, best_round

# Training Standard GBDT (_baseline 1_)

In [16]:
def train_gradient_boosting_baseline( train_file, valid_file, test_file,
                                output_model_file):
    """Train the standard GBDT baseline and save the best model per tree budget.

    The data is loaded with ``load_atk_train_valid_test`` and must NOT have
    an ``instance_id`` column (i.e. no attacked instances). For every
    (num_trees, learning_rate, num_leaves) configuration a LightGBM model is
    trained with the ``optimize_log_loss`` objective and evaluated with
    ``avg_log_loss``. For each ``num_trees`` value, the configuration with
    the lowest validation loss is saved to
    ``<output_model_file>_T.._S.._L.._R...model`` and reloaded to re-check
    its validation score.

    Returns a DataFrame with one row per configuration: ``num_trees``,
    ``learning_rate``, ``num_leaves``, ``best_round`` (1-based round with the
    lowest validation loss) and ``avg_binary_log_loss`` (that loss).
    """
    exp_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # rows are collected here and turned into a DataFrame at the end:
    # DataFrame.append was removed in pandas 2.0
    exp_rows = []
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    
    assert "instance_id" not in train.columns.values, "Wrong training set file for GBDT"

    # get index of categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx))[0]
    cat_fx = list([int(x) for x in cat_fx])  
    print ("CatFX:", train.columns.values[cat_fx])
    

    for num_trees in [500]:
        best_model = None
        best_info = None
        best_loss = np.inf
        for learning_rate in [0.1]: #[0.01, 0.05, 0.1]:
            for num_leaves in [16]: #[8, 16, 24]:
                # datasets (last column is the label)
                lgbm_train = lightgbm.Dataset(data=train.iloc[:,:-1].values, 
                                              label=train.iloc[:,-1].values,
                                              categorical_feature = cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid.iloc[:,:-1].values, 
                                              label=valid.iloc[:,-1].values,
                                              categorical_feature = cat_fx)

                # run train
                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves} 
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                            num_boost_round = num_trees,
                                            fobj            = optimize_log_loss, 
                                            feval           = avg_log_loss,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid], 
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 5)
                
                valid_curve = lgbm_info['valid']['avg_binary_log_loss']
                if np.min(valid_curve) < best_loss:
                    best_model = lgbm_model
                    best_info = lgbm_info
                    best_loss = np.min(valid_curve)
                    # remember the winning configuration alongside its history
                    best_info['num_trees'] = num_trees
                    best_info['learning_rate'] = learning_rate
                    best_info['num_leaves'] = num_leaves
                    
                    
                best_valid_iter = int(np.argmin(valid_curve))
                
                # update experimental results
                exp_rows.append({'num_trees': num_trees, 
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves, 
                                 'best_round': best_valid_iter+1, 
                                 'avg_binary_log_loss': valid_curve[best_valid_iter]})
                
        
        # save file
        best_valid_iter = np.argmin(best_info['valid']['avg_binary_log_loss'])

        model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                        best_info['num_trees'],
                                                                        int(best_info['learning_rate']*1000),
                                                                        best_info['num_leaves'],
                                                                        best_valid_iter + 1
                                                                       )
        
        best_model.save_model(model_file_name)
        print ("Model saved to", model_file_name)
        
        # reload from disk to make sure the saved file reproduces the score
        best_model = lightgbm.Booster(model_file=model_file_name)
        print ("Check valid score:", avg_log_loss(preds=best_model.predict(valid.iloc[:,:-1].values),
                                                  train_data=lgbm_valid))

    
    return pd.DataFrame(exp_rows, columns=exp_columns)

In [88]:
# enable/disable LGBM Baseline
# Switch the literal below to False to skip this cell when re-running the notebook.
if True:
    # Train the standard GBDT on the original census splits; the trainer
    # asserts that these files carry no instance_id column.
    experiments = train_gradient_boosting_baseline("../data/census/train_ori.csv.bz2",
                                                     "../data/census/valid_ori.csv.bz2",
                                                     "../data/census/test_ori.csv.bz2",
                                                     "../out/models/std_gbdt_census")  

    # keep the per-configuration results next to the saved model
    experiments.to_csv('../out/models/std_gbdt_census.csv', index=False)

    print (experiments)

Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']
[5]	train's avg_binary_log_loss: 0.5195	valid's avg_binary_log_loss: 0.520269
[10]	train's avg_binary_log_loss: 0.442694	valid's avg_binary_log_loss: 0.443485
[15]	train's avg_binary_log_loss: 0.404239	valid's avg_binary_log_loss: 0.404565
[20]	train's avg_binary_log_loss: 0.383888	valid's avg_binary_log_loss: 0.384591
[25]	train's avg_binary_log_loss: 0.372059	valid's avg_binary_log_loss: 0.372916
[30]	train's avg_binary_log_loss: 0.364054	valid's avg_binary_log_loss: 0.365226
[35]	train's avg_binary_log_loss: 0.359023	valid's avg_binary_log_loss: 0.360431
[40]	train's avg_binary_log_loss: 0.354723	valid's avg_binary_log_loss: 0.35629
[45]	train's avg_binary_log_loss: 0.346831	valid's avg_binary_log_loss: 0.349523
[50]	train's avg_binary_log_loss: 0.332285	valid's avg_binary_log_loss: 0.336143
[55]	train's avg_binary_log_loss: 0.326506	valid's avg_binary_log_

# Training Adversarial Boosting (_baseline 2_)

In [17]:
def train_adversarial_boosting(train_file, valid_file, test_file, output_model_file):
    """Train the adversarial-boosting baseline and save the resulting model.

    The data is loaded with ``load_atk_train_valid_test`` and must have an
    ``instance_id`` column (attacked instances grouped by original instance).
    For every (num_trees, learning_rate, num_leaves) configuration the model
    is trained with ``AdvBoosting`` and its validation log loss is printed
    twice: on the first row of every group ("without attacks") and on all
    rows ("with attacks").

    Returns a DataFrame with one row per configuration: ``num_trees``,
    ``learning_rate``, ``num_leaves``, ``best_round`` and
    ``avg_binary_log_loss`` (the best validation loss reported by
    ``AdvBoosting``).
    """
    exp_columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # rows are collected here and turned into a DataFrame at the end:
    # DataFrame.append was removed in pandas 2.0
    exp_rows = []
    
    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    assert "instance_id" in train.columns.values, "Wrong training set file for GBDT"

    # validation matrices are the same for every configuration
    atk_valid_groups = valid['instance_id'].value_counts().sort_index().values

    # Positional indices of the categorical columns, shifted by one because
    # the first column (instance id) is dropped. They are kept in their OWN
    # variable: cat_fx must keep holding column NAMES, since AdvBoosting
    # resolves them again for every configuration of the grid.
    valid_cat_idx = np.where(valid.columns.isin(cat_fx))[0]
    valid_cat_idx = [int(x) - 1 for x in valid_cat_idx]

    atk_valid = valid.iloc[:,1:].values

    # the first row of every group is the original (unattacked) instance
    original_valid_ids = np.insert(np.cumsum(atk_valid_groups[:-1]), 0, 0)

    for num_trees in [200]:
        for learning_rate in [0.1]: #[0.01, 0.05, 0.1]:
            for num_leaves in [16]: #[8, 16, 24]:
                      
                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves} 
                
                lgbm_model, lgbm_info, best_loss, best_valid_iter = AdvBoosting(train,
                                                    valid,
                                                    trees=num_trees, 
                                                    cat_fx = cat_fx, 
                                                    output_model_file=output_model_file, 
                                                    adv_rounds=1,
                                                    params=lgbm_params)
                

                lgbm_valid = lightgbm.Dataset(data=atk_valid[original_valid_ids,:-1], 
                                              label=atk_valid[original_valid_ids,-1],
                                              categorical_feature = valid_cat_idx)
                
                
                lgbm_valid_att = lightgbm.Dataset(data=atk_valid[:,:-1], 
                                              label=atk_valid[:,-1],
                                              categorical_feature = valid_cat_idx)
                
                print ("Check valid score without attacks:", avg_log_loss(preds=lgbm_model.predict(atk_valid[original_valid_ids,:-1]),
                                                  train_data=lgbm_valid))
                
                print ("Check valid score with attacks:", avg_log_loss(preds=lgbm_model.predict(atk_valid[:,:-1]),
                                                                       train_data=lgbm_valid_att))

                
                # update experimental results
                exp_rows.append({'num_trees': num_trees, 
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves, 
                                 'best_round': best_valid_iter, 
                                 'avg_binary_log_loss': best_loss})
        
        # save file
        # NOTE(review): this runs once per num_trees and therefore stores the
        # model of the LAST (learning_rate, num_leaves) pair, not the best one.
        model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                                num_trees,
                                                                                int(learning_rate*1000),
                                                                                num_leaves,
                                                                                best_valid_iter
                                                                               )
        lgbm_model.save_model(model_file_name)
        print ("Model saved to", model_file_name)
                
    return pd.DataFrame(exp_rows, columns=exp_columns)

In [None]:
# enable/disable
# Switch the literal below to False to skip this cell when re-running the notebook.
if True:
    # B selects which attacked census files (train_B<B>, valid_B<B>, test_B<B>) are used
    for B in [150]: #[5, 15, 150, 300]:

        experiments = train_adversarial_boosting ( "../data/census/train_B{:d}.csv.bz2".format(B),
                                                   "../data/census/valid_B{:d}.csv.bz2".format(B),
                                                   "../data/census/test_B{:d}.csv.bz2".format(B),
                                                   "../out/models/adv_boosting_census_B{:d}".format(B))  

        # keep the per-configuration results next to the saved model
        experiments.to_csv('../out/models/adv_boosting_census_B{:d}.csv'.format(B), index=False)

        print (experiments)

    Check valid score without attacks: ('avg_binary_log_loss', 0.3237495543276321, False)
    Check valid score with attacks: ('avg_binary_log_loss', 0.03632538905070899, False)
    Model saved to ../out/models/adv_boosting_census_B150_T200_S0100_L16_R200.model
       num_trees  learning_rate  num_leaves  best_round  avg_binary_log_loss
    0      200.0            0.1        16.0       200.0             0.224304

# Training Non-Interferent GBDT

In [24]:
def extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                            alpha=1.0, num_trees=1, params=None):
    ''' 
    Grow a LightGBM model one tree at a time, re-weighting instances at every step.

    The first tree is fit with optimize_log_loss on the first row of every
    instance_id group. Each following tree is fit on those same rows with
    optimize_non_interferent_log_loss(alpha=alpha); the weight of a row is
    1 / max(1 + exp(-pred * label)), the max being taken over all the rows
    of its group as scored by the current model.

    atk_train : pandas DataFrame with an 'instance_id' column. The first column
                is dropped as the instance id and the last column is the label.
                Assumes the rows of a group are contiguous, groups appear in
                increasing instance_id order and the first row of a group is
                the un-attacked instance -- TODO confirm against the loader.
    atk_valid : validation DataFrame with the same layout as atk_train.
    cat_fx    : names of the categorical columns; None or empty means none.
    alpha     : forwarded to optimize_non_interferent_log_loss.
    num_trees : total number of trees (boosting rounds) to build.
    params    : LightGBM parameters dictionary.

    returns (lgbm_model, lgbm_info) where lgbm_info['train'|'valid']
    ['avg_binary_log_loss'] holds one value per tree built.
    '''

    assert atk_train.shape[1]==atk_valid.shape[1], "Train/Valid Mismatch!"

    # The previous code replaced a missing/empty cat_fx with the string "auto"
    # and then handed it to Index.isin, which only accepts list-likes.
    # Resolve the names first and fall back to "auto" only for LightGBM.
    if cat_fx is None:
        cat_fx = []

    # number of rows of each instance, ordered by instance id
    train_groups = atk_train['instance_id'].value_counts().sort_index().values
    valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # position of the first row of each group
    original_train_ids = np.insert(np.cumsum(train_groups[:-1]), 0, 0)
    original_valid_ids = np.insert(np.cumsum(valid_groups[:-1]), 0, 0)

    # get index of categorical features 
    cat_idx = [int(x) for x in np.where(atk_train.columns.isin(cat_fx))[0]]
    print ("CatFX:", atk_train.columns.values[cat_idx])

    # remove instance id (first column), hence shift the categorical indexes
    atk_train = atk_train.iloc[:,1:].values
    atk_valid = atk_valid.iloc[:,1:].values
    cat_idx = [x - 1 for x in cat_idx]
    lgbm_cat_fx = cat_idx if cat_idx else "auto"

    unatk_train = atk_train[original_train_ids,:]
    unatk_valid = atk_valid[original_valid_ids,:]

    def get_ni_w(preds, labels, groups):
        # one weight per group: 1 / max_i (1 + exp(-pred_i * label_i))
        w = np.ones(len(groups))

        offset = 0
        for instance_id, g in enumerate(groups):
            exp_pl = np.exp(- preds[offset:offset+g] * labels[offset:offset+g])
            w[instance_id] = 1.0 / np.max(1.0 + exp_pl)
            offset += g  

        return w

    def make_dataset(data, weight=None):
        # last column is the label, everything before it is a feature
        return lightgbm.Dataset(data=data[:,:-1], 
                                label=data[:,-1],
                                weight=weight,
                                categorical_feature = lgbm_cat_fx)

    # -------------------------
    # train first iteration
    lgbm_train = make_dataset(unatk_train)
    lgbm_valid = make_dataset(unatk_valid)

    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train, 
                                num_boost_round = 1,
                                fobj  = optimize_log_loss, 
                                feval = avg_log_loss,
                                evals_result = lgbm_info,
                                valid_sets   = [lgbm_train, lgbm_valid], 
                                valid_names  = ['train', 'valid'],
                                verbose_eval=5)

    # -------------------------
    # train other iterations
    metric_key = "avg_binary_log_loss"
    for _ in range (1, num_trees):

        # score every row (attacks included) with the current model
        train_weights = get_ni_w(lgbm_model.predict(atk_train[:,:-1]),
                                 atk_train[:,-1], 
                                 train_groups)
        valid_weights = get_ni_w(lgbm_model.predict(atk_valid[:,:-1]),
                                 atk_valid[:,-1], 
                                 valid_groups)

        # prepare data and train one more tree on top of the current model
        lgbm_train = make_dataset(unatk_train, weight=train_weights)
        lgbm_valid = make_dataset(unatk_valid, weight=valid_weights)

        new_lgbm_info = {}
        lgbm_model = lightgbm.train(params, lgbm_train, 
                                    num_boost_round = 1, 
                                    init_model = lgbm_model,
                                    fobj  = functools.partial(optimize_non_interferent_log_loss, alpha=alpha), 
                                    feval = avg_log_loss,
                                    evals_result = new_lgbm_info,
                                    valid_sets   = [lgbm_train, lgbm_valid], 
                                    valid_names  = ['train', 'valid'],
                                    verbose_eval=5)

        # every call reports only its own round: stitch the histories together
        lgbm_info['train'][metric_key] += new_lgbm_info['train'][metric_key]
        lgbm_info['valid'][metric_key] += new_lgbm_info['valid'][metric_key]

    return lgbm_model, lgbm_info

In [37]:
def train_non_interferent(train_file, valid_file, test_file, output_model_file):
    '''
    Grid-search the Non-Interferent GBDT and save the best model of each alpha.

    train_file, valid_file, test_file : dataset files handed to 
        load_atk_train_valid_test (the test split is loaded but not used here).
    output_model_file : prefix of the saved model file; number of trees,
        learning rate, leaves, alpha and best round are appended to it.

    returns a DataFrame with one row per trained configuration. Note that the
    'avg_non_interferent_log_loss' column is filled with the
    'avg_binary_log_loss' reported by extend_non_interf_model.
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'alpha', 
               'best_round', 'avg_non_interferent_log_loss']
    # rows are collected in a list: DataFrame.append was removed in pandas 2.0
    rows = []

    # load train/valid/test
    atk_train, atk_valid, atk_test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # name of the metric stored in the evaluation history
    metric_key = "avg_binary_log_loss"

    for num_trees in [200]:
        for alpha in [0.5]: #[0.25, 0.50, 0.75, 1.00]:
            best_model = None
            best_cfg = None
            best_loss = np.inf

            for learning_rate in [0.1]: #[0.01, 0.05, 0.1]:
                for num_leaves in [24]: #[8, 16, 24, 32]:

                    lgbm_params = { 'learning_rate': learning_rate, 
                                    'num_leaves': num_leaves} 
                    lgbm_model, lgbm_info = extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                                alpha=alpha, num_trees=num_trees, params=lgbm_params)

                    valid_losses = lgbm_info['valid'][metric_key]
                    best_valid_iter = int(np.argmin(valid_losses))
                    cur_loss = valid_losses[best_valid_iter]

                    # update experimental results (rounds are reported 1-based)
                    rows.append({'num_trees': num_trees, 
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves, 
                                 'alpha': alpha,
                                 'best_round': best_valid_iter + 1, 
                                 'avg_non_interferent_log_loss': cur_loss})

                    # keep the selection data apart from lgbm_info instead of
                    # writing extra keys into the evaluation history
                    if cur_loss < best_loss:
                        best_loss = cur_loss
                        best_model = lgbm_model
                        best_cfg = {'num_trees': num_trees,
                                    'learning_rate': learning_rate,
                                    'num_leaves': num_leaves,
                                    'best_round': best_valid_iter + 1}

            # save file: once per alpha, after the whole grid has been explored
            if best_model is None:
                # e.g. every validation loss was NaN: there is nothing to save
                print ("No valid model found for alpha", alpha)
                continue

            model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(output_model_file,
                                                                                best_cfg['num_trees'],
                                                                                int(best_cfg['learning_rate']*1000),
                                                                                best_cfg['num_leaves'],
                                                                                int(alpha * 100),
                                                                                best_cfg['best_round']
                                                                               )

            best_model.save_model(model_file_name)
            print ("Model saved to", model_file_name)

    return pd.DataFrame(rows, columns=columns)

In [38]:
# Non-Interferent GBDT driver. Switch the guard to False to skip this cell.
if True:
    # B is the value embedded in the dataset file names (presumably the
    # attack budget -- TODO confirm). The full sweep is [5, 15, 150, 300].
    for B in [150]:

        # Arguments, in order: train file, valid file, test file, model-file prefix.
        experiments = train_non_interferent(*[
            template.format(B) for template in (
                "../data/census/train_B{:d}.csv.bz2",
                "../data/census/valid_B{:d}.csv.bz2",
                "../data/census/test_B{:d}.csv.bz2",
                "../out/models/non_interferent_census_B{:d}",
            )
        ])

        # Store the per-configuration results table next to the models.
        results_csv = '../out/models/non_interferent_census_B{:d}.csv'
        experiments.to_csv(results_csv.format(B), index=False)

        print (experiments)

Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']
[5]	train's avg_binary_log_loss: 0.515172	valid's avg_binary_log_loss: 0.516217
[10]	train's avg_binary_log_loss: 0.437	valid's avg_binary_log_loss: 0.438303
[15]	train's avg_binary_log_loss: 0.383875	valid's avg_binary_log_loss: 0.386024
[20]	train's avg_binary_log_loss: 0.349702	valid's avg_binary_log_loss: 0.353112
[25]	train's avg_binary_log_loss: 0.328831	valid's avg_binary_log_loss: 0.333189
[30]	train's avg_binary_log_loss: 0.314788	valid's avg_binary_log_loss: 0.319582
[35]	train's avg_binary_log_loss: 0.30553	valid's avg_binary_log_loss: 0.311086
[40]	train's avg_binary_log_loss: 0.298597	valid's avg_binary_log_loss: 0.304944
[45]	train's avg_binary_log_loss: 0.293196	valid's avg_binary_log_loss: 0.300221
[50]	train's avg_binary_log_loss: 0.289252	valid's avg_binary_log_loss: 0.297562
[55]	train's avg_binary_log_loss: 0.28556	valid's avg_binary_log_lo

# Adversarial Boosting with Our Cost function

In [136]:
def NIBoosting(atk_train, atk_valid, trees, 
                 cat_fx,
                 params,
                 output_model_file,
                 partial_save=1000, 
                 adv_rounds=1,
                 alpha = 1.0):
    ''' 
    Adversarial boosting driven by the non-interferent objective.

    The first adv_rounds trees are fit with optimize_log_loss on the first row
    of every instance_id group. After that, every adv_rounds trees the data is
    regenerated with gen_adv_boosting_data against the current model and
    training continues with optimize_non_interferent_log_loss_claudio.

    atk_train : pandas DataFrame with an 'instance_id' column. The first column
                is dropped as the instance id and the last column is the label.
                Assumes the rows of a group are contiguous, groups appear in
                increasing instance_id order and the first row of a group is
                the un-attacked instance -- TODO confirm against the loader.
    atk_valid : validation DataFrame with the same layout as atk_train.
    trees : total number of trees to be produced
    cat_fx : names of the categorical columns
    params : LightGBM parameters dictionary
    output_model_file : not used by this function (kept for the callers)
    partial_save : not used by this function (kept for the callers)
    adv_rounds: adversarial instance injecting frequency
    alpha : forwarded to optimize_non_interferent_log_loss_claudio

    returns (lgbm_model, lgbm_info) where lgbm_info['train'|'valid']
    ['avg_binary_log_loss'] holds the whole evaluation history.
    '''
    # number of rows of each instance, ordered by instance id
    atk_groups       = atk_train['instance_id'].value_counts().sort_index().values
    atk_valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # get index of categorical features; the shift by one accounts for the
    # instance id column dropped below
    cat_fx = np.where(atk_train.columns.isin(cat_fx))[0]
    cat_fx = [int(x) - 1 for x in cat_fx]

    # prepare data (avoiding pandas)
    atk_train  = atk_train.iloc[:,1:].values
    atk_valid  = atk_valid.iloc[:,1:].values

    # position of the first row of each group
    original_ids = np.insert(np.cumsum(atk_groups[:-1]), 0, 0)
    original_valid_ids = np.insert(np.cumsum(atk_valid_groups[:-1]), 0, 0)

    # -------------------------
    # train first iteration
    lgbm_train = lightgbm.Dataset(data=atk_train[original_ids,:-1], 
                                  label=atk_train[original_ids,-1],
                                  categorical_feature = cat_fx)

    lgbm_valid = lightgbm.Dataset(data=atk_valid[original_valid_ids,:-1], 
                                  label=atk_valid[original_valid_ids,-1],
                                  categorical_feature = cat_fx)

    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train, 
                                num_boost_round = adv_rounds,
                                fobj  = optimize_log_loss, 
                                feval = avg_log_loss,
                                evals_result = lgbm_info,
                                valid_sets   = [lgbm_train, lgbm_valid], 
                                valid_names  = ['train', 'valid'],
                                verbose_eval=5)

    # name of the metric stored in the evaluation history
    metric_key = "avg_binary_log_loss"

    # train remaining trees; t is the (1-based) index of the first tree of the chunk
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # do not exceed the requested total when adv_rounds does not divide trees
        chunk_rounds = min(adv_rounds, trees - t + 1)

        # attack dataset
        adv_train, adv_train_groups = gen_adv_boosting_data(lgbm_model, atk_train, atk_groups, num_atks=5)
        adv_valid, adv_valid_groups = gen_adv_boosting_data(lgbm_model, atk_valid, atk_valid_groups, num_atks=5)

        # prepare data and train
        lgbm_train = lightgbm.Dataset(data=adv_train[:,:-1], 
                                      label=adv_train[:,-1],
                                      categorical_feature = cat_fx)

        lgbm_valid = lightgbm.Dataset(data=adv_valid[:,:-1], 
                                      label=adv_valid[:,-1],
                                      categorical_feature = cat_fx)

        new_lgbm_info = {}
        lgbm_model = lightgbm.train(params, lgbm_train, 
                                    num_boost_round = chunk_rounds, 
                                    init_model = lgbm_model,
                                    fobj  = functools.partial(optimize_non_interferent_log_loss_claudio, alpha=alpha), 
                                    feval = avg_log_loss,
                                    evals_result = new_lgbm_info,
                                    valid_sets   = [lgbm_train, lgbm_valid], 
                                    valid_names  = ['train', 'valid'],
                                    verbose_eval=5)

        # every call reports only its own rounds: stitch the histories together
        lgbm_info['train'][metric_key] += new_lgbm_info['train'][metric_key]
        lgbm_info['valid'][metric_key] += new_lgbm_info['valid'][metric_key]

    return lgbm_model, lgbm_info

In [139]:
def train_claudio(train_file, valid_file, test_file, output_model_file):
    '''
    Grid-search NIBoosting and save the best model of each alpha.

    train_file, valid_file, test_file : dataset files handed to 
        load_atk_train_valid_test; the training set must contain the
        'instance_id' column (the test split is loaded but not used here).
    output_model_file : prefix of the saved model file; "_claudio", number of
        trees, learning rate, leaves, alpha and best round are appended to it.

    returns a DataFrame with one row per trained configuration. 'best_round'
    is 1-based, like the round written in the model file name.
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # rows are collected in a list: DataFrame.append was removed in pandas 2.0
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    assert "instance_id" in train.columns.values, "Wrong training set file for GBDT"

    # name of the metric stored in the evaluation history
    metric_key = "avg_binary_log_loss"

    for num_trees in [200]:
        for alpha in [0.5]: #[0.25, 0.50, 0.75, 1.00]:
            best_model = None
            best_cfg = None
            best_loss = np.inf

            for learning_rate in [1.0]: #[0.01, 0.05, 0.1]:
                for num_leaves in [16]: #[8, 16, 24]:

                    lgbm_params = { 'learning_rate': learning_rate, 
                                    'num_leaves': num_leaves} 

                    lgbm_model, lgbm_info = NIBoosting(train,
                                                        valid,
                                                        trees=num_trees, 
                                                        cat_fx = cat_fx, 
                                                        output_model_file=output_model_file, 
                                                        adv_rounds=1,
                                                        params=lgbm_params,
                                                        alpha=alpha)

                    valid_losses = lgbm_info['valid'][metric_key]
                    best_valid_iter = int(np.argmin(valid_losses))
                    cur_loss = valid_losses[best_valid_iter]

                    # update experimental results; the round used to be stored
                    # 0-based here while the file name below was 1-based
                    rows.append({'num_trees': num_trees, 
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves, 
                                 'best_round': best_valid_iter + 1, 
                                 metric_key: cur_loss})

                    # keep the selection data apart from lgbm_info instead of
                    # writing extra keys into the evaluation history
                    if cur_loss < best_loss:
                        best_loss = cur_loss
                        best_model = lgbm_model
                        best_cfg = {'num_trees': num_trees,
                                    'learning_rate': learning_rate,
                                    'num_leaves': num_leaves,
                                    'best_round': best_valid_iter + 1}

            # save file
            if best_model is None:
                # e.g. every validation loss was NaN: there is nothing to save
                print ("No valid model found for alpha", alpha)
                continue

            model_file_name = "{:s}_claudio_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(output_model_file,
                                                                                best_cfg['num_trees'],
                                                                                int(best_cfg['learning_rate']*1000),
                                                                                best_cfg['num_leaves'],
                                                                                int(alpha * 100),
                                                                                best_cfg['best_round']
                                                                               )

            best_model.save_model(model_file_name)
            print ("Model saved to", model_file_name)

    return pd.DataFrame(rows, columns=columns)

In [140]:
# Driver for adversarial boosting with the non-interferent cost function.
# Switch the guard to False to skip this cell.
if True:
    # B is the value embedded in the dataset file names (presumably the
    # attack budget -- TODO confirm). The full sweep is [5, 15, 150, 300].
    for B in [150]:

        # The model prefix is shared with the adversarial boosting baseline:
        # train_claudio adds "_claudio" to the model file names on its own.
        experiments = train_claudio ( "../data/census/train_B{:d}.csv.bz2".format(B),
                                      "../data/census/valid_B{:d}.csv.bz2".format(B),
                                      "../data/census/test_B{:d}.csv.bz2".format(B),
                                      "../out/models/adv_boosting_census_B{:d}".format(B))  

        # This table used to be written to adv_boosting_census_B{B}.csv, the
        # very same file produced by the adversarial boosting baseline cell,
        # silently overwriting its results. Use a "_claudio" name instead,
        # mirroring the model file names.
        experiments.to_csv('../out/models/adv_boosting_census_B{:d}_claudio.csv'.format(B), index=False)

        print (experiments)

Loading pre-processed files...
[5]	train's avg_binary_log_loss: 0.13994	valid's avg_binary_log_loss: 0.141381
[10]	train's avg_binary_log_loss: 0.137907	valid's avg_binary_log_loss: 0.140562
[15]	train's avg_binary_log_loss: 0.13639	valid's avg_binary_log_loss: 0.139487
[20]	train's avg_binary_log_loss: 0.134786	valid's avg_binary_log_loss: 0.139052
[25]	train's avg_binary_log_loss: 0.133823	valid's avg_binary_log_loss: 0.138367
[30]	train's avg_binary_log_loss: 0.133116	valid's avg_binary_log_loss: 0.138354
[35]	train's avg_binary_log_loss: 0.132402	valid's avg_binary_log_loss: 0.137978
[40]	train's avg_binary_log_loss: 0.117862	valid's avg_binary_log_loss: 0.124742
[45]	train's avg_binary_log_loss: 0.117203	valid's avg_binary_log_loss: 0.124434
[50]	train's avg_binary_log_loss: 0.116629	valid's avg_binary_log_loss: 0.123937
[55]	train's avg_binary_log_loss: 0.105531	valid's avg_binary_log_loss: 0.113309
[60]	train's avg_binary_log_loss: 0.104778	valid's avg_binary_log_loss: 0.112722


# Unit Tests & Old implementation

In [68]:
def train_non_interferent(train_file, valid_file, test_file, output_model_file):
    '''
    Old implementation: train the Non-Interferent GBDT in a single 
    lightgbm.train call on the grouped (attacked) dataset.

    NOTE(review): this definition has the same name as the newer 
    train_non_interferent defined earlier in this notebook and replaces it
    once the cell is run. The output recorded for the cell calling this
    version ends with "TypeError: unsupported operand type(s) for *:
    'NoneType' and 'float'"; the failing line is not part of that output.

    train_file, valid_file, test_file : dataset files handed to 
        load_atk_train_valid_test; train and valid must contain the
        'instance_id' column (the test split is loaded but not used here).
    output_model_file : prefix of the saved model file; number of trees,
        learning rate, leaves, alpha and best round are appended to it.

    returns a DataFrame with one row per trained configuration.
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'alpha', 
               'best_round', 'avg_non_interferent_log_loss']
    # rows are collected in a list: DataFrame.append was removed in pandas 2.0
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # number of rows of each instance, ordered by instance id
    train_groups = train['instance_id'].value_counts().sort_index().values
    valid_groups = valid['instance_id'].value_counts().sort_index().values

    # get index of categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx))[0]
    cat_fx = list([int(x) for x in cat_fx])
    print ("CatFX:", train.columns.values[cat_fx])

    # prepare data: drop the first column (instance id), hence shift the
    # categorical indexes by one
    train = train.iloc[:,1:]
    valid = valid.iloc[:,1:]
    cat_fx = [x - 1 for x in cat_fx]

    for num_trees in [100]:
        for alpha in [1.0]: #[0.25, 0.50, 0.75, 1.00]:
            best_model = None
            best_cfg = None
            best_loss = np.inf
            # name of the metric stored in the evaluation history
            metric_key = "avg_non_interferent_log_loss" + " [alpha={:.2f}]".format(alpha)

            for learning_rate in [0.1]: #[0.01, 0.05, 0.1]:
                for num_leaves in [16]: #[8, 16, 24, 32]:
                    # datasets: last column is the label, group sizes are attached
                    lgbm_train = lightgbm.Dataset(data=train.iloc[:,:-1].values,
                                                  label=train.iloc[:,-1].values,
                                                  group=train_groups,
                                                  categorical_feature = cat_fx)

                    lgbm_valid = lightgbm.Dataset(data=valid.iloc[:,:-1].values,
                                                  label=valid.iloc[:,-1].values,
                                                  group=valid_groups,
                                                  categorical_feature = cat_fx)

                    # run train
                    lgbm_params = { 'learning_rate': learning_rate, 
                                    'num_leaves': num_leaves} 
                    lgbm_info = {}
                    lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                                num_boost_round = num_trees,
                                                fobj            = lambda x,y: optimize_non_interferent_log_loss(x,y,alpha=alpha),
                                                feval           = lambda x,y: avg_non_interferent_log_loss(x,y,alpha=alpha),
                                                evals_result    = lgbm_info,
                                                valid_sets      = [lgbm_train, lgbm_valid], 
                                                valid_names     = ['train', 'valid'],
                                                verbose_eval    = 5)

                    valid_losses = lgbm_info['valid'][metric_key]
                    best_valid_iter = int(np.argmin(valid_losses))
                    cur_loss = valid_losses[best_valid_iter]

                    # update experimental results (rounds are reported 1-based)
                    rows.append({'num_trees': num_trees, 
                                 'learning_rate': learning_rate,
                                 'num_leaves': num_leaves, 
                                 'alpha': alpha,
                                 'best_round': best_valid_iter + 1, 
                                 'avg_non_interferent_log_loss': cur_loss})

                    # keep the selection data apart from lgbm_info instead of
                    # writing extra keys into the evaluation history
                    if cur_loss < best_loss:
                        best_loss = cur_loss
                        best_model = lgbm_model
                        best_cfg = {'num_trees': num_trees,
                                    'learning_rate': learning_rate,
                                    'num_leaves': num_leaves,
                                    'best_round': best_valid_iter + 1}

            # save file
            if best_model is None:
                # e.g. every validation loss was NaN: there is nothing to save
                print ("No valid model found for alpha", alpha)
                continue

            model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(output_model_file,
                                                                                best_cfg['num_trees'],
                                                                                int(best_cfg['learning_rate']*1000),
                                                                                best_cfg['num_leaves'],
                                                                                int(alpha * 100),
                                                                                best_cfg['best_round']
                                                                               )

            best_model.save_model(model_file_name)
            print ("Model saved to", model_file_name)

    return pd.DataFrame(rows, columns=columns)

In [69]:
# Driver for the old train_non_interferent implementation defined in this
# section. Switch the guard to False to skip this cell.
# NOTE(review): it uses the same model prefix and the same results CSV as the
# driver of the newer implementation earlier in the notebook -- confirm that
# overwriting those files is intended before re-enabling a full run.
if True:
    # B is the value embedded in the dataset file names (presumably the
    # attack budget -- TODO confirm). The full sweep is [5, 15, 150, 300].
    for B in [150]:

        # Arguments, in order: train file, valid file, test file, model-file prefix.
        experiments = train_non_interferent(*[
            template.format(B) for template in (
                "../data/census/train_B{:d}.csv.bz2",
                "../data/census/valid_B{:d}.csv.bz2",
                "../data/census/test_B{:d}.csv.bz2",
                "../out/models/non_interferent_census_B{:d}",
            )
        ])

        # Store the per-configuration results table next to the models.
        results_csv = '../out/models/non_interferent_census_B{:d}.csv'
        experiments.to_csv(results_csv.format(B), index=False)

        print (experiments)

Loading pre-processed files...
CatFX: ['workclass' 'marital_status' 'occupation' 'relationship' 'race' 'sex'
 'native_country']


TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

## Check groups are the same after re-splitting.

In [None]:
def check_groups(atk_train_file, atk_valid_file, atk_test_file):
    '''
    Sanity check: the data returned by load_atk_train_valid_test must contain
    the same number of rows and the same sequence of group sizes as the raw
    CSV files (train, valid and test concatenated in this order).

    Prints the compared figures and raises AssertionError on any mismatch.
    '''
    def sizes_of_groups(frame):
        # number of rows of each instance, ordered by instance id
        return frame['instance_id'].value_counts().sort_index().values

    # data as returned by the loader (post re-splitting)
    post_train, post_valid, post_test, _ = load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file)
    post_splits = (post_train, post_valid, post_test)
    post_groups = [sizes_of_groups(split) for split in post_splits]

    # raw files (pre re-splitting)
    pre_splits = [pd.read_csv(path) for path in (atk_train_file, atk_valid_file, atk_test_file)]
    pre_groups = [sizes_of_groups(split) for split in pre_splits]

    # check global length
    pre_rows  = sum(len(split) for split in pre_splits)
    post_rows = sum(len(split) for split in post_splits)
    print ("PRE  TOTAL instances:", pre_rows)
    print ("POST TOTAL instances:", post_rows)

    assert pre_rows == post_rows, "Different number of instances !"

    # check groups
    print ("PRE  lengths in groups:", *[len(g) for g in pre_groups])
    print ("POST lengths in groups:", *[len(g) for g in post_groups])
    pre_all_groups  = np.concatenate(pre_groups)
    post_all_groups = np.concatenate(post_groups)
    print ("PRE  TOTAL groups:", len(pre_all_groups))
    print ("POST TOTAL groups:", len(post_all_groups))

    assert len(pre_all_groups)==len(post_all_groups), "Different number of groups!"

    # compare values, group by group
    same_sizes = (pre_all_groups==post_all_groups).all()
    assert same_sizes, "Groups have different sizes!"

# Run the group check on the B=5 datasets; B is reset afterwards so that the
# value does not linger for the following cells.
B=5
check_groups(*[template.format(B) for template in ("../data/census/train_B{:d}.csv.bz2",
                                                   "../data/census/valid_B{:d}.csv.bz2",
                                                   "../data/census/test_B{:d}.csv.bz2")])
B=None

## Check extraction of non-attacked instances

In [None]:
def get_nonatk_ids(atk_data):
    '''
    Return the row position of the first row of every instance_id group.

    atk_data : pandas DataFrame with an 'instance_id' column. The positions
               are cumulative group sizes taken in increasing instance_id
               order, so they are right only when the rows of a group are
               contiguous and the groups are stored in that same order.

    returns a numpy array of positions (empty when atk_data has no rows).
    '''
    # number of rows of each instance, ordered by instance id
    atk_groups = atk_data['instance_id'].value_counts().sort_index().values

    # without this guard an empty dataset would yield [0], i.e. the position
    # of a row that does not exist
    if len(atk_groups) == 0:
        return np.array([], dtype=np.int64)

    # a group starts where all the previous ones end; the first starts at 0
    original_ids = np.cumsum(atk_groups[:-1])
    original_ids = np.insert(original_ids, 0, 0)

    return original_ids
    
def check_nonatk_filter(atk_train_file, atk_valid_file, atk_test_file,
                        train_file, valid_file, test_file):
    '''
    Sanity check: keeping only the rows selected by get_nonatk_ids from the
    attacked datasets (first column dropped) must give exactly the data
    loaded from train_file / valid_file / test_file.

    Prints the shapes involved and raises AssertionError on any mismatch.
    '''
    # attacked datasets
    atk_train, atk_valid, atk_test, _ = load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file)
    attacked = (atk_train, atk_valid, atk_test)

    # datasets to compare against
    train, valid, test, _ = load_atk_train_valid_test(train_file, valid_file, test_file)
    originals = (train, valid, test)

    # keep the first row of every group and skip the first column (instance id)
    filtered = [frame.iloc[get_nonatk_ids(frame),1:] for frame in attacked]

    print ("Attacked shapes", *[frame.shape for frame in attacked])
    print ("Filteres shapes", *[frame.shape for frame in filtered])
    print ("Original shapes", *[frame.shape for frame in originals])

    # train, valid and test are compared in this order
    for original, kept in zip(originals, filtered):
        assert np.array_equal(original.values,kept.values), "Different Data !"



# Compare the B=5 attacked datasets against the "_ori" files; B is reset
# afterwards so that the value does not linger for the following cells.
B=5
atk_files = [template.format(B) for template in ("../data/census/train_B{:d}.csv.bz2",
                                                 "../data/census/valid_B{:d}.csv.bz2",
                                                 "../data/census/test_B{:d}.csv.bz2")]
# these names contain no placeholder, so there is nothing to format
check_nonatk_filter(atk_files[0], atk_files[1], atk_files[2],
                    "../data/census/train_ori.csv.bz2",
                    "../data/census/valid_ori.csv.bz2",
                    "../data/census/test_ori.csv.bz2")
B=None

# Training SVM

In [None]:
def train_svm(train_file, valid_file, test_file, output_model_file):
    '''
    Train one RBF-kernel SVM for every value of C and pickle each of them.

    train_file, valid_file, test_file : dataset files handed to 
        load_atk_train_valid_test; the last column is used as label (the test
        split is loaded but not used here).
    output_model_file : prefix of the pickled models; "_C" followed by
        int(C * 1000) on four digits and ".model" are appended to it.

    returns a DataFrame with one row per C and the average binary log loss
    measured on the validation set.
    '''
    # rows are collected in a list: DataFrame.append was removed in pandas 2.0
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    X_train = train.iloc[:,:-1].values
    y_train = train.iloc[:,-1].values
    # map label -1 to 0 on a new array, leaving the loaded DataFrame untouched
    y_train = np.where(y_train == -1, 0, y_train)

    X_valid = valid.iloc[:,:-1].values
    # NOTE(review): validation labels are not remapped like the training ones;
    # confirm this is the encoding binary_log_loss expects
    y_valid = valid.iloc[:,-1].values

    for c in [0.001, 0.01, 0.1, 1.0]:

        # C was never handed to the classifier: every "C" model was the very
        # same default SVC, only saved under a different file name
        model = SVC(C=c, kernel='rbf', probability=True)
        model.fit(X_train, y_train)

        # NOTE(review): column 0 of predict_proba refers to the first entry of
        # model.classes_ (label 0 after the remapping above); confirm this is
        # the score binary_log_loss expects
        y_preds = model.predict_proba(X_valid)[:,0]
        cur_avg_binary_log_loss = np.mean(binary_log_loss(y_preds, y_valid))

        model_file_name = "{:s}_C{:04d}.model".format(output_model_file, int(c * 1000))

        with open(model_file_name, 'wb') as fout:
            pickle.dump(model, fout)

        print ("Model saved to", model_file_name)

        # update experimental results
        rows.append({'C': c, 
                     'avg_binary_log_loss': cur_avg_binary_log_loss})

    return pd.DataFrame(rows, columns=['C', 'avg_binary_log_loss'])

In [None]:
# SVM driver, currently disabled. Switch the guard to True to train the models.
if False:
    # Arguments, in order: train file, valid file, test file, model-file prefix.
    svm_inputs = ("../data/census/train_ori.csv.bz2",
                  "../data/census/valid_ori.csv.bz2",
                  "../data/census/test_ori.csv.bz2",
                  "../out/models/svm_census")
    experiments = train_svm(*svm_inputs)

    # Store the per-C results table next to the models.
    experiments.to_csv('../out/models/svm_census.csv', index=False)

    print (experiments)