# Training Models

This notebook contains the code used for training the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC

# Data Preparation

In [None]:
def label_encode(dataset, categorical_features):
    '''
    Return a copy of `dataset` whose categorical columns are integer-coded.

    dataset              : pandas DataFrame (left untouched)
    categorical_features : collection of column names to encode

    Each listed column is converted to the pandas 'category' dtype and then
    replaced by its category codes stored as int32. Columns that are not
    listed are copied unchanged.
    '''
    encoded = dataset.copy()
    to_encode = [name for name in encoded.columns if name in categorical_features]
    for name in to_encode:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes.astype(np.int32)
    return encoded

def load_atk_train_valid_test( atk_train_file, atk_valid_file, atk_test_file, 
                               train_split=0.6, valid_split=0.2,
                               force=False):
    '''
    Load the train/valid/test CSV files with object-typed columns label-encoded.

    The encoded splits are cached next to the inputs as "<file>.cat.bz2" and
    the list of encoded column names as "<atk_train_file>.cat.json". The
    cache is used when all four files exist and `force` is False; otherwise
    the original files are processed again and the cache is rewritten.

    atk_train_file, atk_valid_file, atk_test_file : paths readable by pd.read_csv
    train_split, valid_split : fractions of the concatenated rows assigned to
                               the train and valid splits (test gets the rest)
    force                    : rebuild the cache even if it already exists

    returns (train, valid, test, cat_fx) where cat_fx is the list of names of
    the columns that were label-encoded
    '''
    source_files = (atk_train_file, atk_valid_file, atk_test_file)
    json_file = atk_train_file+".cat.json"
    cache_files = [atk_train_file+".cat.bz2",
                   atk_valid_file+".cat.bz2",
                   atk_test_file+".cat.bz2",
                   json_file]

    def report(first, second, third):
        # print the shapes of the three splits and their share of the rows
        total = first.shape[0]+second.shape[0]+third.shape[0]
        print ("Train/Valid/Test sizes:", first.shape, second.shape, third.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( first.shape[0]/total,
                            second.shape[0]/total,
                            third.shape[0]/total ) )

    # fast path: everything is already cached
    if not force and all(os.path.exists(path) for path in cache_files):
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(atk_train_file+".cat.bz2")
        valid_cat = pd.read_csv(atk_valid_file+".cat.bz2")
        test_cat  = pd.read_csv(atk_test_file+".cat.bz2")

        with open(json_file, 'r') as fp:
            cat_fx = json.load(fp)

        return train_cat, valid_cat, test_cat, cat_fx

    print ("Pre-processing original files...")

    for path in source_files:
        print ("Loading:", path)

    train = pd.read_csv(atk_train_file)
    valid = pd.read_csv(atk_valid_file)
    test  = pd.read_csv(atk_test_file)

    report(train, valid, test)

    # encode on the concatenation so that codes agree across the three splits
    full = pd.concat( [train, valid, test] )

    # names of the columns pandas loaded with the 'object' dtype
    object_mask = (full.dtypes=='object')
    cat_fx = list( full.columns.values[ np.where(object_mask)[0] ] )
    full = label_encode(full, cat_fx)
    with open(json_file, 'w') as fp:
        json.dump(cat_fx, fp)
    print ("CatFX:", cat_fx)

    # cut the concatenation again, this time by the requested fractions
    train_end = int( full.shape[0]*train_split )
    valid_end = train_end + int( full.shape[0]*valid_split )
    train_cat = full.iloc[0:train_end,:]
    valid_cat = full.iloc[train_end:valid_end,:]
    test_cat  = full.iloc[valid_end:,:]

    report(train_cat, valid_cat, test_cat)

    # persist the encoded splits for the next call
    print ("Saving processed files *.cat.bz2")
    train_cat.to_csv(atk_train_file+".cat.bz2", compression="bz2", index=False)
    valid_cat.to_csv(atk_valid_file+".cat.bz2", compression="bz2", index=False)
    test_cat.to_csv (atk_test_file+".cat.bz2",  compression="bz2", index=False)

    return train_cat, valid_cat, test_cat, cat_fx


# Objective Functions

## Standard

The following function, called <code>optimize_log_loss</code>, is the one that should be optimized (i.e., minimized) for learning _standard_ and _baseline_ approaches. More specifically, this is the standard binary log loss which is used to train any _standard_ or _baseline_ model.

# $L$ = <code>optimize_log_loss</code>

$$
L = \frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}}\ell(h(\mathbf{x}), y)
$$

where:

$$
\ell(h(\mathbf{x}), y) = \log\left(1+e^{-y\,h(\mathbf{x})}\right)
$$

In [None]:
def optimize_log_loss(preds, train_data):
    '''
    Custom LightGBM objective: gradient and hessian of the average binary
    log loss  log(1 + exp(-y*h))  with respect to the raw scores h.

    preds      : array of raw scores, one per instance
    train_data : object exposing get_label(); the derivation below treats
                 the labels as the factor y of the margin y*h

    returns (grads, hess), two arrays with one entry per instance, both
    scaled by 1/len(preds) so that the optimized quantity is the average loss
    '''
    labels = train_data.get_label()

    # With z = y*h:  1/(1+exp(z)) == 0.5*(1 - tanh(z/2)).
    # The tanh form never overflows, whereas exp(z) becomes inf for large
    # margins and turns the hessian into inf/inf = NaN.
    half_tanh = np.tanh(0.5 * preds * labels)
    prob = 0.5 * (1.0 - half_tanh)                     # 1 / (1 + exp(y*h))

    # d/dh log(1+exp(-y*h)) = -y / (1 + exp(y*h))
    grads = -labels * prob
    # d^2/dh^2 log(1+exp(-y*h)) = y^2 * exp(y*h) / (1+exp(y*h))^2
    #                           = y^2 * prob * (1 - prob)
    hess = labels**2 * 0.25 * (1.0 - half_tanh**2)

    # this is to optimize average logloss
    norm = 1.0/len(preds)
    grads = grads * norm
    hess = hess * norm

    return grads, hess

## Custom

In addition to the standard binary log loss used to train a model, we introduce our custom <code>optimize_non_interferent_log_loss</code>, which is computed as the weighted combination of two objective functions, as follows:

-  $L$ = <code>optimize_log_loss</code> (standard, already seen above);
-  $L^A$ = <code>optimize_log_loss_uma</code> (custom, defined below).

# $L^A$ = <code>optimize_log_loss_uma</code>

This function is used to train a **full** _non-interferent_ model; in other words, full non-interferent models are learned by optimizing (i.e., minimizing) the function which measures the binary log loss **under the maximal attack** possible.

$$
L^A = \frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \log  \left( \sum_{\mathbf{x}' \in \mathit{MaxAtk}({\mathbf{x}},{A})} e^{\ell(h(\mathbf{x}'), y)} \right).
$$

where still:

$$
\ell(h(\mathbf{x}), y) = \log\left(1+e^{-y\,h(\mathbf{x})}\right)
$$

In [None]:
def optimize_log_loss_uma(preds, train_data):
    '''
    Custom LightGBM objective for the log loss under maximal attack.

    preds      : array of raw scores, one per row
    train_data : object exposing get_label() and get_group(); the group
                 sizes partition the rows into consecutive blocks

    For a block with e_i = exp(-y_i*h_i) and S = sum_j (1 + e_j), row i gets
        grad_i = -y_i * e_i / S           hess_i = (e_i/S) * (1 - e_i/S)
    both divided by the number of blocks. When get_group() returns None the
    gradient and hessian are all zeros.

    returns (grads, hess) as float64 arrays shaped like the labels
    '''
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    grads = np.zeros_like(labels, dtype=np.float64)
    hess = np.zeros_like(grads)

    # no grouping information: nothing to accumulate
    if attack_lens is None:
        return grads, hess

    norm = 1.0 / float(len(attack_lens))

    start = 0
    for size in attack_lens:
        stop = start + size
        block_labels = labels[start:stop]

        exp_pl = np.exp(- preds[start:stop] * block_labels)
        inv_sum = 1.0 / np.sum(1.0 + exp_pl)
        share = inv_sum * exp_pl          # e_i / S

        grads[start:stop] = norm * share * (- block_labels)
        hess[start:stop]  = norm * share * (1.0 - share)

        start = stop

    return grads, hess

# <code>optimize_non_interferent_log_loss</code>

$$
\alpha\cdot L^A + (1-\alpha)\cdot L
$$

$$
\alpha \cdot \underbrace{\Bigg[\frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \log  \left( \sum_{\mathbf{x}' \in \mathit{MaxAtk}({\mathbf{x}},{A})} e^{\ell(h(\mathbf{x}'), y)} \right)\Bigg]}_{L^A} + (1-\alpha) \cdot \underbrace{\Bigg[\frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \ell(h(\mathbf{x}), y)\Bigg]}_{L}
$$

In [None]:
def optimize_non_interferent_log_loss(preds, train_data, alpha=1.0):
    '''
    Custom LightGBM objective: alpha * L^A + (1 - alpha) * L.

    preds      : array of raw scores
    train_data : dataset handed over to both underlying objectives
    alpha      : weight of the under-max-attack term; the plain log loss
                 gets the weight 1 - alpha

    returns (grads, hess), the weighted sums of the gradients and hessians
    produced by optimize_log_loss_uma and optimize_log_loss
    '''
    # (gradient, hessian) of the loss under maximal attack, then of the plain loss
    uma_terms = optimize_log_loss_uma(preds, train_data)
    plain_terms = optimize_log_loss(preds, train_data)

    # blend gradient with gradient and hessian with hessian
    grads, hess = [alpha*uma + (1.0-alpha)*plain
                   for uma, plain in zip(uma_terms, plain_terms)]

    return grads, hess

## Using one objective function for both _standard_ and _non-interferent_ learning

The advantage of the <code>optimize_non_interferent_log_loss</code> function defined above is that we can wrap it so that we can use it as the only objective function (<code>fobj</code>) passed in to LightGBM. 

In other words, if we call <code>fobj=optimize_non_interferent_log_loss</code> with <code>alpha=0.0</code>, this will end up optimizing (i.e., minimizing) the "vanilla" objective function (i.e., the standard binary log loss, defined by the function <code>optimize_log_loss</code> above).

Conversely, calling <code>fobj=optimize_non_interferent_log_loss</code> with <code>alpha=1.0</code> turns into optimizing (i.e., minimizing) the full non-interferent objective function (i.e., the custom binary log loss under max attack, defined by the function <code>optimize_log_loss_uma</code> above).

Anything that sits in between (i.e., <code>0 < alpha < 1</code>) optimizes an objective function that trades off between the standard and the full non-interferent term.

# Evaluation Metrics

## Standard

The following function is the one used for evaluating the quality of the learned model (either _standard_, _adversarial-boosting_, or _non-interferent_). This is the standard <code>avg_log_loss</code>.

In [None]:
def logistic(x):
    '''Logistic (sigmoid) function 1 / (1 + exp(-x)); accepts scalars or arrays.'''
    denominator = 1.0 + np.exp(-x)
    return 1.0/denominator

In [None]:
def logit(p):
    '''Log-odds log(p / (1 - p)) of a probability p; accepts scalars or arrays.'''
    odds = p/(1-p)
    return np.log(odds)

# <code>avg_log_loss</code>

In [None]:
# custom evaluation metric in the form expected by LightGBM's feval
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    '''
    Mean of log(1 + exp(-y*h)) over all rows.

    preds      : array of raw scores h
    train_data : object exposing get_label() for the labels y

    returns ('avg_binary_log_loss', mean loss, False); the final False says
    that lower values are better
    '''
    exponent = -preds*train_data.get_label()
    per_row = np.log(1.0 + np.exp(exponent))

    return 'avg_binary_log_loss', np.mean(per_row), False

## Custom

Similarly to what we have done for <code>fobj</code>, <code>feval</code> can be computed from a weighted combination of two evaluation metrics:

-  <code>avg_log_loss</code> (standard, defined above);
-  <code>avg_log_loss_uma</code> (custom, defined below).

# <code>avg_log_loss_uma</code>

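This is the binary log loss modified to operate on groups of perturbed instances: within each group the largest per-instance loss is taken, and these maxima are then averaged over all groups.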

In [None]:
# Our custom metrics
def binary_log_loss(pred, true_label):
    '''Binary log loss log(1 + exp(-pred * true_label)) of a raw score against its label.'''
    exponent = -pred * true_label
    return np.log(1.0 + np.exp(exponent))

# custom evaluation metric in the form expected by LightGBM's feval
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    '''
    Average, over the groups, of the largest binary log loss inside each group.

    preds      : array of raw scores, one per row
    train_data : object exposing get_label() and get_group(); the group
                 sizes partition the rows into consecutive blocks

    returns ('avg_binary_log_loss_under_max_attack', value, False); the value
    is 0.0 when get_group() returns None
    '''
    metric_name = 'avg_binary_log_loss_under_max_attack'

    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    # without grouping information the metric is reported as 0.0
    if attack_lens is None:
        return metric_name, 0.0, False

    worst_per_group = []
    start = 0
    for size in attack_lens:
        stop = start + size
        # keep only the highest loss found in this block of rows
        worst_per_group.append(max(binary_log_loss(h, t)
                                   for h, t in zip(preds[start:stop], labels[start:stop])))
        start = stop

    return metric_name, np.mean(worst_per_group), False

# <code>feval=avg_non_interferent_log_loss</code>

Used for measuring the validity of any model (either _standard_, _baseline_, or _non-interferent_). More precisely, <code>avg_non_interferent_log_loss</code> is the weighted sum of the binary log loss and the binary log loss under maximal attack.

In [None]:
def avg_non_interferent_log_loss(preds, train_data, alpha=1.0):
    '''
    Evaluation metric: alpha * avg_log_loss_uma + (1 - alpha) * avg_log_loss.

    preds      : array of raw scores
    train_data : dataset handed over to both underlying metrics
    alpha      : weight of the under-max-attack loss

    returns (name, value, False); the name embeds alpha with two decimals
    and False says that lower values are better
    '''
    # index 1 of each metric tuple is the numeric value
    loss_uma = avg_log_loss_uma(preds, train_data)[1]
    loss_plain = avg_log_loss(preds, train_data)[1]

    metric_name = 'avg_non_interferent_log_loss [alpha={:.2f}]'.format(alpha)
    weighted_loss = alpha*loss_uma + (1.0-alpha)*loss_plain

    return metric_name, weighted_loss, False

# Adversarial Boosting

In [None]:
def gen_adv_boosting_data(model, data, groups):
    ''' 
    Build the next adversarial-boosting training set.

    model  : object whose predict(features) returns one raw score per row;
             a score > 0 is read as class +1, anything else as class -1
    data   : data matrix with all valid attacks (last column is label)
    groups : sizes of the consecutive row blocks of `data`; the first row of
             a block is the original instance, the others are its attacks
    returns the new data matrix and the new group sizes (a list)

    For every block the original row is kept. If the block has attacks, the
    attack with the smallest label*prediction product is kept as well; since
    predictions are binarized to -1/+1 this is the first misclassified
    attack, or the first attack when none is misclassified.

    WARNING: currently works only for binary classification
    '''
    # score the dataset
    labels = data[:,-1]

    scores = model.predict(data[:,:-1]) # exclude labels
    # binarize to -1 / +1 (np.float, used here before, no longer exists in NumPy)
    predictions = np.where(scores > 0, 1.0, -1.0)

    # +1 where the prediction agrees with the label, -1 where it does not
    matchings = labels * predictions

    # select original data + attacked instances
    new_selected = [] # row ids (in `data`) of the selected instances
    new_groups   = []

    offset = 0
    for g in groups:
        if g==0:
            print ("Error !!!!")
        elif g==1:
            # there are no attacks, just add original
            new_selected.append(offset)
            new_groups.append(1)
        else:
            # get a slice of the matching scores
            g_matchings = matchings[offset:offset+g]

            # argmin runs over the attacks only (position 0 is the original),
            # so +1 restores the position inside the block and +offset turns
            # it into a row id of `data`
            adv_instance = offset + 1 + int(np.argmin(g_matchings[1:]))

            # add original and adversarial
            new_selected += [offset, adv_instance]
            new_groups.append(2)

        offset += g

    new_dataset = data[new_selected,:]

    return new_dataset, new_groups

In [None]:
def extend_adv_boosting_model(train, valid, cat_fx, input_model=None, num_trees=1, params=None):
    ''' 
    Train `num_trees` boosting rounds with the plain log-loss objective.

    train, valid : data matrices (last column is label)
    cat_fx       : categorical features forwarded to lightgbm.Dataset;
                   None or an empty collection is replaced by "auto"
    input_model  : forwarded to lightgbm.train as init_model
    num_trees    : forwarded to lightgbm.train as num_boost_round
    params       : LightGBM parameters
    returns the trained model and the dictionary filled through evals_result
    '''
    categorical = "auto" if (cat_fx is None or len(cat_fx)==0) else cat_fx

    def as_dataset(matrix):
        # features are all the columns but the last, which holds the label
        return lightgbm.Dataset(data=matrix[:,:-1],
                                label=matrix[:,-1],
                                categorical_feature = categorical)

    lgbm_train = as_dataset(train)
    lgbm_valid = as_dataset(valid)

    lgbm_info = {}
    train_options = dict(num_boost_round = num_trees,
                         init_model      = input_model,
                         fobj            = optimize_log_loss,
                         feval           = avg_log_loss,
                         evals_result    = lgbm_info,
                         valid_sets      = [lgbm_train, lgbm_valid],
                         valid_names     = ['train', 'valid'],
                         verbose_eval    = 25)
    lgbm_model = lightgbm.train(params, lgbm_train, **train_options)

    return lgbm_model, lgbm_info

In [None]:
def AdvBoosting(atk_train, valid, trees, 
                 cat_fx,
                 params,
                 output_model_file,
                 partial_save=100, 
                 adv_rounds=1):
    ''' 
    Adversarial boosting: grow a model a few trees at a time, re-selecting
    the adversarial instances with the current model before every batch.

    atk_train: DataFrame with all valid attacks; needs an 'instance_id'
               column, and its last column is used as label. The group sizes
               are the per-id counts in ascending id order, so rows are
               assumed to be contiguous per instance and sorted by id, with
               the original instance first -- TODO confirm against the data
    valid: validation DataFrame, same layout ('instance_id' dropped if present)
    trees: total number of trees to be produced
    cat_fx: categorical columns of atk_train, given either as column names
            or as integer positions within atk_train.columns
    params: LightGBM parameters; 'learning_rate' and 'num_leaves' are read
            when naming partial models
    output_model_file: prefix for the temporary and the partial model files
    partial_save: a partial model is written whenever the tree counter is a
                  multiple of this value
    adv_rounds: adversarial instance injecting frequency (trees per batch)

    returns the final model, the evaluation info of the last batch, the best
    validation loss and the 1-based round at which it was observed
    '''
    metric = 'avg_binary_log_loss'

    # temp lgbm file
    temp = output_model_file+".tmp"

    # get groups (number of rows of every instance id)
    atk_groups = atk_train['instance_id'].value_counts().sort_index().values

    # the id is not a feature: drop it from BOTH matrices so that training
    # and validation data have the same columns
    train_frame = atk_train.drop(columns=['instance_id'])
    valid_frame = valid.drop(columns=['instance_id'], errors='ignore')

    # categorical features as positions inside the feature matrix: integer
    # entries are first mapped back to column names of atk_train, then all
    # names are located among the feature columns (label excluded)
    requested = [] if cat_fx is None else cat_fx
    cat_names = [atk_train.columns[c] if isinstance(c, (int, np.integer)) else c
                 for c in requested]
    feature_names = train_frame.columns[:-1]
    cat_fx = [int(x) for x in np.where(feature_names.isin(cat_names))[0]]

    # prepare data (avoiding pandas)
    atk_data = train_frame.values
    valid = valid_frame.values

    # row id of the first row of every group, i.e. of the original instances
    original_ids = np.cumsum(atk_groups[:-1])
    original_ids = np.insert(original_ids, 0, 0)

    # train first trees on the original instances only
    model, model_info = extend_adv_boosting_model(atk_data[original_ids, :], 
                                                  valid,
                                                  cat_fx=cat_fx,
                                                  input_model=None, 
                                                  num_trees=min(adv_rounds, trees), 
                                                  params=params )

    valid_losses = model_info['valid'][metric]
    best_loss = np.min(valid_losses)
    best_round = int(np.argmin(valid_losses)) + 1

    # train remaining trees; t is the 1-based index of the first tree of the batch
    for t in range(adv_rounds+1, trees+1, adv_rounds):
        # attack dataset
        adv_data, _ = gen_adv_boosting_data(model, atk_data, atk_groups)

        # train additional trees on top of the saved model, never exceeding
        # the requested total
        model.save_model(temp)
        model, model_info = extend_adv_boosting_model(adv_data, 
                                                      valid,
                                                      cat_fx=cat_fx,
                                                      input_model=temp, 
                                                      num_trees=min(adv_rounds, trees - t + 1), 
                                                      params=params)

        # model_info holds one loss per tree of this batch only
        valid_losses = model_info['valid'][metric]
        if np.min(valid_losses) < best_loss:
            best_loss = np.min(valid_losses)
            best_round = t + int(np.argmin(valid_losses))

        # save partial model
        if t % partial_save == 0 and t != trees:
            partial_filename = "{:s}_T{:d}-of-{:d}_S{:04d}_L{:d}.model.tmp".format(output_model_file, 
                                                                                   t, 
                                                                                   trees, 
                                                                                   int(params['learning_rate'] * 1000),
                                                                                   params['num_leaves']
                                                                                  )

            print("Save partial model to {}".format(partial_filename))
            model.save_model(filename=partial_filename)


    return model, model_info, best_loss, best_round

# Training Standard GBDT (_baseline 1_)

In [None]:
def train_gradient_boosting_baseline( train_file, valid_file, test_file,
                                output_model_file):
    '''
    Grid-search a standard GBDT trained with the plain log loss.

    train_file, valid_file, test_file : CSV files handed to load_atk_train_valid_test
    output_model_file                 : prefix of the saved model files

    For every number of trees, all (learning_rate, num_leaves) pairs are
    trained and the model with the lowest validation loss is saved; the file
    name carries the hyper-parameters and the best round of that model.

    returns a DataFrame with one row per trained configuration
    '''
    metric = 'avg_binary_log_loss'
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # rows are collected in a list: DataFrame.append no longer exists in pandas 2
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # get index of categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx) )[0]
    cat_fx = list([int(x) for x in cat_fx])  
    print ("CatFX:", train.columns.values[cat_fx])

    train_values = train.values
    valid_values = valid.values

    for num_trees in [50, 100, 150, 200, 250]:
        best_model = None
        best_loss = np.inf
        best_setting = None   # (learning_rate, num_leaves, best_round) of best_model
        for learning_rate in [0.001, 0.01, 0.1, 1.0]:
            for num_leaves in [8, 16, 24]:
                # datasets (last column is the label)
                lgbm_train = lightgbm.Dataset(data=train_values[:,:-1], 
                                              label=train_values[:,-1],
                                              categorical_feature = cat_fx)

                lgbm_valid = lightgbm.Dataset(data=valid_values[:,:-1], 
                                              label=valid_values[:,-1],
                                              categorical_feature = cat_fx)

                # run train
                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves} 
                lgbm_info = {}
                lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                            num_boost_round = num_trees,
                                            fobj            = optimize_log_loss, 
                                            feval           = avg_log_loss,
                                            evals_result    = lgbm_info,
                                            valid_sets      = [lgbm_train, lgbm_valid], 
                                            valid_names     = ['train', 'valid'],
                                            verbose_eval    = 25)

                valid_losses = lgbm_info['valid'][metric]
                best_valid_iter = int(np.argmin(valid_losses))
                run_loss = valid_losses[best_valid_iter]

                # remember the winner together with ITS hyper-parameters, so
                # the saved file is not named after the last grid point
                if run_loss < best_loss:
                    best_model = lgbm_model
                    best_loss = run_loss
                    best_setting = (learning_rate, num_leaves, best_valid_iter + 1)

                # update experimental results
                rows.append({'num_trees': num_trees, 
                             'learning_rate':learning_rate,
                             'num_leaves':num_leaves, 
                             'best_round':best_valid_iter+1, 
                             'avg_binary_log_loss':run_loss})

        if best_model is None:
            # every run produced a non-comparable (NaN) loss: nothing to save
            print ("No valid model found for num_trees =", num_trees)
            continue

        # save file
        best_learning_rate, best_num_leaves, best_round = best_setting
        model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                        num_trees,
                                                                        int(best_learning_rate*1000),
                                                                        best_num_leaves,
                                                                        best_round
                                                                       )

        best_model.save_model(model_file_name, num_iteration=best_round)
        print ("Model saved to", model_file_name)


    return pd.DataFrame(rows, columns=columns)

In [None]:
# Switch for the standard GBDT baseline: set to False to skip this cell.
RUN_STD_GBDT = True

if RUN_STD_GBDT:
    # train / valid / test files of the original dataset
    std_gbdt_files = ("../data/census/train_ori.csv.bz2",
                      "../data/census/valid_ori.csv.bz2",
                      "../data/census/test_ori.csv.bz2")

    experiments = train_gradient_boosting_baseline(*std_gbdt_files,
                                                   "../out/models/std_gbdt_census")

    # keep the grid-search table next to the saved models, then show it
    experiments.to_csv('../out/models/std_gbdt_census.csv', index=False)
    print (experiments)

# Training Adversarial Boosting (_baseline 2_)

In [None]:
def train_adversarial_boosting(train_file, valid_file, test_file, output_model_file):
    '''
    Grid-search the adversarial boosting baseline.

    train_file, valid_file, test_file : CSV files handed to load_atk_train_valid_test
    output_model_file                 : prefix of the saved model files (also
                                        forwarded to AdvBoosting)

    For every number of trees, all (learning_rate, num_leaves) pairs are
    trained with AdvBoosting and the model with the lowest validation loss is
    saved, truncated at its best round; the file name carries the
    hyper-parameters of that model.

    returns a DataFrame with one row per trained configuration
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'best_round', 'avg_binary_log_loss']
    # rows are collected in a list: DataFrame.append no longer exists in pandas 2
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # get index of categorical features (positions within train.columns)
    cat_fx = np.where(train.columns.isin(cat_fx) )[0]
    cat_fx = list([int(x) for x in cat_fx])  
    print ("CatFX:", train.columns.values[cat_fx])


    for num_trees in [50, 100, 150, 200, 250]:
        best_model = None
        best_loss = np.inf
        best_setting = None   # (learning_rate, num_leaves, best_round) of best_model
        for learning_rate in [0.001, 0.01, 0.1, 1.0]:
            for num_leaves in [8, 16, 24]:

                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves} 

                lgbm_model, lgbm_info, run_loss, run_best_round = AdvBoosting(train,
                                                    valid,
                                                    trees=num_trees, 
                                                    cat_fx = cat_fx, 
                                                    output_model_file=output_model_file, 
                                                    adv_rounds=1,
                                                    params=lgbm_params)

                # reduce to one number, whether a scalar or a sequence of
                # per-round losses was returned
                run_loss = float(np.min(run_loss))
                run_best_round = int(run_best_round)

                # update experimental results
                rows.append({'num_trees': num_trees, 
                             'learning_rate':learning_rate,
                             'num_leaves':num_leaves, 
                             'best_round':run_best_round, 
                             'avg_binary_log_loss':run_loss})

                # remember the winner of the grid instead of whichever
                # configuration happened to run last
                if run_loss < best_loss:
                    best_model = lgbm_model
                    best_loss = run_loss
                    best_setting = (learning_rate, num_leaves, run_best_round)

        if best_model is None:
            # every run produced a non-comparable (NaN) loss: nothing to save
            print ("No valid model found for num_trees =", num_trees)
            continue

        # save file
        best_learning_rate, best_num_leaves, best_round = best_setting
        model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_R{:d}.model".format(output_model_file,
                                                                                num_trees,
                                                                                int(best_learning_rate*1000),
                                                                                best_num_leaves,
                                                                                best_round
                                                                               )
        best_model.save_model(model_file_name, num_iteration=best_round)
        print ("Model saved to", model_file_name)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Switch for the adversarial boosting runs: set to False to skip this cell.
RUN_ADV_BOOSTING = True

if RUN_ADV_BOOSTING:
    # one complete experiment per value of B
    for B in (5, 15, 150, 300):

        # train / valid / test files for this B
        split_files = [pattern.format(B) for pattern in ("../data/census/train_B{:d}.csv.bz2",
                                                         "../data/census/valid_B{:d}.csv.bz2",
                                                         "../data/census/test_B{:d}.csv.bz2")]

        experiments = train_adversarial_boosting(*split_files,
                                                 "../out/models/adv_boosting_census_B{:d}".format(B))

        # keep the grid-search table next to the saved models, then show it
        experiments.to_csv('../out/models/adv_boosting_census_B{:d}.csv'.format(B), index=False)
        print (experiments)

# Training Non-Interferent GBDT

In [None]:
def _attack_group_sizes(frame):
    '''Return the number of consecutive rows of every instance_id in `frame`, or None without that column.'''
    if 'instance_id' not in frame.columns:
        return None
    # sort=False keeps the groups in order of first appearance, i.e. row order
    return frame.groupby('instance_id', sort=False).size().values


def train_non_interferent(train_file, valid_file, test_file, output_model_file):
    '''
    Grid-search the non-interferent GBDT.

    train_file, valid_file, test_file : CSV files handed to load_atk_train_valid_test
    output_model_file                 : prefix of the saved model files

    For every (num_trees, alpha) pair, all (learning_rate, num_leaves) pairs
    are trained with optimize_non_interferent_log_loss and the model with the
    lowest validation avg_non_interferent_log_loss is saved; the file name
    carries the hyper-parameters and the best round of that model.

    The datasets carry the attack groups (rows sharing an instance_id), since
    the under-max-attack objective and metric read them through get_group()
    and are identically zero without them. Rows of one instance are assumed
    to be contiguous -- TODO confirm against the data files.

    returns a DataFrame with one row per trained configuration
    '''
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'alpha', 'best_round', 'avg_binary_log_loss']
    # rows are collected in a list: DataFrame.append no longer exists in pandas 2
    rows = []

    # load train/valid/test
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # get index of categorical features 
    cat_fx = np.where(train.columns.isin(cat_fx) )[0]
    cat_fx = list([int(x) for x in cat_fx])  
    print ("CatFX:", train.columns.values[cat_fx])

    # sizes of the attack groups (None when there is no instance_id column)
    train_groups = _attack_group_sizes(train)
    valid_groups = _attack_group_sizes(valid)

    # NOTE(review): the feature matrix is every column but the last one, so an
    # instance_id column, if present, is still used as a feature -- confirm
    # whether it should be dropped
    train_values = train.values
    valid_values = valid.values

    for num_trees in [50, 100, 150, 200, 250]:
        for alpha in [0.25, 0.50, 0.75, 1.00]:
            best_model = None
            best_loss = np.inf
            best_setting = None   # (learning_rate, num_leaves, best_round) of best_model
            # key under which LightGBM records the metric for this alpha
            awesome_hack = "avg_non_interferent_log_loss" + " [alpha={:.2f}]".format(alpha)

            for learning_rate in [0.001, 0.01, 0.1, 1.0]:
                for num_leaves in [8, 16, 24]:
                    # datasets (last column is the label)
                    lgbm_train = lightgbm.Dataset(data=train_values[:,:-1], 
                                                  label=train_values[:,-1],
                                                  group=train_groups,
                                                  categorical_feature = cat_fx)

                    lgbm_valid = lightgbm.Dataset(data=valid_values[:,:-1], 
                                                  label=valid_values[:,-1],
                                                  group=valid_groups,
                                                  categorical_feature = cat_fx)

                    # run train
                    lgbm_params = { 'learning_rate': learning_rate, 
                                    'num_leaves': num_leaves} 
                    lgbm_info = {}
                    lgbm_model = lightgbm.train(lgbm_params, lgbm_train, 
                                                num_boost_round = num_trees,
                                                fobj            = functools.partial(optimize_non_interferent_log_loss, alpha=alpha),
                                                feval           = functools.partial(avg_non_interferent_log_loss, alpha=alpha),
                                                evals_result    = lgbm_info,
                                                valid_sets      = [lgbm_train, lgbm_valid], 
                                                valid_names     = ['train', 'valid'],
                                                verbose_eval    = 25)

                    valid_losses = lgbm_info['valid'][awesome_hack]
                    best_valid_iter = int(np.argmin(valid_losses))
                    run_loss = valid_losses[best_valid_iter]

                    # remember the winner together with ITS hyper-parameters,
                    # so the saved file is not named after the last grid point
                    if run_loss < best_loss:
                        best_model = lgbm_model
                        best_loss = run_loss
                        best_setting = (learning_rate, num_leaves, best_valid_iter + 1)

                    # update experimental results
                    rows.append({'num_trees': num_trees, 
                                 'learning_rate':learning_rate,
                                 'num_leaves':num_leaves, 
                                 'alpha': alpha,
                                 'best_round':best_valid_iter+1, 
                                 'avg_binary_log_loss':run_loss})

            if best_model is None:
                # every run produced a non-comparable (NaN) loss: nothing to save
                print ("No valid model found for num_trees =", num_trees, "alpha =", alpha)
                continue

            # save file
            best_learning_rate, best_num_leaves, best_round = best_setting
            model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(output_model_file,
                                                                                    num_trees,
                                                                                    int(best_learning_rate*1000),
                                                                                    best_num_leaves,
                                                                                    int(alpha * 100),
                                                                                    best_round
                                                                                   )
            best_model.save_model(model_file_name, num_iteration=best_round)
            print ("Model saved to", model_file_name)


    return pd.DataFrame(rows, columns=columns)

In [None]:
# Switch for the non-interferent GBDT runs: set to False to skip this cell.
RUN_NON_INTERFERENT = True

if RUN_NON_INTERFERENT:
    # one complete experiment per value of B
    for B in (5, 15, 150, 300):

        # train / valid / test files for this B
        split_files = [pattern.format(B) for pattern in ("../data/census/train_B{:d}.csv.bz2",
                                                         "../data/census/valid_B{:d}.csv.bz2",
                                                         "../data/census/test_B{:d}.csv.bz2")]

        experiments = train_non_interferent(*split_files,
                                            "../out/models/non_interferent_census_B{:d}".format(B))

        # keep the grid-search table next to the saved models, then show it
        experiments.to_csv('../out/models/non_interferent_census_B{:d}.csv'.format(B), index=False)
        print (experiments)

# Training SVM

In [None]:
def train_svm(train_file, valid_file, test_file, output_model_file):
    '''
    Train one RBF-kernel SVM per value of the regularization parameter C.

    train_file, valid_file, test_file : CSV files handed to load_atk_train_valid_test
    output_model_file                 : prefix of the pickled model files

    Every model is pickled to "<output_model_file>_C<1000*C>.model" and
    evaluated on the validation split with the average binary log loss,
    computed on the log-odds of the predicted positive-class probability.

    returns a DataFrame with one row per value of C
    '''
    columns = ['C', 'avg_binary_log_loss']
    # rows are collected in a list: DataFrame.append no longer exists in pandas 2
    rows = []

    # load train/valid/test (last column is the label)
    train, valid, test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)
    X_train = train.iloc[:,:-1].values
    raw_train_labels = train.iloc[:,-1].values
    # relabel -1 as 0 on a fresh array, leaving the DataFrame untouched
    y_train = np.where(raw_train_labels == -1, 0, raw_train_labels)

    X_valid = valid.iloc[:,:-1].values
    # the log loss below expects labels in {-1, +1}
    y_valid = np.where(valid.iloc[:,-1].values > 0, 1.0, -1.0)

    # keeps the log-odds finite when a probability is exactly 0 or 1
    eps = 1e-15

    for c in [0.01, 0.1, 1, 10]:

        # C must be passed explicitly, otherwise every model uses the default
        model = SVC(C=c, kernel='rbf', probability=True)
        model.fit(X_train, y_train)

        # probability of the positive class (label 1), turned into a raw
        # score so that binary_log_loss gives -log p(true class)
        positive_column = list(model.classes_).index(1)
        p_positive = np.clip(model.predict_proba(X_valid)[:,positive_column], eps, 1.0 - eps)
        scores = np.log(p_positive / (1.0 - p_positive))
        cur_avg_binary_log_loss = np.mean(binary_log_loss(scores, y_valid))

        model_file_name = "{:s}_C{:04d}.model".format(output_model_file, int(c * 1000))

        with open(model_file_name, 'wb') as fout:
            pickle.dump(model, fout)

        print ("Model saved to", model_file_name)

        # update experimental results
        rows.append({'C': c, 
                     'avg_binary_log_loss':cur_avg_binary_log_loss})

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Switch for the SVM run: set to False to skip this cell.
RUN_SVM = True

if RUN_SVM:
    # train / valid / test files of the original dataset
    svm_files = ("../data/census/train_ori.csv.bz2",
                 "../data/census/valid_ori.csv.bz2",
                 "../data/census/test_ori.csv.bz2")

    experiments = train_svm(*svm_files, "../out/models/svm_census")

    # keep the results table next to the saved models, then show it
    experiments.to_csv('../out/models/svm_census.csv', index=False)
    print (experiments)