# Non-Interferent Training

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [52]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [53]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join
from nilib import *

# Paths to dataset and model files

In [56]:
# Name of the dataset; selects the data and model sub-directories below.
DATASET_NAME = "wine"

In [57]:
# Input data and output model directories for the selected dataset.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)

# "*_ori" files are fixed names; "*_B{}" entries are templates whose
# placeholder is filled with a training budget B via str.format().
TRAINING_FILENAME = "/".join((DATASET_DIR, "train_ori.csv.bz2"))
TRAINING_FILENAME_ATT = "/".join((DATASET_DIR, "train_B{}.csv.bz2"))
VALIDATION_FILENAME = "/".join((DATASET_DIR, "valid_ori.csv.bz2"))
VALIDATION_FILENAME_ATT = "/".join((DATASET_DIR, "valid_B{}.csv.bz2"))
TEST_FILENAME = "/".join((DATASET_DIR, "test_ori.csv.bz2"))
TEST_FILENAME_ATT = "/".join((DATASET_DIR, "test_B{}.csv.bz2"))

# Model file prefix template: placeholders are (dataset name, budget B).
MODEL_FILENAME = "/".join((MODELS_DIR, "non-interferent_{}_B{}"))

In [58]:
# Budgets B to train for; each one selects the matching *_B{B} data files
# and gets its own set of saved models.
TRAINING_BUDGETS = [20, 30, 40]

# Non-Interferent GBDT

In [66]:
def extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                            alpha=1.0, num_trees=1, params=None):
    """Grow a LightGBM model one tree at a time with the non-interferent loss.

    The first tree is fit with ``optimize_log_loss``; every following tree
    continues from the current model (``init_model``) and is fit with
    ``optimize_non_interferent_log_loss`` on per-instance weights derived
    from the current predictions over each instance's group of rows.

    Parameters
    ----------
    atk_train, atk_valid : pandas.DataFrame
        Must have the same number of columns and an ``instance_id`` column.
        The code drops the first column and uses the last one as the label,
        so ``instance_id`` is assumed to be the first column -- TODO confirm.
        Rows are assumed to be sorted by ``instance_id`` with the first row
        of each group being the original (un-attacked) instance; verify
        against the loader.
    cat_fx : list-like of column names or None
        Names of the categorical feature columns. ``None`` or empty means
        LightGBM's ``"auto"`` detection is used.
    alpha : float
        Forwarded to the non-interferent objective and evaluation metric.
    num_trees : int
        Total number of boosting rounds. One tree is always trained, even
        if ``num_trees`` is smaller than 1.
    params : dict or None
        LightGBM training parameters; ``None`` is treated as ``{}``.

    Returns
    -------
    (lgbm_model, lgbm_info)
        The final booster and the evaluation history. The history under
        ``lgbm_info['valid']`` for the non-interferent metric is the
        concatenation of the results reported by every round.
    """
    assert atk_train.shape[1]==atk_valid.shape[1], "Train/Valid Mismatch!"

    # Normalise optional inputs *before* they are used: isin(None) raises,
    # and lightgbm.train cannot work with params=None.
    if cat_fx is None:
        cat_fx = []
    if params is None:
        params = {}

    def _ni_weights(preds, labels, groups):
        """Return one weight per group: 1 / max_j(1 + exp(-pred_j * label_j))."""
        w = np.ones(len(groups))
        offset = 0
        for instance_id, g in enumerate(groups):
            exp_pl = np.exp(- preds[offset:offset+g] * labels[offset:offset+g])
            w[instance_id] = 1.0 / np.max(1.0 + exp_pl)
            offset += g
        return w

    # Number of rows per instance, ordered by instance id.
    train_groups = atk_train['instance_id'].value_counts().sort_index().values
    valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # Row offset of the first row of every training group.
    original_train_ids = np.insert(np.cumsum(train_groups[:-1]), 0, 0)

    # get index of categorical features 
    cat_idx = [int(x) for x in np.where(atk_train.columns.isin(cat_fx))[0]]
    print ("CatFX:", atk_train.columns.values[cat_idx])

    # remove instance id (first column) and shift the feature indices accordingly
    atk_train = atk_train.iloc[:,1:].values
    atk_valid = atk_valid.iloc[:,1:].values
    cat_idx = [x - 1 for x in cat_idx]
    categorical = cat_idx if cat_idx else "auto"

    # Only the first row of each group is used for fitting.
    unatk_train = atk_train[original_train_ids,:]

    # Loop invariants shared by all boosting rounds.
    train_features = atk_train[:,:-1]
    train_labels = atk_train[:,-1]
    eval_fn = functools.partial(avg_non_interferent_log_loss, alpha=alpha)
    metric_key = 'avg_non_interferent_log_loss [alpha={:.2f}]'.format(alpha)

    # -------------------------
    # train first iteration
    lgbm_train = lightgbm.Dataset(data=unatk_train[:,:-1], 
                                  label=unatk_train[:,-1],
                                  categorical_feature = categorical)
    
    lgbm_valid = lightgbm.Dataset(data=atk_valid[:,:-1], 
                                  label=atk_valid[:,-1],
                                  group=valid_groups,
                                  categorical_feature = categorical)

    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train, 
                                num_boost_round = 1,
                                fobj  = optimize_log_loss, 
                                feval = eval_fn,
                                evals_result = lgbm_info,
                                valid_sets   = [lgbm_valid], 
                                valid_names  = ['valid'],
                                verbose_eval=25)

    # -------------------------
    # train other iterations
    for _ in range(1, num_trees):

        # weights come from the current predictions on all (attacked) rows
        train_preds = lgbm_model.predict(train_features)
        train_weights = _ni_weights(train_preds, train_labels, train_groups)

        # Datasets are rebuilt every round because the weights change and
        # the model is continued through init_model.
        lgbm_train = lightgbm.Dataset(data=unatk_train[:,:-1], 
                                      label=unatk_train[:,-1],
                                      weight=train_weights,
                                      categorical_feature = categorical)

        lgbm_valid = lightgbm.Dataset(data=atk_valid[:,:-1], 
                                      label=atk_valid[:,-1],
                                      group=valid_groups,
                                      categorical_feature = categorical)

        new_lgbm_info = {}
        lgbm_model = lightgbm.train(params, lgbm_train, 
                                    num_boost_round = 1, 
                                    init_model = lgbm_model,
                                    fobj  = functools.partial(optimize_non_interferent_log_loss, alpha=alpha), 
                                    feval = eval_fn,
                                    evals_result = new_lgbm_info,
                                    valid_sets   = [lgbm_valid],
                                    valid_names  = ['valid'],
                                    verbose_eval=25)

        # Each call only reports its own round; append it to the full history.
        lgbm_info['valid'][metric_key] += new_lgbm_info['valid'][metric_key]

    return lgbm_model, lgbm_info

In [69]:
def train_non_interferent(train_file, valid_file, test_file, output_model_file, alpha=1.0,
                          num_trees_grid=(200,),
                          learning_rates=(0.01, 0.05, 0.1),
                          num_leaves_grid=(8, 16, 24)):
    """Grid-search non-interferent models and save one model file per setting.

    For every combination of ``num_trees``, ``learning_rate`` and
    ``num_leaves`` a model is trained with ``extend_non_interf_model``,
    saved to disk, and summarised as one row of the returned table.

    Parameters
    ----------
    train_file, valid_file, test_file : str
        Paths handed to ``load_atk_train_valid_test``. The test split is
        loaded but not used by this function.
    output_model_file : str
        Path prefix of the saved models; the hyper-parameters and the best
        validation round are appended to build each file name.
    alpha : float
        Forwarded to ``extend_non_interf_model``.
    num_trees_grid, learning_rates, num_leaves_grid : iterable
        Hyper-parameter values to explore. The defaults reproduce the
        grid that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per trained model with columns ``num_trees``,
        ``learning_rate``, ``num_leaves``, ``alpha``, ``best_round``,
        ``metric`` and ``filename``.
    """
    columns = ['num_trees', 'learning_rate', 'num_leaves', 'alpha', 'best_round', 'metric', 'filename']

    # Create the output directory up front so a missing folder does not
    # surface only after a complete training run.
    out_dir = os.path.dirname(output_model_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # load train/valid/test
    atk_train, atk_valid, _atk_test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # Key under which extend_non_interf_model records the validation metric.
    awesome_hack = 'avg_non_interferent_log_loss [alpha={:.2f}]'.format(alpha)

    # Rows are collected in a list; DataFrame.append is gone in pandas >= 2.0.
    rows = []
    for num_trees in num_trees_grid:
        for learning_rate in learning_rates:
            for num_leaves in num_leaves_grid:

                lgbm_params = { 'learning_rate': learning_rate, 
                                'num_leaves': num_leaves} 
                lgbm_model, lgbm_info = extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                            alpha=alpha, num_trees=num_trees, params=lgbm_params)

                # round (0-based) with the lowest validation metric
                valid_history = lgbm_info['valid'][awesome_hack]
                best_valid_iter = int(np.argmin(valid_history))

                # round() instead of bare int(): e.g. 0.29 * 100 is
                # 28.999999999999996 and would otherwise be truncated to 28.
                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(
                    output_model_file,
                    num_trees,
                    int(round(learning_rate * 1000)),
                    num_leaves,
                    int(round(alpha * 100)),
                    best_valid_iter + 1)

                # update experimental results
                rows.append({'num_trees': num_trees, 
                             'learning_rate': learning_rate,
                             'num_leaves': num_leaves, 
                             'alpha': alpha,
                             'best_round': best_valid_iter + 1, 
                             'metric': valid_history[best_valid_iter],
                             'filename': model_file_name})

                # NOTE(review): the saved model contains all trained rounds,
                # not only the first `best_round` ones -- confirm that the
                # consumer truncates using the R<n> part of the file name.
                lgbm_model.save_model(model_file_name)
                print ("Model saved to", model_file_name)

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Train the model grid once per budget, store the result table next to the
# models as CSV, and report the file with the lowest validation metric.
for B in TRAINING_BUDGETS:
    model_prefix = MODEL_FILENAME.format(DATASET_NAME, B)

    experiments = train_non_interferent(
        TRAINING_FILENAME_ATT.format(B),
        VALIDATION_FILENAME_ATT.format(B),
        TEST_FILENAME_ATT.format(B),
        model_prefix,
    )

    experiments.to_csv(model_prefix + ".csv", index=False)

    print(experiments)
    best_row = experiments.sort_values('metric').iloc[0]
    print ('best model is:', best_row['filename'] )

Loading pre-processed files...
CatFX: []
[25]	valid's avg_non_interferent_log_loss [alpha=1.00]: 0.66121
