# Non-Interferent Training

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import lightgbm
import pickle
import json
import functools
from os import listdir
from os.path import isfile, join

from nilib import *

%load_ext autoreload
%autoreload 2

# Non-Interferent GBDT

In [None]:
def _non_interferent_weights(preds, labels, groups):
    '''
    Compute one weight per instance group from the predictions on its rows.

    preds  : 1-d array of model outputs, one entry per row
    labels : 1-d array of labels aligned with preds
             NOTE(review): the exp(-pred*label) form suggests labels in
             {-1,+1} -- confirm against the loaders in nilib
    groups : sequence holding the number of consecutive rows of each group

    returns an array w with len(groups) entries, where
            w[i] = 1 / max(1 + exp(-pred * label)) over the rows of group i
    '''
    w = np.ones(len(groups))

    offset = 0
    for instance_id, g in enumerate(groups):
        exp_pl = np.exp(- preds[offset:offset+g] * labels[offset:offset+g])
        # the largest term of the group determines its weight
        w[instance_id] = 1.0 / np.max(1.0 + exp_pl)
        offset += g

    return w


def extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                            alpha=1.0, num_trees=1, params=None):
    ''' 
    Train a LightGBM model one tree at a time, re-weighting the instances
    at every round with the predictions obtained on their attacked copies.

    atk_train : DataFrame of training rows. It must have an 'instance_id'
                column; the first column is dropped as the instance id and
                the last column is used as label.
                NOTE(review): group offsets are derived from value_counts()
                sorted by id, so rows are assumed to be stored contiguously
                per instance, in increasing id order, with the row used as
                "un-attacked" instance first -- confirm against the loader.
    atk_valid : DataFrame of validation rows, same layout as atk_train
    cat_fx    : names of the categorical columns; None or empty means that
                no categorical feature is declared ("auto" is passed to
                LightGBM)
    alpha     : forwarded to optimize_non_interferent_log_loss
    num_trees : total number of boosting rounds (the first one is always run)
    params    : dict of LightGBM parameters (None is treated as empty dict)

    returns (lgbm_model, lgbm_info), where lgbm_info collects the
            "avg_binary_log_loss" values of every round for 'train' and 'valid'
    '''

    assert atk_train.shape[1]==atk_valid.shape[1], "Train/Valid Mismatch!"

    # lightgbm.train expects a dict
    if params is None:
        params = {}

    # number of rows of each instance
    train_groups = atk_train['instance_id'].value_counts().sort_index().values
    valid_groups = atk_valid['instance_id'].value_counts().sort_index().values

    # position of the first row of each instance
    original_train_ids = np.cumsum(train_groups[:-1])
    original_train_ids = np.insert(original_train_ids, 0, 0)

    original_valid_ids = np.cumsum(valid_groups[:-1])
    original_valid_ids = np.insert(original_valid_ids, 0, 0)

    # get index of categorical features 
    # (isin() rejects non list-like values, so None must become an empty list)
    cat_names = [] if cat_fx is None else cat_fx
    cat_idx = [int(x) for x in np.where(atk_train.columns.isin(cat_names))[0]]
    print ("CatFX:", atk_train.columns.values[cat_idx])

    # remove instance id
    atk_train = atk_train.iloc[:,1:].values
    atk_valid = atk_valid.iloc[:,1:].values
    # indices shift by one after dropping the first column
    cat_fx = [x - 1 for x in cat_idx] if cat_idx else "auto"

    unatk_train = atk_train[original_train_ids,:]
    unatk_valid = atk_valid[original_valid_ids,:]

    # -------------------------
    # train first iteration
    lgbm_train = lightgbm.Dataset(data=unatk_train[:,:-1], 
                                  label=unatk_train[:,-1],
                                  categorical_feature = cat_fx)

    lgbm_valid = lightgbm.Dataset(data=unatk_valid[:,:-1], 
                                  label=unatk_valid[:,-1],
                                  categorical_feature = cat_fx)

    lgbm_info = {}
    lgbm_model = lightgbm.train(params, lgbm_train, 
                                num_boost_round = 1,
                                fobj  = optimize_log_loss, 
                                feval = avg_log_loss,
                                evals_result = lgbm_info,
                                valid_sets   = [lgbm_train, lgbm_valid], 
                                valid_names  = ['train', 'valid'],
                                verbose_eval=5)

    # -------------------------
    # train other iteration
    # name under which the evaluation results are stored in evals_result
    awesome_hack = "avg_binary_log_loss"

    for t in range (1, num_trees):

        # get predictions on atk instances
        train_preds  = lgbm_model.predict(atk_train[:,:-1])
        train_labels = atk_train[:,-1]
        train_weights = _non_interferent_weights(train_preds, train_labels, train_groups)

        # repeat for validation
        valid_preds  = lgbm_model.predict(atk_valid[:,:-1])
        valid_labels = atk_valid[:,-1]
        valid_weights = _non_interferent_weights(valid_preds, valid_labels, valid_groups)

        # prepare data and train
        lgbm_train = lightgbm.Dataset(data=unatk_train[:,:-1], 
                                      label=unatk_train[:,-1],
                                      weight=train_weights,
                                      categorical_feature = cat_fx)

        lgbm_valid = lightgbm.Dataset(data=unatk_valid[:,:-1], 
                                      label=unatk_valid[:,-1],
                                      weight=valid_weights,
                                      categorical_feature = cat_fx)

        # one more tree on top of the current model; evaluation still uses
        # avg_log_loss, only the objective depends on alpha
        new_lgbm_info = {}
        lgbm_model = lightgbm.train(params, lgbm_train, 
                                    num_boost_round = 1, 
                                    init_model = lgbm_model,
                                    fobj  = functools.partial(optimize_non_interferent_log_loss, alpha=alpha), 
                                    feval = avg_log_loss,
                                    evals_result = new_lgbm_info,
                                    valid_sets   = [lgbm_train, lgbm_valid], 
                                    valid_names  = ['train', 'valid'],
                                    verbose_eval=5)

        # append the results of this round to the overall history
        lgbm_info['train'][awesome_hack] += new_lgbm_info['train'][awesome_hack]
        lgbm_info['valid'][awesome_hack] += new_lgbm_info['valid'][awesome_hack]


    return lgbm_model, lgbm_info

In [None]:
def train_non_interferent(train_file, valid_file, test_file, output_model_file):
    '''
    Grid search over the non-interferent training parameters.

    train_file, valid_file, test_file : paths given to load_atk_train_valid_test
                                        (the test split is loaded but not used here)
    output_model_file                 : prefix of the saved model files; the
                                        parameters and the best round are
                                        appended to build the file name

    returns a DataFrame with one row per trained configuration, with columns
            num_trees, learning_rate, num_leaves, alpha, best_round and
            avg_non_interferent_log_loss (the minimum validation value of
            the "avg_binary_log_loss" history)
    '''
    exp_columns = ['num_trees', 'learning_rate', 'num_leaves', 'alpha', 'best_round', 'avg_non_interferent_log_loss']
    # rows are collected here and the DataFrame is built once at the end
    # (DataFrame.append is not available anymore in pandas >= 2.0)
    exp_rows = []

    # load train/valid/test
    atk_train, atk_valid, atk_test, cat_fx = load_atk_train_valid_test(train_file, valid_file, test_file)

    # key of the validation history used to compare the models
    awesome_hack = "avg_binary_log_loss"

    for num_trees in [200]:
        for alpha in [0.5]: # full grid: [0.25, 0.50, 0.75, 1.00]
            best_model = None
            best_info = None
            best_loss = np.inf

            for learning_rate in [0.1]: # full grid: [0.01, 0.05, 0.1]
                for num_leaves in [24]: # full grid: [8, 16, 24, 32]

                    lgbm_params = { 'learning_rate': learning_rate, 
                                    'num_leaves': num_leaves} 
                    lgbm_model, lgbm_info = extend_non_interf_model(atk_train, atk_valid, cat_fx, 
                                alpha=alpha, num_trees=num_trees, params=lgbm_params)

                    valid_losses = lgbm_info['valid'][awesome_hack]
                    best_valid_iter = np.argmin(valid_losses)

                    # keep the best configuration found so far for this alpha
                    if np.min(valid_losses) < best_loss:
                        best_model = lgbm_model
                        best_info = lgbm_info
                        best_loss = np.min(valid_losses)
                        best_info['num_trees'] = num_trees
                        best_info['learning_rate'] = learning_rate
                        best_info['num_leaves'] = num_leaves

                    # update experimental results
                    exp_rows.append({'num_trees': num_trees, 
                                     'learning_rate':learning_rate,
                                     'num_leaves':num_leaves, 
                                     'alpha': alpha,
                                     'best_round':best_valid_iter+1, 
                                     'avg_non_interferent_log_loss':valid_losses[best_valid_iter]})

                # NOTE(review): this runs once per learning rate, so with a
                # multi-value grid the best-so-far model is written several
                # times -- confirm that this is intended.
                if best_model is None:
                    # no configuration improved on the initial loss
                    # (e.g. the validation loss is NaN): nothing to save
                    print ("No valid model found, nothing saved")
                    continue

                best_valid_iter = np.argmin(best_info['valid'][awesome_hack])

                model_file_name = "{:s}_T{:d}_S{:04d}_L{:d}_A{:03d}_R{:d}.model".format(output_model_file,
                                                                                    best_info['num_trees'],
                                                                                    int(best_info['learning_rate']*1000),
                                                                                    best_info['num_leaves'],
                                                                                    int(alpha * 100),
                                                                                    best_valid_iter + 1
                                                                                   )

                best_model.save_model(model_file_name)
                print ("Model saved to", model_file_name)

    return pd.DataFrame(exp_rows, columns=exp_columns)

In [None]:
# enable/disable
if True:
    # one run per value of B (full sweep: [5, 15, 150, 300])
    for B in [5]:

        # input splits and output locations for this value of B
        train_path = "../data/census/train_B{:d}.csv.bz2".format(B)
        valid_path = "../data/census/valid_B{:d}.csv.bz2".format(B)
        test_path = "../data/census/test_B{:d}.csv.bz2".format(B)
        model_prefix = "../out/models/non_interferent_census_B{:d}".format(B)
        results_path = '../out/models/non_interferent_census_B{:d}.csv'.format(B)

        # train the models and store the table of results next to them
        experiments = train_non_interferent(train_path, valid_path,
                                            test_path, model_prefix)
        experiments.to_csv(results_path, index=False)

        print (experiments)