# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm
from os import listdir
from os.path import isfile, join
from sklearn.metrics import roc_auc_score, f1_score
# Adding the following line, allows Jupyter Notebook to visualize plots
# produced by matplotlib directly below the code cell which generated those.
%matplotlib inline

np.random.seed(73)

# Load 6 different datasets:
-  Training set (original)
-  Training set (_attacked_)
-  Validation set (original)
-  Validation set (_attacked_)
-  Test set (original)
-  Test set (_attacked_)

In [None]:
# Directories holding the input datasets and the serialized models.
DATASETS_PATH = "../data/census"
MODELS_PATH = "../out/models"

# Attacker profile embedded in the perturbed-file names (alternative: "strong").
ATTACKER = "weak"

# Original (unperturbed) splits.
TRAINING_SET = "train_ori.csv.bz2"
VALIDATION_SET = "valid_ori.csv.bz2"
TEST_SET = "test_ori.csv.bz2"

# Perturbed splits generated for the selected attacker.
TRAINING_SET_ATT = "train_{}_att.csv.bz2".format(ATTACKER)
VALIDATION_SET_ATT = "valid_{}_att.csv.bz2".format(ATTACKER)
TEST_SET_ATT = "test_{}_att.csv.bz2".format(ATTACKER)

In [None]:
def load_dataset(path, dataset_filename, sep=","):
    """Read the delimited file ``path/dataset_filename`` into a DataFrame.

    Args:
        path: Directory containing the file (joined with a forward slash).
        dataset_filename: Name of the file inside ``path``.
        sep: Field delimiter forwarded to ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    location = "/".join((path, dataset_filename))
    return pd.read_csv(location, sep=sep)

In [None]:
def infer_categorical_features(dataset):
    """Return the names of the columns that hold textual (categorical) data.

    A column is reported when its dtype is ``object`` or a pandas string
    dtype. Checking only for ``object`` misses columns stored with the
    dedicated string dtype, which would then reach the learner un-encoded.

    Args:
        dataset: ``pandas.DataFrame`` to inspect.

    Returns:
        List of column names, in the order they appear in ``dataset``.
    """
    is_string = pd.api.types.is_string_dtype
    return [
        column
        for column in dataset.columns
        if dataset[column].dtype == 'object' or is_string(dataset[column].dtype)
    ]
            
def label_encode(dataset, categorical_features):
    """Return a copy of ``dataset`` with the given columns replaced by integer codes.

    Each listed column is converted to a pandas categorical and replaced by
    its ``cat.codes``; the input frame is left untouched.

    NOTE(review): the codes are derived from the values present in
    ``dataset`` only, so encoding two frames separately yields matching codes
    only if both contain the same set of categories.

    Args:
        dataset: ``pandas.DataFrame`` to encode.
        categorical_features: Container of column names to encode.

    Returns:
        The encoded copy.
    """
    encoded = dataset.copy()
    targets = [name for name in encoded.columns if name in categorical_features]
    for name in targets:
        encoded[name] = encoded[name].astype('category').cat.codes
    return encoded

In [None]:
def one_hot_encode(dataset, label):
    """One-hot encode ``dataset`` and move the ``label`` column to the last position.

    Args:
        dataset: ``pandas.DataFrame`` to encode with ``pandas.get_dummies``.
        label: Name of the label column; it must still exist after encoding.

    Returns:
        The encoded frame with ``label`` as its final column.
    """
    dummies = pd.get_dummies(dataset)
    ordered = dummies.columns.tolist()
    # Pull the label out of its current slot and re-append it at the end.
    ordered.append(ordered.pop(ordered.index(label)))
    dummies = dummies.loc[:, ordered]
    dummies.columns = ordered

    return dummies

In [None]:
# Training split: original file and its attacked counterpart.
TRAIN = load_dataset(DATASETS_PATH, TRAINING_SET)
TRAIN_ATT = load_dataset(DATASETS_PATH, TRAINING_SET_ATT)

# Validation split: original file and its attacked counterpart.
VALID = load_dataset(DATASETS_PATH, VALIDATION_SET)
VALID_ATT = load_dataset(DATASETS_PATH, VALIDATION_SET_ATT)

# Test split: original file and its attacked counterpart.
TEST = load_dataset(DATASETS_PATH, TEST_SET)
TEST_ATT = load_dataset(DATASETS_PATH, TEST_SET_ATT)

# Compute group lengths

In [None]:
# Number of rows per 'instance_id' in each attacked split, ordered by ascending id.
# These arrays are later used as group sizes when walking the attacked frames.
# NOTE(review): this assumes the rows of each attacked file are stored
# contiguously per instance_id and in ascending id order — confirm against the files.
TRAIN_ATT_OFFSETS = TRAIN_ATT['instance_id'].value_counts().sort_index().values
VALID_ATT_OFFSETS = VALID_ATT['instance_id'].value_counts().sort_index().values
TEST_ATT_OFFSETS = TEST_ATT['instance_id'].value_counts().sort_index().values

# Infer _categorical_ features

In [None]:
# Categorical columns are inferred once, from the original training split.
CATEGORICAL_FEATURES = infer_categorical_features(TRAIN)
print("List of categorical features: [{}]".format(", ".join(CATEGORICAL_FEATURES)))

# Transform _categorical_ features to _numeric_ (label encoding)

In [None]:
# Replace the categorical columns of every split with integer codes.
# For the attacked splits the first column is dropped (.iloc[:,1:]) before
# encoding — presumably the 'instance_id' column used above; verify.
# NOTE(review): every split is encoded independently, so the codes depend on
# the categories present in that split alone. Confirm that all splits contain
# the same category sets, otherwise the same value may map to different codes.
TRAIN = label_encode(TRAIN, set(CATEGORICAL_FEATURES))
TRAIN_ATT = label_encode(TRAIN_ATT.iloc[:,1:], set(CATEGORICAL_FEATURES))

VALID = label_encode(VALID, set(CATEGORICAL_FEATURES))
VALID_ATT = label_encode(VALID_ATT.iloc[:,1:], set(CATEGORICAL_FEATURES))

TEST = label_encode(TEST, set(CATEGORICAL_FEATURES))
TEST_ATT = label_encode(TEST_ATT.iloc[:,1:], set(CATEGORICAL_FEATURES))

# Transform _categorical_ features to _numeric_ (one-hot encoding)

In [None]:
# TRAIN = one_hot_encode(TRAIN, "income_greater_than_50k")
# TRAIN_ATT = one_hot_encode(TRAIN_ATT.iloc[:,1:], "income_greater_than_50k")

# VALID = one_hot_encode(VALID, "income_greater_than_50k")
# VALID_ATT = one_hot_encode(VALID_ATT.iloc[:,1:], "income_greater_than_50k")

# TEST = one_hot_encode(TEST, "income_greater_than_50k")
# TEST_ATT = one_hot_encode(TEST_ATT.iloc[:,1:], "income_greater_than_50k")

In [None]:
# Print the (rows, columns) shape of each split, one per line.
print(*(split.shape for split in (TRAIN, TRAIN_ATT, VALID, VALID_ATT, TEST, TEST_ATT)),
      sep="\n")

# Hyperparameters used for _standard_ and _baseline_ learning

-  Training is done by optimizing (i.e., minimizing) standard **binary log loss** (<code>fobj=optimize_binary_logloss</code>)
-  Evaluation is measured using standard **binary log loss** (<code>feval=avg_binary_logloss</code>)

In [None]:
# Please, refer to https://slundberg.github.io/shap/notebooks/Census%20income%20classification%20with%20LightGBM.html for any further detail
# or
# https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc
std_params = dict(
    max_bin=511,
    learning_rate=0.05,
    boosting_type="gbdt",  # alternative: "rf"
    objective="regression_l2",  # alternative: "binary"
    # No built-in metric: the training functions pass their own evaluation
    # function (feval) and objective (fobj) instead of the library defaults.
    metric=["None"],
    num_leaves=15,
    verbose=1,
    min_data_in_leaf=20,
    bagging_freq=1,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    boost_from_average=True,
)

# Hyperparameters used for _non-interferent_ learning

-  Training is done by optimizing (i.e., minimizing) our custom **binary log loss under max attack** (<code>fobj=optimize_binary_logloss_under_max_attack</code>)
-  Evaluation is measured using our custom **binary log loss under max attack** (<code>feval=avg_binary_logloss_under_max_attack</code>)

In [None]:
# Please, refer to https://slundberg.github.io/shap/notebooks/Census%20income%20classification%20with%20LightGBM.html for any further detail
# or
# https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc
non_interferent_params = dict(
    max_bin=511,
    learning_rate=0.05,
    boosting_type="gbdt",  # alternative: "rf"
    objective="regression_l2",
    # No built-in metric: the custom objective/evaluation functions are
    # supplied by the training function through fobj/feval.
    metric=["None"],
    num_leaves=15,
    verbose=1,
    min_data_in_leaf=20,
    bagging_freq=1,
    bagging_fraction=0.8,
    feature_fraction=0.8,
    boost_from_average=True,
)

# Common Hyperparameters

In [None]:
# Grid of boosting-round budgets to train with: MIN..MAX (inclusive) in steps of STEP.
MIN_BOOST_ROUNDS = 200
MAX_BOOST_ROUNDS = 200
STEP_BOOST_ROUNDS = 50
BOOST_ROUNDS = list(range(MIN_BOOST_ROUNDS, MAX_BOOST_ROUNDS + 1, STEP_BOOST_ROUNDS))

# Standard objective function

The following function, called <code>optimize_binary_logloss</code>, is the one optimized (i.e., minimized) for learning _standard_ and _baseline_ approaches. More specifically, this is the standard binary log loss which is used to train any _standard_ or _baseline_ model.

# <code>fobj=optimize_binary_logloss</code>

In [None]:
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array

# To be used with a regression task
def optimize_binary_logloss(preds, train_data):
    """Gradient and Hessian of the average binary log loss ``log(1 + exp(-y * p))``.

    The derivatives are evaluated through ``exp(-|y * p|)`` only, which never
    overflows. The direct form ``exp(y * p)`` overflows for large margins and
    turns the Hessian into ``inf / inf = nan``.

    Args:
        preds: Array of raw scores, one per row.
        train_data: Object exposing ``get_label()``; the formulas assume
            labels in {-1, +1}.

    Returns:
        Tuple ``(grads, hess)`` of arrays, both scaled by ``1 / len(preds)``
        so that the average (rather than the sum) of the loss is optimized.
    """
    labels = train_data.get_label()
    margins = preds * labels
    decay = np.exp(-np.abs(margins))
    # sigmoid(-m) = 1 / (1 + exp(m)), rewritten in terms of exp(-|m|).
    neg_sigmoid = np.where(margins >= 0, decay, 1.0) / (1.0 + decay)
    grads = -labels * neg_sigmoid
    # sigmoid(m) * sigmoid(-m) is symmetric in m, hence the single expression.
    hess = labels**2 * decay / (1.0 + decay)**2

    # this is to optimize average logloss
    norm = 1.0/len(preds)
    grads *= norm
    hess *= norm

    return grads, hess

# Custom objective function

In addition to the standard binary log loss used to train a model, we introduce another custom objective function called <code>optimize_binary_logloss_under_max_attack</code>. This function is used to train a _non-interferent_ model; in other words, non-interferent models are learned by optimizing (i.e., minimizing) the following function.

$$
\frac{1}{|\mathcal{D}|} \cdot \sum_{(\mathbf{x},y) \in \mathcal{D}} \log  \left( \sum_{\mathbf{x}' \in \mathit{MaxAtk}({\mathbf{x}},{A})} e^{\ell(h(\mathbf{x}'), y)} \right).
$$

# <code>fobj=optimize_binary_logloss_under_max_attack</code>

In [None]:
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array

def optimize_binary_logloss_under_max_attack(preds, train_data):
    """Gradient and Hessian of the grouped ("under max attack") log loss.

    Rows are processed in consecutive groups whose sizes come from
    ``train_data.get_group()``. Within a group, each row's weight is
    ``exp(-y * p)`` divided by the group total of ``1 + exp(-y * p)``.

    Args:
        preds: Array of raw scores, one per row.
        train_data: Object exposing ``get_label()`` and ``get_group()``.

    Returns:
        Tuple ``(grads, hess)`` of float64 arrays, scaled by one over the
        number of groups.
    """
    labels = train_data.get_label()
    group_sizes = train_data.get_group()

    grads = np.zeros_like(labels, dtype=np.float64)
    hess = np.zeros_like(grads)

    # Average over groups, not over rows.
    scale = 1.0 / float(len(group_sizes))

    start = 0
    for size in group_sizes:
        window = slice(start, start + size)
        group_labels = labels[window]

        exp_margins = np.exp(- preds[window] * group_labels)
        share = (1.0 / np.sum(1.0 + exp_margins)) * exp_margins

        grads[window] = scale * share * (- group_labels)
        hess[window] = scale * share * (1.0 - share)

        start += size

    return grads, hess

# Standard evaluation metric

The following function is the one used for evaluating the quality of the learned model (either _standard_, _baseline_, or _non-interferent_). This is the standard <code>avg_binary_logloss</code>.

# <code>feval=avg_binary_logloss</code>

In [None]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_binary_logloss(preds, train_data):
    """Average binary log loss ``log(1 + exp(-y * p))`` over all rows.

    ``np.logaddexp(0, x)`` computes ``log(1 + exp(x))`` without overflowing,
    so a badly misclassified row contributes a large finite loss instead of
    ``inf``.

    Args:
        preds: Array of raw scores, one per row.
        train_data: Object exposing ``get_label()``; the formula assumes
            labels in {-1, +1}.

    Returns:
        Tuple ``(name, value, is_higher_better)`` as expected from a custom
        evaluation function: ``('binary_logloss', mean loss, False)``.
    """
    labels = train_data.get_label()
    losses = np.logaddexp(0.0, -preds * labels)
    avg_loss = np.mean(losses)

    return 'binary_logloss', avg_loss, False

In [None]:
def eval_binary_logloss(model, boost_round, test, test_groups=None):
    """Average binary log loss of ``model`` on ``test``.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test: DataFrame whose last column is the label and whose other
            columns are the features.
        test_groups: Unused; accepted so all metrics share one signature.

    Returns:
        The loss value computed by ``avg_binary_logloss``.
    """
    features = test.iloc[:,:-1].values
    targets = test.iloc[:,-1].values
    lgbm_test = lightgbm.Dataset(data=features, label=targets, free_raw_data=False)

    scores = model.predict(features, num_iteration=boost_round)
    _, loss, _ = avg_binary_logloss(scores, lgbm_test)
    return loss

# Custom evaluation metric

In addition to the standard <code>avg_binary_logloss</code>, we provide an additional evaluation metric called <code>avg_binary_logloss_under_max_attack</code>, which is used for measuring the validity of any model (either _standard_, _baseline_, or _non-interferent_). More precisely, <code>avg_binary_logloss_under_max_attack</code> is the binary logloss modified to operate on groups of perturbed instances.

# <code>feval=avg_binary_logloss_under_max_attack</code>

In [None]:
def logistic(p):
    """Map a raw score (scalar or array) to (0, 1) with the logistic function."""
    denominator = 1.0 + np.exp(-p)
    return 1.0 / denominator

In [None]:
# Our custom metric

def binary_logloss(pred, true_label):
    """Binary log loss ``log(1 + exp(-pred * true_label))`` of a raw score.

    Computed with ``np.logaddexp`` so that strongly wrong scores yield a
    large finite loss rather than overflowing to ``inf``.

    Args:
        pred: Raw score (scalar or array).
        true_label: Label; the formula assumes values in {-1, +1}.

    Returns:
        The loss, with the same shape as the broadcast inputs.
    """
    return np.logaddexp(0.0, -pred * true_label)

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_binary_logloss_under_max_attack(preds, train_data):
    """Mean, over groups, of the largest binary log loss inside each group.

    Rows are consumed in consecutive groups whose sizes come from
    ``train_data.get_group()``.

    Args:
        preds: Array of raw scores, one per row.
        train_data: Object exposing ``get_label()`` and ``get_group()``.

    Returns:
        Tuple ``('binary_logloss_under_max_attack', value, False)``.
    """
    labels = train_data.get_label()
    group_sizes = train_data.get_group()

    worst_losses = []
    start = 0
    for size in group_sizes:
        stop = start + size
        # Keep only the worst (largest) loss among the rows of this group.
        worst_losses.append(max(map(binary_logloss, preds[start:stop], labels[start:stop])))
        start = stop

    return 'binary_logloss_under_max_attack', np.mean(worst_losses), False

In [None]:
def eval_binary_logloss_under_max_attack(model, boost_round, test, test_groups=None):
    """Binary log loss under max attack of ``model`` on a grouped test frame.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test: DataFrame whose last column is the label and whose other
            columns are the features.
        test_groups: Sizes of the consecutive row groups of ``test``.

    Returns:
        The value computed by ``avg_binary_logloss_under_max_attack``.
    """
    features = test.iloc[:,:-1].values
    targets = test.iloc[:,-1].values
    lgbm_test = lightgbm.Dataset(data=features,
                                 label=targets,
                                 group=test_groups,
                                 free_raw_data=False)

    scores = model.predict(features, num_iteration=boost_round)
    _, loss, _ = avg_binary_logloss_under_max_attack(scores, lgbm_test)
    return loss

# Additional validity measures

In addition to the evaluation metrics defined above (used for training), we also consider the following **4** measures of validity to compare the performance of each learned model:

-  <code>eval_binary_err_rate</code>: This is the traditional binary error rate (1-accuracy);
-  <code>eval_binary_err_rate_under_max_attack</code>: This is the binary error rate modified to operate on groups of perturbed instances.
-  <code>eval_roc_auc</code>: This is the classical ROC AUC score;
-  <code>eval_roc_auc_under_max_attack</code>: This is the ROC AUC score modified to operate on groups of perturbed instances.

Again, note that those are **not** metrics used at training time (i.e., they do not define any <code>feval</code>), rather they are used to assess the (offline) quality of each learned model.

# <code>eval_binary_err_rate</code>

In [None]:
def eval_binary_err_rate(model, boost_round, test_set, test_groups=None):
    """Fraction of rows whose predicted class differs from the label.

    A raw score greater than 0 is read as class +1, anything else as -1.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Unused; accepted so all metrics share one signature.

    Returns:
        Number of misclassified rows divided by the number of rows.
    """
    features = test_set.iloc[:,:-1].values
    truth = test_set.iloc[:,-1].values
    scores = model.predict(features, num_iteration=boost_round)
    signs = [1 if score > 0 else -1 for score in scores]

    mistakes = sum(1 for guess, actual in zip(signs, truth) if guess != actual)
    return mistakes/len(signs)

# <code>eval_binary_err_rate_under_max_attack</code>

In [None]:
def eval_binary_err_rate_under_max_attack(model, boost_round, test_set, test_groups=None):
    """Fraction of groups containing at least one misclassified row.

    Rows are consumed in consecutive groups of the given sizes; the label of
    a group is the label of its first row. A raw score greater than 0 is read
    as class +1, anything else as -1.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Sizes of the consecutive row groups of ``test_set``.

    Returns:
        Number of groups with an error divided by the number of groups.
    """
    features = test_set.iloc[:,:-1].values
    truth = test_set.iloc[:,-1].values
    scores = model.predict(features, num_iteration=boost_round)
    signs = [1 if score > 0 else -1 for score in scores]

    failed_groups = 0
    start = 0
    for size in test_groups:
        expected = truth[start]
        # One wrong row is enough to count the whole group as an error.
        if any(sign != expected for sign in signs[start:start+size]):
            failed_groups += 1
        start += size

    return failed_groups/len(test_groups)

# <code>eval_roc_auc</code>

In [None]:
def eval_roc_auc(model, boost_round, test_set, test_groups=None):
    """ROC AUC of the logistic-transformed scores of ``model`` on ``test_set``.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Unused; accepted so all metrics share one signature.

    Returns:
        The value returned by ``roc_auc_score``.
    """
    features = test_set.iloc[:,:-1].values
    truth = test_set.iloc[:,-1].values
    raw_scores = model.predict(features, num_iteration=boost_round)

    return roc_auc_score(truth, logistic(raw_scores))

# <code>eval_roc_auc_under_max_attack</code>

In [None]:
def eval_roc_auc_under_max_attack(model, boost_round, test_set, test_groups=None):
    """ROC AUC computed on the worst prediction of every group.

    Rows are consumed in consecutive groups of the given sizes; the label of
    a group is the label of its first row. For each group the kept prediction
    is the logistic-transformed score farthest (in absolute difference) from
    that label.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Sizes of the consecutive row groups of ``test_set``.

    Returns:
        The value returned by ``roc_auc_score`` on one (label, worst
        prediction) pair per group.
    """
    features = test_set.iloc[:,:-1].values
    truth = test_set.iloc[:,-1].values
    probabilities = logistic(model.predict(features, num_iteration=boost_round))

    group_labels = []
    group_worst = []
    start = 0
    for size in test_groups:
        anchor = truth[start]
        group_labels.append(anchor)
        candidates = probabilities[start:start+size]
        farthest = np.argmax(np.abs(candidates - anchor))
        group_worst.append(candidates[farthest])
        start += size

    return roc_auc_score(group_labels, group_worst)

# <code>eval_f1</code>

In [None]:
def eval_f1(model, boost_round, test_set, test_groups=None):
    """F1 score of the class predictions of ``model`` on ``test_set``.

    The class is taken from the sign of the raw score (``> 0`` means +1,
    otherwise -1), the same rule used by ``eval_binary_err_rate``. Comparing
    the logistic-transformed score against 0 is wrong: that value is always
    positive, so every row would be predicted as +1.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Unused; accepted so all metrics share one signature.

    Returns:
        The value returned by ``f1_score``.
    """
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    margins = model.predict(X, num_iteration=boost_round)
    # margin > 0 is equivalent to logistic(margin) > 0.5.
    predictions = [1 if m > 0 else -1 for m in margins]

    return f1_score(labels, predictions)

# <code>eval_f1_under_max_attack</code>

In [None]:
def eval_f1_under_max_attack(model, boost_round, test_set, test_groups=None):
    """F1 score computed on the worst class prediction of every group.

    The class is taken from the sign of the raw score (``> 0`` means +1,
    otherwise -1). Comparing the logistic-transformed score against 0 is
    wrong: that value is always positive, so every row would be predicted
    as +1.

    Rows are consumed in consecutive groups of the given sizes; the label of
    a group is the label of its first row. For each group the kept prediction
    is the one farthest from that label, i.e. a wrong class whenever the
    group contains one.

    Args:
        model: Trained booster exposing ``predict(X, num_iteration=...)``.
        boost_round: Value forwarded as ``num_iteration`` to ``predict``.
        test_set: DataFrame whose last column is the label.
        test_groups: Sizes of the consecutive row groups of ``test_set``.

    Returns:
        The value returned by ``f1_score`` on one (label, worst prediction)
        pair per group.
    """
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    margins = np.asarray(model.predict(X, num_iteration=boost_round))
    # margin > 0 is equivalent to logistic(margin) > 0.5.
    predictions = np.where(margins > 0, 1, -1)

    offset = 0
    true_labels = []
    worst_predictions = []

    for g in test_groups:
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        prediction_distances = np.abs(predictions_att - true_label)
        worst_predictions.append(predictions_att[np.argmax(prediction_distances)])

        offset += g

    return f1_score(true_labels, worst_predictions)

# Save/Load model

In [None]:
def save_model(model_filename, model):
    """Pickle ``model`` into the file ``model_filename`` (opened in 'wb' mode)."""
    with open(model_filename, mode='wb') as sink:
        pickle.dump(model, sink)

In [None]:
def load_model(model_filename):
    """Unpickle and return the object stored in ``model_filename``.

    NOTE: unpickling can execute arbitrary code; only load files that were
    produced by a trusted source (e.g. by ``save_model`` in this notebook).
    """
    with open(model_filename, mode='rb') as source:
        restored = pickle.load(source)
    return restored

# Evaluation metrics

In [None]:
# Metrics applied when evaluating on an ungrouped test frame.
EVAL_METRICS = [eval_binary_logloss, 
                eval_binary_err_rate, 
                eval_roc_auc
               ]

# Metrics applied when group sizes are supplied (grouped, attacked test frame).
EVAL_METRICS_UNDER_MAX_ATTACK = [eval_binary_logloss_under_max_attack, 
                                 eval_binary_err_rate_under_max_attack, 
                                 eval_roc_auc_under_max_attack
                                ]

# Evaluate each model w.r.t. _all_ evaluation metrics

In [None]:
def eval_learned_model(model, boost_round, eval_metric, test, test_groups=None):
    """Apply a single metric function to ``model`` and return its result.

    ``eval_metric`` is called as
    ``eval_metric(model, boost_round, test, test_groups=test_groups)``.
    """
    score = eval_metric(model, boost_round, test, test_groups=test_groups)
    return score

In [None]:
def eval_learned_models(model, model_type, boost_round, test, test_groups=None):
    """Evaluate ``model`` with every applicable metric and collect one result row.

    The "under max attack" metric list is used when ``test_groups`` is given,
    the standard list otherwise. Each result is also printed.

    Args:
        model: Trained booster to evaluate.
        model_type: Label written in the 'Model' column and in the log lines.
        boost_round: Number of trees, forwarded to every metric.
        test: Test DataFrame forwarded to every metric.
        test_groups: Optional group sizes of ``test``.

    Returns:
        A one-row DataFrame with columns 'Model', 'N. of Trees' and one
        column per metric.
    """
    under_attack = test_groups is not None
    eval_metrics = EVAL_METRICS_UNDER_MAX_ATTACK if under_attack else EVAL_METRICS
    d_test = "D_test_att" if under_attack else "D_test"

    # Column title of a metric: function name without 'eval_', title-cased.
    titles = [m.__name__.replace('eval_','').replace('_',' ').title() for m in eval_metrics]

    df = pd.DataFrame(columns=['Model','N. of Trees'] + titles)
    df.loc[0] = [model_type, boost_round] + [None] * len(eval_metrics)

    for eval_metric, title in zip(eval_metrics, titles):
        res = eval_learned_model(model, boost_round, eval_metric, test, test_groups=test_groups)
        print("{} learning - {} on {} [boost rounds={}] = {:.5f}"
                  .format(model_type, eval_metric.__name__, d_test, boost_round, res))
        df[title] = res
    print("******************************************************************************************************")

    return df

# TRAINING

# 1. _Standard_ Learning: Models are trained on the original dataset $\mathcal{D}_{train}$ using _standard_ binary log loss

-  This model is trained on the original training set by minimizing standard **binary log loss** (i.e., <code>fobj=optimize_binary_logloss</code>)

-  Its performance is assessed by means of <code>feval=avg_binary_logloss</code> (i.e., the metric optimized during training) both on training and validation set.

-  The model leading to the lowest **binary log loss** on the validation set is the one returned.

In [None]:
def std_learning(train, 
                 valid,
                 params=std_params, 
                 fobj=optimize_binary_logloss, 
                 feval=avg_binary_logloss,
                 num_boost_round=MIN_BOOST_ROUNDS):
    """Train a booster on ``train`` and monitor it on ``valid``.

    Both frames must have the label in the last column and the features in
    all the other columns.

    NOTE(review): ``fobj``, ``feval``, ``evals_result``,
    ``early_stopping_rounds`` and ``verbose_eval`` are keyword arguments of
    the older ``lightgbm.train`` API — confirm the installed version
    accepts them.

    Args:
        train: Training DataFrame.
        valid: Validation DataFrame.
        params: LightGBM parameter dictionary.
        fobj: Custom objective returning (grad, hess).
        feval: Custom evaluation function.
        num_boost_round: Maximum number of boosting rounds.

    Returns:
        Tuple ``(booster, evals_result)`` where ``evals_result`` is the dict
        filled in by ``lightgbm.train``.
    """
    train_features, train_labels = train.iloc[:,:-1].values, train.iloc[:,-1].values
    valid_features, valid_labels = valid.iloc[:,:-1].values, valid.iloc[:,-1].values

    lgbm_train = lightgbm.Dataset(data=train_features, label=train_labels)
    lgbm_valid = lightgbm.Dataset(data=valid_features,
                                  label=valid_labels,
                                  reference=lgbm_train,
                                  free_raw_data=False)

    history = {}
    booster = lightgbm.train(params=params,
                             train_set=lgbm_train,
                             num_boost_round=num_boost_round,
                             valid_sets=[lgbm_valid],
                             valid_names=["validation"],
                             fobj=fobj,
                             feval=feval,
                             evals_result=history,
                             early_stopping_rounds=50,
                             verbose_eval=20)

    return booster, history

In [None]:
def std_learning_runs(train, valid, boost_rounds=BOOST_ROUNDS):
    """Train one standard model per boosting-round budget.

    Args:
        train: Training DataFrame passed to ``std_learning``.
        valid: Validation DataFrame passed to ``std_learning``.
        boost_rounds: Iterable of boosting-round budgets.

    Returns:
        Dict with 'type' ('Standard') and 'run', the latter mapping each
        budget to ``{'model': booster, 'results': evals_result}``.
    """
    std_runs = {'type': 'Standard', 'run': {}}
    for br in boost_rounds:
        print("***** {} learning - Optimizing standard binary log loss on the original D_train [boost rounds={}] *****"
              .format(std_runs['type'], br))
        std_model, std_res = std_learning(train, valid, num_boost_round=br)
        std_runs['run'][br] = {'model': std_model, 'results': std_res}

    return std_runs

In [None]:
%%time
# Train the standard models on the original training/validation splits.
std_runs = std_learning_runs(TRAIN, VALID, boost_rounds=BOOST_ROUNDS)

In [None]:
# Pickle path of the standard model: <models dir>/std_<attacker>_<max rounds>.pkl
STD_MODEL_FILENAME = "{}/std_{}_{}.pkl".format(MODELS_PATH, ATTACKER, MAX_BOOST_ROUNDS)

# Persist _standard_ model

In [None]:
# Persist the standard model trained with the largest boosting-round budget.
save_model(STD_MODEL_FILENAME, std_runs['run'][MAX_BOOST_ROUNDS]['model'])

# 2. _Baseline_: Learning models trained on the attacked $\mathcal{D}_{train\_att}$ using _standard_ binary log loss

-  This model is trained on the attacked training set $\mathcal{D}_{train\_att}$ (each original instance plus a selection of its perturbed versions) by minimizing standard **binary log loss** (i.e., <code>fobj=optimize_binary_logloss</code>)

-  Its performance is assessed by means of <code>feval=avg_binary_logloss</code> (i.e., the metric optimized during training) both on training and validation set.

-  The model leading to the lowest **binary log loss** on the validation set is the one returned.

In [None]:
def random_select_instances(groups, p_attacked_inst, n_attacks_per_inst):
    """Pick row positions from consecutive groups of rows.

    The first row of every group is always selected. When
    ``n_attacks_per_inst`` is positive, further rows of the group are added:

    * if the group has more than ``n_attacks_per_inst`` rows, then with
      probability ``p_attacked_inst`` exactly ``n_attacks_per_inst`` of its
      remaining rows are drawn at random without replacement;
    * otherwise all the remaining rows of the group are added.

    NOTE(review): in the second case ``p_attacked_inst`` is not consulted —
    confirm this is intended.

    Args:
        groups: Iterable of group sizes, in row order.
        p_attacked_inst: Probability of adding extra rows for a large group.
        n_attacks_per_inst: Number of extra rows to draw per group.

    Returns:
        List of selected row positions, in ascending order.
    """
    chosen = []
    start = 0
    for size in groups:
        chosen.append(start)
        if n_attacks_per_inst > 0:
            if size <= n_attacks_per_inst:
                # Small group: keep every remaining row.
                chosen.extend(range(start+1, start+size))
            elif np.random.random_sample() <= p_attacked_inst:
                # Large group: sample among the rows after the first one.
                picks = np.random.choice(size-1, n_attacks_per_inst, replace=False) + start + 1
                chosen.extend(sorted(picks))
        start += size

    return chosen

In [None]:
def baseline_learning(train, 
                      valid, 
                      p_attacked_inst, 
                      n_attacks_per_inst, 
                      params=std_params, 
                      fobj=optimize_binary_logloss, 
                      feval=avg_binary_logloss,
                      num_boost_round=MIN_BOOST_ROUNDS, 
                      train_group=TRAIN_ATT_OFFSETS):
    """Train a baseline booster on a random selection of the rows of ``train``.

    Rows are picked with ``random_select_instances`` and looked up by index
    label (``train.loc``). Both frames must have the label in the last
    column and the features in all the other columns.

    NOTE(review): ``fobj``, ``feval``, ``evals_result``,
    ``early_stopping_rounds`` and ``verbose_eval`` are keyword arguments of
    the older ``lightgbm.train`` API — confirm the installed version
    accepts them.

    Args:
        train: Grouped training DataFrame.
        valid: Validation DataFrame.
        p_attacked_inst: Forwarded to ``random_select_instances``.
        n_attacks_per_inst: Forwarded to ``random_select_instances``.
        params: LightGBM parameter dictionary.
        fobj: Custom objective returning (grad, hess).
        feval: Custom evaluation function.
        num_boost_round: Maximum number of boosting rounds.
        train_group: Sizes of the consecutive row groups of ``train``.

    Returns:
        Tuple ``(booster, evals_result)``.
    """
    selected_instances = random_select_instances(train_group, p_attacked_inst, n_attacks_per_inst)
    subset = train.loc[selected_instances]

    train_features, train_labels = subset.iloc[:,:-1].values, subset.iloc[:,-1].values
    valid_features, valid_labels = valid.iloc[:,:-1].values, valid.iloc[:,-1].values

    lgbm_train = lightgbm.Dataset(data=train_features, label=train_labels)
    lgbm_valid = lightgbm.Dataset(data=valid_features,
                                  label=valid_labels,
                                  reference=lgbm_train,
                                  free_raw_data=False)

    history = {}
    booster = lightgbm.train(params=params,
                             train_set=lgbm_train,
                             num_boost_round=num_boost_round,
                             valid_sets=[lgbm_valid],
                             valid_names=["validation"],
                             fobj=fobj,
                             feval=feval,
                             evals_result=history,
                             early_stopping_rounds=50,
                             verbose_eval=20)

    return booster, history

In [None]:
def baseline_learning_runs(train, 
                           valid, 
                           p_attacked_inst=1.0,
                           n_attacks_per_inst=0,
                           boost_rounds=BOOST_ROUNDS
                          ):
    """Train one baseline model per boosting-round budget.

    Args:
        train: Training DataFrame passed to ``baseline_learning``.
        valid: Validation DataFrame passed to ``baseline_learning``.
        p_attacked_inst: Forwarded to ``baseline_learning``.
        n_attacks_per_inst: Forwarded to ``baseline_learning``.
        boost_rounds: Iterable of boosting-round budgets.

    Returns:
        Dict with 'type' ('Baseline') and 'run', the latter mapping each
        budget to ``{'model': booster, 'results': evals_result}``.
    """
    baseline_runs = {'type': 'Baseline', 'run': {}}

    for br in boost_rounds:
        print("***** {} learning - Optimizing standard binary log loss on D_train_att [boost rounds={}; p_attacked_inst={:.2f}; n_attacks_per_inst={}] *****"
              .format(baseline_runs['type'], br, p_attacked_inst, n_attacks_per_inst))
        baseline_model, baseline_res = baseline_learning(train,
                                                         valid,
                                                         p_attacked_inst,
                                                         n_attacks_per_inst,
                                                         num_boost_round=br)
        baseline_runs['run'][br] = {'model': baseline_model, 'results': baseline_res}

    return baseline_runs

In [None]:
# Grid of baseline settings: every (probability, attacks-per-instance) pair is trained.
# With the group sizes used here, 1000 presumably exceeds every group size,
# i.e. "use all attacks" — verify against the data.
P_ATTACKED_INSTANCE = [1.0]
N_ATTACKS_PER_INSTANCE = [1, 1000]

# Train and Persist _baseline_ model

In [None]:
%%time
# Train one baseline per (p_attacked_inst, n_attacks_per_inst) setting and
# pickle the model obtained with the largest boosting-round budget.
for pa in P_ATTACKED_INSTANCE:
    for na in N_ATTACKS_PER_INSTANCE:
        baseline_runs = baseline_learning_runs(TRAIN_ATT, VALID_ATT, 
                                               p_attacked_inst=pa, 
                                               n_attacks_per_inst=na,
                                               boost_rounds=BOOST_ROUNDS
                                              )

        # File name encodes the attacker, the round budget, p (as a percentage) and n.
        BASELINE_MODEL_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-{}_n-{}.pkl".format(int(pa*100), na)
        save_model(BASELINE_MODEL_FILENAME, baseline_runs['run'][MAX_BOOST_ROUNDS]['model'])

# 3. _Non-Interferent_: Learn _non-interferent_ models trained on the attacked $\mathcal{D}_{train\_att}$ using custom cost function (binary log loss under max attack)

-  This model is trained on the attacked training set $\mathcal{D}_{train\_att}$, whose rows are grouped by <code>instance_id</code>, by minimizing our custom objective function, i.e., a modified **binary log loss** called <code>optimize_binary_logloss_under_max_attack</code>.

-  Its performance is assessed by means of <code>avg_binary_logloss_under_max_attack</code> both on training and validation set.

-  The model leading to the lowest <code>avg_binary_logloss_under_max_attack</code> on the validation set is the one returned.

In [None]:
def non_interferent_learning(train, 
                             valid, 
                             params=non_interferent_params, 
                             num_boost_round=MIN_BOOST_ROUNDS, 
                             fobj=optimize_binary_logloss_under_max_attack, 
                             feval=avg_binary_logloss_under_max_attack,
                             train_group=TRAIN_ATT_OFFSETS,
                             valid_group=VALID_ATT_OFFSETS):
    """Train a booster on grouped data with the "under max attack" objective.

    Both frames must have the label in the last column and the features in
    all the other columns; the group sizes are attached to the LightGBM
    datasets so the custom objective/metric can read them back.

    NOTE(review): ``fobj``, ``feval``, ``evals_result``,
    ``early_stopping_rounds`` and ``verbose_eval`` are keyword arguments of
    the older ``lightgbm.train`` API — confirm the installed version
    accepts them.

    Args:
        train: Grouped training DataFrame.
        valid: Grouped validation DataFrame.
        params: LightGBM parameter dictionary.
        num_boost_round: Maximum number of boosting rounds.
        fobj: Custom objective returning (grad, hess).
        feval: Custom evaluation function.
        train_group: Sizes of the consecutive row groups of ``train``.
        valid_group: Sizes of the consecutive row groups of ``valid``.

    Returns:
        Tuple ``(booster, evals_result)``.
    """
    train_features, train_labels = train.iloc[:,:-1].values, train.iloc[:,-1].values
    valid_features, valid_labels = valid.iloc[:,:-1].values, valid.iloc[:,-1].values

    lgbm_train = lightgbm.Dataset(data=train_features,
                                  label=train_labels,
                                  group=train_group)
    lgbm_valid = lightgbm.Dataset(data=valid_features,
                                  label=valid_labels,
                                  group=valid_group,
                                  reference=lgbm_train,
                                  free_raw_data=False)

    history = {}
    booster = lightgbm.train(params=params,
                             train_set=lgbm_train,
                             num_boost_round=num_boost_round,
                             valid_sets=[lgbm_valid],
                             valid_names=["validation"],
                             evals_result=history,
                             fobj=fobj,
                             feval=feval,
                             early_stopping_rounds=50,
                             verbose_eval=20)

    return booster, history

In [None]:
def non_interferent_learning_runs(train, 
                                  valid, 
                                  boost_rounds=BOOST_ROUNDS
                                 ):
    """Train one non-interferent model per boosting-round budget.

    Args:
        train: Training DataFrame passed to ``non_interferent_learning``.
        valid: Validation DataFrame passed to ``non_interferent_learning``.
        boost_rounds: Iterable of boosting-round budgets.

    Returns:
        Dict with 'type' ('Non-Interferent') and 'run', the latter mapping
        each budget to ``{'model': booster, 'results': evals_result}``.
    """
    non_interferent_runs = {'type': 'Non-Interferent', 'run': {}}

    for br in boost_rounds:
        print("***** {} learning - Optimizing custom binary log loss under max attack on D_train_att [boost rounds={}] *****"
              .format(non_interferent_runs['type'], br))
        ni_model, ni_res = non_interferent_learning(train,
                                                    valid,
                                                    num_boost_round=br)
        non_interferent_runs['run'][br] = {'model': ni_model, 'results': ni_res}

    return non_interferent_runs

In [None]:
%%time
# Train the non-interferent models on the attacked training/validation splits.
non_interferent_runs = non_interferent_learning_runs(TRAIN_ATT, VALID_ATT, 
                                                     boost_rounds=BOOST_ROUNDS)

In [None]:
# Pickle path of the non-interferent model: <models dir>/non-interferent_<attacker>_<max rounds>.pkl
NON_INTERFERENT_MODEL_FILENAME = "{}/non-interferent_{}_{}.pkl".format(MODELS_PATH, ATTACKER, MAX_BOOST_ROUNDS)

# Persist _non-interferent_ model

In [None]:
# Persist the non-interferent model trained with the largest boosting-round budget.
save_model(NON_INTERFERENT_MODEL_FILENAME, non_interferent_runs['run'][MAX_BOOST_ROUNDS]['model'])

# EVALUATION

In [None]:
# Tree counts at which every model is evaluated: MIN, MIN+STEP, ... plus MAX itself.
MAX_EVAL_TREES = 200
MIN_EVAL_TREES = 10
STEP_EVAL_TREES = 10
EVAL_TREES = sorted(set(range(MIN_EVAL_TREES, MAX_EVAL_TREES, STEP_EVAL_TREES)) | {MAX_EVAL_TREES})
# A leading 0 is added to stand for the "best_iteration" learned on the validation set.
# NOTE(review): this relies on how predict() treats num_iteration=0, which
# depends on the LightGBM version — verify.
EVAL_TREES.insert(0, 0)

In [None]:
def eval_runs(model, model_type, test, eval_trees=EVAL_TREES, test_groups=None):
    """Evaluate ``model`` at several tree counts and stack the result rows.

    Args:
        model: Trained booster to evaluate.
        model_type: Label forwarded to ``eval_learned_models``.
        test: Test DataFrame forwarded to ``eval_learned_models``.
        eval_trees: Iterable of tree counts to evaluate at.
        test_groups: Optional group sizes of ``test``.

    Returns:
        DataFrame with one row per tree count and a fresh 0..n-1 index.
    """
    per_tree_frames = [
        eval_learned_models(model, model_type, t, test, test_groups=test_groups)
        for t in eval_trees
    ]
    return pd.concat(per_tree_frames, axis=0).reset_index(drop=True)

# Retrieve all model files

In [None]:
def get_model_filenames():
    """Return the sorted names of the regular files in MODELS_PATH, skipping '.gitignore'."""
    candidates = (name for name in listdir(MODELS_PATH) if name != '.gitignore')
    return sorted(name for name in candidates if isfile(join(MODELS_PATH, name)))

In [None]:
# List the model files currently available, one per line.
all_model_filenames = get_model_filenames()
print("\n".join(all_model_filenames))

# Load _standard_ models

In [None]:
# Pickle path of the standard model: <models dir>/std_<attacker>_<max rounds>.pkl
STD_MODEL_FILENAME = "{}/std_{}_{}.pkl".format(MODELS_PATH, ATTACKER, MAX_BOOST_ROUNDS)

In [None]:
# Reload the pickled standard model from disk.
std_model = load_model(STD_MODEL_FILENAME)

# Evaluate _standard_ models on $D_{test}$

In [None]:
# No groups are passed, so the standard (non-attack) metrics are used.
std_df = eval_runs(std_model, "Standard", TEST)

In [None]:
std_df

# Evaluate _standard_ models on $D_{test\_att}$

In [None]:
# Groups are passed, so the "under max attack" metrics are used.
std_att_df = eval_runs(std_model, "Standard", TEST_ATT, test_groups=TEST_ATT_OFFSETS)

# Merge both _standard_ evaluations

In [None]:
# Join the two evaluations row by row on the model label and the tree count.
overall_std_df = pd.merge(std_df, std_att_df, on=['Model', 'N. of Trees'])

In [None]:
overall_std_df

# Load _baseline_ models

In [None]:
# Pickle paths of the baseline models to evaluate; the name encodes p (as a
# percentage) and n. The commented-out lines are settings not currently in use.
BASELINE_MODEL_100_1_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-100_n-1.pkl"
#BASELINE_MODEL_100_4_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-100_n-4.pkl"
BASELINE_MODEL_100_MAX_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-100_n-1000.pkl"
#BASELINE_MODEL_50_1_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-50_n-1.pkl"
#BASELINE_MODEL_50_4_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-50_n-4.pkl"
#BASELINE_MODEL_50_MAX_FILENAME = MODELS_PATH+"/baseline_"+ATTACKER+"_"+str(MAX_BOOST_ROUNDS)+"_p-50_n-1000.pkl"

In [None]:
# Reload the pickled baseline models from disk.
baseline_model_100_1 = load_model(BASELINE_MODEL_100_1_FILENAME)
#baseline_model_100_4 = load_model(BASELINE_MODEL_100_4_FILENAME)
baseline_model_100_MAX = load_model(BASELINE_MODEL_100_MAX_FILENAME)
# baseline_model_50_1 = load_model(BASELINE_MODEL_50_1_FILENAME)
# baseline_model_50_4 = load_model(BASELINE_MODEL_50_4_FILENAME)
# baseline_model_50_MAX = load_model(BASELINE_MODEL_50_MAX_FILENAME)

# Evaluate _baseline_ models on $D_{test}$

In [None]:
# No groups are passed, so the standard (non-attack) metrics are used.
baseline_100_1_df = eval_runs(baseline_model_100_1, "Baseline (p=1.0; n=1)", TEST)
# baseline_100_4_df = eval_runs(baseline_model_100_4, "Baseline (p=1.0; n=4)", TEST)
baseline_100_max_df = eval_runs(baseline_model_100_MAX, "Baseline (p=1.0; n=max)", TEST)
# baseline_50_1_df = eval_runs(baseline_model_50_1, "Baseline (p=0.5; n=1)", TEST)
# baseline_50_4_df = eval_runs(baseline_model_50_4, "Baseline (p=0.5; n=4)", TEST)
# baseline_50_max_df = eval_runs(baseline_model_50_MAX, "Baseline (p=0.5; n=max)", TEST)

In [None]:
# Stack the baseline evaluations on D_test and renumber the rows from 0.
# Full list when all settings are enabled:
# [baseline_100_1_df, baseline_100_4_df, baseline_100_max_df, baseline_50_1_df, baseline_50_4_df, baseline_50_max_df]
baseline_df = pd.concat([baseline_100_1_df, baseline_100_max_df], axis=0).reset_index(drop=True)

In [None]:
baseline_df

# Evaluate _baseline_ model on $D_{test\_att}$

In [None]:
# Groups are passed, so the "under max attack" metrics are used.
baseline_att_100_1_df = eval_runs(baseline_model_100_1, "Baseline (p=1.0; n=1)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)
#baseline_att_100_4_df = eval_runs(baseline_model_100_4, "Baseline (p=1.0; n=4)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)
baseline_att_100_max_df = eval_runs(baseline_model_100_MAX, "Baseline (p=1.0; n=max)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)
#baseline_att_50_1_df = eval_runs(baseline_model_50_1, "Baseline (p=0.5; n=1)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)
#baseline_att_50_4_df = eval_runs(baseline_model_50_4, "Baseline (p=0.5; n=4)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)
#baseline_att_50_max_df = eval_runs(baseline_model_50_MAX, "Baseline (p=0.5; n=max)", TEST_ATT, test_groups=TEST_ATT_OFFSETS)

In [None]:
# Stack the baseline evaluations on D_test_att and renumber the rows from 0.
# Full list when all settings are enabled:
# [baseline_att_100_1_df, baseline_att_100_4_df, baseline_att_100_max_df, baseline_att_50_1_df, baseline_att_50_4_df, baseline_att_50_max_df]
baseline_att_df = pd.concat(
    [baseline_att_100_1_df, baseline_att_100_max_df], axis=0
).reset_index(drop=True)

In [None]:
baseline_att_df

# Merge both _baseline_ evaluations

In [None]:
# Join the two evaluations row by row on the model label and the tree count.
overall_baseline_df = pd.merge(baseline_df, baseline_att_df, on=["Model", "N. of Trees"])

In [None]:
overall_baseline_df

# Load _non-interferent_ model

In [None]:
# Pickle path of the non-interferent model: <models dir>/non-interferent_<attacker>_<max rounds>.pkl
NON_INTERFERENT_MODEL_FILENAME = "{}/non-interferent_{}_{}.pkl".format(MODELS_PATH, ATTACKER, MAX_BOOST_ROUNDS)

In [None]:
# Reload the pickled non-interferent model from disk.
non_interferent_model = load_model(NON_INTERFERENT_MODEL_FILENAME)

# Evaluate _non-interferent_ model on $D_{test}$

In [None]:
# No groups are passed, so the standard (non-attack) metrics are used.
non_interferent_df = eval_runs(non_interferent_model, "Non-Interferent", TEST)

# Evaluate _non-interferent_ model on $D_{test\_att}$

In [None]:
# Groups are passed, so the "under max attack" metrics are used.
non_interferent_att_df = eval_runs(non_interferent_model, "Non-Interferent", TEST_ATT, test_groups=TEST_ATT_OFFSETS)

# Merge both _non-interferent_ evaluations

In [None]:
# Join the two evaluations row by row on the model label and the tree count.
overall_non_interferent_df = pd.merge(non_interferent_df, non_interferent_att_df, on=['Model', 'N. of Trees'])

# Stack _all_ evaluations one on top of each other

In [None]:
# Stack the three result tables (non-interferent, baseline, standard) and
# renumber the rows from 0.
overall_df = pd.concat(
    [overall_non_interferent_df, overall_baseline_df, overall_std_df], axis=0
).reset_index(drop=True)

In [None]:
overall_df

# Save the DataFrame containing results

In [None]:
# Write the combined results to ../plots/<attacker>.csv without the index column.
overall_df.to_csv("../plots/"+ATTACKER+".csv", sep=",", index=False)