# Evaluating Models

This notebook contains the code used for evaluating the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

# Documentation

 - http://lightgbm.readthedocs.io/en/latest/
 - http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 - https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide

In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
# Adding the following line, allows Jupyter Notebook to visualize plots
# produced by matplotlib directly below the code cell which generated those.
%matplotlib inline

np.random.seed(73)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Standard evaluation metric

The following function is the one used for evaluating the quality of the learned model (either _standard_, _adversarial-boosting_, or _non-interferent_). This is the standard <code>avg_log_loss</code>.

In [2]:
def logistic(x):
    return 1.0/(1.0 + np.exp(-x))

In [3]:
def logit(p):
    return np.log(p/(1-p))

# <code>avg_log_loss</code>

In [4]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    
    labels = train_data.get_label()
    losses = np.log(1.0 + np.exp(-preds*labels))
    avg_loss = np.mean(losses)
    
    return 'avg_binary_log_loss', avg_loss, False

In [5]:
def eval_log_loss(model, test, test_groups=None, svm=False):
    
    lgbm_test = lightgbm.Dataset(data=test.iloc[:,:-1].values, 
                                 label=test.iloc[:,-1].values,
                                 free_raw_data=False)
    
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        return avg_log_loss(logit(model.predict_proba(test.iloc[:,:-1].values)[:,1]), lgbm_test)[1]
    
    return avg_log_loss(model.predict(test.iloc[:,:-1].values), lgbm_test)[1]

# Custom evaluation metric

Similarly to what we have done for <code>fobj</code>, <code>feval</code> can be computed from a weighted combination of two evaluation metrics:

-  <code>avg_log_loss</code> (standard, defined above);
-  <code>avg_log_loss_uma</code> (custom, defined below).

# <code>avg_log_loss_uma</code>

This is the binary log loss yet modified to operate on groups of perturbed instances.

In [6]:
# Our custom metric

def binary_log_loss(pred, true_label):

    return np.log(1.0 + np.exp(-pred * true_label))

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    labels = train_data.get_label()
    attack_lens = train_data.get_group()
    
    offset = 0
    max_logloss = []
    avg_max_logloss = 0.0
    
    if attack_lens is not None:
    
        for atk in attack_lens:
            losses = [binary_log_loss(h,t) for h,t in zip(preds[offset:offset+atk], labels[offset:offset+atk])]
            max_logloss.append(max(losses))

            offset += atk
        
        avg_max_logloss = np.mean(max_logloss)  

    return 'avg_binary_log_loss_under_max_attack', avg_max_logloss, False

In [7]:
def eval_log_loss_uma(model, test, test_groups=None, svm=False):
    
    lgbm_test = lightgbm.Dataset(data=test.iloc[:,:-1].values, 
                                 label=test.iloc[:,-1].values,
                                 group=test_groups,
                                 free_raw_data=False)
    
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        return avg_log_loss_uma(logit(model.predict_proba(test.iloc[:,:-1].values)[:,1]), 
                                               lgbm_test)[1]
    
    return avg_log_loss_uma(model.predict(test.iloc[:,:-1].values), 
                                               lgbm_test)[1]

# <code>feval=avg_non_interferent_log_loss</code>

Used for measuring the validity of any model (either _standard_, _baseline_, or _non-interferent_). More precisely, <code>avg_non_interferent_log_loss</code> is the weighted sum of the binary log loss and the binary log loss under maximal attack.

In [8]:
# LightGBM takes lambda x,y: avg_weighted_sum_log_loss_log_loss_uma(preds, train_data, alpha=0.5)

def avg_non_interferent_log_loss(preds, train_data, alpha=1.0):
    
    # binary logloss under maximal attack
    _, loss_uma, _    = avg_log_loss_uma(preds, train_data)
    
    # binary logloss (plain)
    # _, loss_plain, _  = avg_log_loss(preds, train_data)
    
    ids = []
    attack_lens = train_data.get_group()
    
    if attack_lens is not None:
        offset=0
        for atk in attack_lens:
            ids += [offset]
            offset += atk      
            
    ids = np.array(ids)
    labels = train_data.get_label()
    losses = np.log(1.0 + np.exp(-preds[ids]*labels[ids]))
    loss_plain = np.mean(losses)

    # combine the above two losses together
    weighted_loss = alpha*loss_uma + (1.0-alpha)*loss_plain

    return 'avg_non_interferent_log_loss [alpha={:.2f}]'.format(alpha), weighted_loss, False

def eval_non_interferent_log_loss(model, test, test_groups=None, svm=False, alpha=1.0):
    
    lgbm_test = lightgbm.Dataset(data=test.iloc[:,:-1].values, 
                                 label=test.iloc[:,-1].values,
                                 group=test_groups,
                                 free_raw_data=False)
    
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        return avg_non_interferent_log_loss(logit(model.predict_proba(test.iloc[:,:-1].values)[:,1]), 
                                                  lgbm_test,
                                                  alpha=alpha
                                                 )[1]
    
    return avg_non_interferent_log_loss(model.predict(test.iloc[:,:-1].values), 
                                                  lgbm_test,
                                                  alpha=alpha
                                                 )[1]

# Additional validity measures

In addition to the evaluation metrics defined above (used for training), we also consider the following **4** measures of validity to compare the performance of each learned model:

-  <code>eval_binary_err_rate</code>: This is the traditional binary error rate (1-accuracy);
-  <code>eval_binary_err_rate_uma</code>: This is the binary error rate modified to operate on groups of perturbed instances under maximal attack.
-  <code>eval_roc_auc</code>: This is the classical ROC AUC score;
-  <code>eval_roc_auc_uma</code>: This is the ROC AUC score modified to operate on groups of perturbed instances under maximal attack.

Again, note that those are **not** metrics used at training time (i.e., they do not define any <code>feval</code>), rather they are used to assess the (offline) quality of each learned model.

# <code>eval_binary_err_rate</code>

In [9]:
def eval_binary_err_rate(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    errs = 0
    for p,l in zip(predictions,labels):
        if p != l:
            errs += 1
    return errs/len(predictions)

# <code>eval_binary_err_rate_uma</code>

In [10]:
def eval_binary_err_rate_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    offset = 0
    errs = 0

    for g in test_groups:
        predictions_att = predictions[offset:offset+g]
        true_label = labels[offset]
        if np.any([p != true_label for p in predictions_att]):
            errs += 1
        offset += g

    return errs/len(test_groups)

# <code>eval_roc_auc</code>

In [11]:
def eval_roc_auc(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    predictions = []
    
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        predictions = logit(model.predict_proba(X)[:,1])
    else:
        predictions = model.predict(X)
        
    
    return roc_auc_score(y_true=labels, y_score=predictions)

# <code>eval_roc_auc_uma</code>

In [12]:
def eval_roc_auc_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    predictions = []
    
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        predictions = logit(model.predict_proba(X)[:,1])
    else:
        predictions = model.predict(X)
    
    
    offset = 0
    true_labels = []
    worst_predictions = []
    
    for g in test_groups:
        
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        if true_label == 1:
            worst_predictions.append(np.min(predictions_att))
        else:
            worst_predictions.append(np.max(predictions_att))
    
        offset += g
        
    return roc_auc_score(y_true=true_labels, y_score=worst_predictions)

# <code>eval_specificity</code>

In [13]:
def eval_specificity(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]

    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()

    return tn/(tn + fp)

# <code>eval_specificity_uma</code>

In [14]:
def eval_specificity_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    offset = 0
    true_labels = []
    worst_predictions = []
    
    for g in test_groups:
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        if true_label == 1:
            worst_predictions.append(np.min(predictions_att))
        else:
            worst_predictions.append(np.max(predictions_att))
    
        offset += g
        
    tn, fp, fn, tp = confusion_matrix(true_labels, worst_predictions).ravel()

    return tn/(tn + fp)

# <code>eval_precision</code>

In [15]:
def eval_precision(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]

    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()

    return tp/(tp + fp)

# <code>eval_precision_uma</code>

In [16]:
def eval_precision_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    offset = 0
    true_labels = []
    worst_predictions = []
    
    for g in test_groups:
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        if true_label == 1:
            worst_predictions.append(np.min(predictions_att))
        else:
            worst_predictions.append(np.max(predictions_att))
    
        offset += g
        
    tn, fp, fn, tp = confusion_matrix(true_labels, worst_predictions).ravel()
    
    return tp/(tp + fp)

# <code>eval_recall</code>

In [17]:
def eval_recall(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]

    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()

    return tp/(tp + fn)

# <code>eval_recall_uma</code>

In [18]:
def eval_recall_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    offset = 0
    true_labels = []
    worst_predictions = []
    
    for g in test_groups:
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        if true_label == 1:
            worst_predictions.append(np.min(predictions_att))
        else:
            worst_predictions.append(np.max(predictions_att))
    
        offset += g
        
    tn, fp, fn, tp = confusion_matrix(true_labels, worst_predictions).ravel()
    
    return tp/(tp + fn)

# <code>eval_npv</code>

In [19]:
def eval_npv(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]

    tn, fp, fn, tp = confusion_matrix(labels, predictions).ravel()

    return tn/(tn + fn)

# <code>eval_npv_uma</code>

In [20]:
def eval_npv_uma(model, test_set, test_groups=None, svm=False):
    X = test_set.iloc[:,:-1].values
    labels = test_set.iloc[:,-1].values
    
    model_predictions = []
    if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
        # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
        # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
        model_predictions = logit(model.predict_proba(X)[:,1])
    else:
        model_predictions = model.predict(X)
        
    predictions = [1 if p > 0 else -1 for p in model_predictions]
    
    offset = 0
    true_labels = []
    worst_predictions = []
    
    for g in test_groups:
        true_label = labels[offset]
        true_labels.append(true_label)
        predictions_att = predictions[offset:offset+g]
        if true_label == 1:
            worst_predictions.append(np.min(predictions_att))
        else:
            worst_predictions.append(np.max(predictions_att))
    
        offset += g
        
    tn, fp, fn, tp = confusion_matrix(true_labels, worst_predictions).ravel()
    
    return tn/(tn + fn)

# <code>eval_f1</code>

In [21]:
def eval_f1(model, test_set, test_groups=None, svm=False):
#     X = test_set.iloc[:,:-1].values
#     labels = test_set.iloc[:,-1].values
    
#     model_predictions = []
#     if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
#         # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
#         # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
#         model_predictions = logit(model.predict_proba(X)[:,1])
#     else:
#         model_predictions = model.predict(X)
        
#     predictions = [1 if p > 0 else -1 for p in model_predictions]
    
#     return f1_score(y_true=labels, y_pred=predictions, average='macro')

    precision = eval_precision(model, test_set, test_groups=test_groups, svm=svm)
    recall = eval_recall(model, test_set, test_groups=test_groups, svm=svm)
    
    return 2 * (precision * recall)/(precision + recall)

# <code>eval_f1_uma</code>

In [22]:
def eval_f1_uma(model, test_set, test_groups=None, svm=False):
#     X = test_set.iloc[:,:-1].values
#     labels = test_set.iloc[:,-1].values
    
#     model_predictions = []
#     if svm: # no trees have been generated (used for evaluating other non-tree-based models like SVM)
#         # use the logit function (i.e., the inverse of the logistic function) to map probabilities output
#         # by sklearn's predict_proba in the range [0,1] to a real number in the range [-inf, +inf]
#         model_predictions = logit(model.predict_proba(X)[:,1])
#     else:
#         model_predictions = model.predict(X)
        
#     predictions = [1 if p > 0 else -1 for p in model_predictions]
    
#     offset = 0
#     true_labels = []
#     worst_predictions = []
    
#     for g in test_groups:
#         true_label = labels[offset]
#         true_labels.append(true_label)
#         predictions_att = predictions[offset:offset+g]
#         if true_label == 1:
#             worst_predictions.append(np.min(predictions_att))
#         else:
#             worst_predictions.append(np.max(predictions_att))
    
#         offset += g
        
#     return f1_score(y_true=true_labels, y_pred=worst_predictions, average='macro')

    precision_uma = eval_precision_uma(model, test_set, test_groups=test_groups, svm=svm)
    recall_uma = eval_recall_uma(model, test_set, test_groups=test_groups, svm=svm)
    
    return 2 * (precision_uma * recall_uma)/(precision_uma + recall_uma)

# Evaluation metrics

In [23]:
EVAL_METRICS = [eval_log_loss, 
                eval_binary_err_rate,
                eval_specificity,
                eval_precision,
                eval_recall,
                eval_npv,
                eval_f1,
                eval_roc_auc
               ]

EVAL_METRICS_UNDER_MAX_ATTACK = [eval_log_loss_uma,
                                 eval_binary_err_rate_uma,
                                 eval_specificity_uma,
                                 eval_precision_uma,
                                 eval_recall_uma,
                                 eval_f1_uma,
                                 eval_roc_auc_uma
                                ]

# Evaluate each model w.r.t. _all_ evaluation metrics

In [24]:
def eval_learned_model(model, eval_metric, test, test_groups=None, svm=False):
    return eval_metric(model, test, test_groups=test_groups, svm=svm)

In [25]:
def eval_learned_models(model, model_type, test, test_groups=None, budget=0):

    eval_metrics = EVAL_METRICS
    d_test = "D_test"
    if test_groups is not None:
        eval_metrics = EVAL_METRICS_UNDER_MAX_ATTACK
        d_test = "D_test_att"
    
    header = ['Model'] + ['Budget'] + [m.__name__.replace('eval_','').replace('_',' ').strip().title() for m in EVAL_METRICS]
    df = pd.DataFrame(columns=header)
    first_row = [model_type] + [budget] + [None for m in EVAL_METRICS]
    df.loc[0] = first_row
    
    svm = False
    if model_type == "SVM":
        svm = True

    for eval_metric in eval_metrics:
        res = eval_learned_model(model, eval_metric, test, test_groups=test_groups, svm=svm)
        print("{} learning - {} on {} = {:.5f}"
                  .format(model_type, eval_metric.__name__, d_test, res))
        column_metric = eval_metric.__name__
        if eval_metric.__name__.endswith("uma"):
            column_metric = eval_metric.__name__.replace('uma', '')
        df[column_metric.replace('eval_','').replace('_',' ').strip().title()] = res

    print("******************************************************************************************************")
    
    return df

# Loading Datasets

In [26]:
def label_encode(dataset, categorical_features):
    dataset_le = dataset.copy()
    for column in dataset_le.columns:
        if column in categorical_features:
            dataset_le[column] = dataset_le[column].astype('category')
            dataset_le[column] = dataset_le[column].cat.codes.astype(np.int32)
    return dataset_le

def load_atk_train_valid_test(atk_train_file, atk_valid_file, atk_test_file, 
                              train_split=0.6, valid_split=0.2, force=False):
    
    
    if  (force or 
          not os.path.exists(atk_train_file+".cat.bz2") or
          not os.path.exists(atk_valid_file+".cat.bz2") or
          not os.path.exists(atk_test_file+".cat.bz2") or 
          not os.path.exists(atk_train_file+".cat.json") ):
    
        print ("Pre-processing original files...")

        print ("Loading:", atk_train_file)
        print ("Loading:", atk_valid_file)
        print ("Loading:", atk_test_file)

        train = pd.read_csv(atk_train_file)
        valid = pd.read_csv(atk_valid_file)
        test  = pd.read_csv(atk_test_file)
        
        print ("Train/Valid/Test sizes:", train.shape, valid.shape, test.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            valid.shape[0]/(train.shape[0]+valid.shape[0]+test.shape[0]),
                            test.shape[0] /(train.shape[0]+valid.shape[0]+test.shape[0]) ) )


        # split-back into train valid test
        if 'instance_id' in train.columns.values:
            print ('with instance ids')
            valid['instance_id'] += train.iloc[-1,0]
            test['instance_id']  += valid.iloc[-1,0]
            
            groups = np.concatenate( [ train['instance_id'].value_counts().sort_index().values,
                                       valid['instance_id'].value_counts().sort_index().values,
                                       test['instance_id'].value_counts().sort_index().values ] )
            
            num_train_groups = int( len(groups)*train_split )
            train_size = sum(groups[:num_train_groups])
            num_valid_groups = int( len(groups)*valid_split )
            valid_size = sum(groups[num_train_groups:num_train_groups+num_valid_groups])
        else:
            full_size = len(train) + len(valid) + len(test)
            train_size = int( full_size*train_split )
            valid_size = int( full_size*valid_split )
        
        # concat to process correctly label encoding
        full = pd.concat( [train, valid, test] )

        # get index of categorical features (-1 because of instance_id)
        cat_fx = full.columns.values[np.where(full.dtypes=='object')[0]]
        cat_fx = list(cat_fx)    
        full = label_encode(full, cat_fx)
        with open(atk_train_file+".cat.json", 'w') as fp:
            json.dump(cat_fx, fp)
        print ("CatFX:", cat_fx)

        train_cat = full.iloc[0:train_size,:]
        valid_cat = full.iloc[train_size:train_size+valid_size,:]
        test_cat  = full.iloc[train_size+valid_size:,:]    

        print ("Train/Valid/Test sizes:", train_cat.shape, valid_cat.shape, test_cat.shape)
        print ("Train/Valid/Test split: {:.2f} {:.2f} {:.2f}"
                   .format( train_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            valid_cat.shape[0]/(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]),
                            test_cat.shape[0] /(train_cat.shape[0]+valid_cat.shape[0]+test_cat.shape[0]) ) )

        # save to file
        print ("Saving processed files *.cat.bz2")
        train_cat.to_csv(atk_train_file+".cat.bz2", compression="bz2", index=False)
        valid_cat.to_csv(atk_valid_file+".cat.bz2", compression="bz2", index=False)
        test_cat.to_csv (atk_test_file+".cat.bz2",  compression="bz2", index=False)
        
    else:
        print ("Loading pre-processed files...")

        train_cat = pd.read_csv(atk_train_file+".cat.bz2")
        valid_cat = pd.read_csv(atk_valid_file+".cat.bz2")
        test_cat  = pd.read_csv(atk_test_file+".cat.bz2")
        
        with open(atk_train_file+".cat.json", 'r') as fp:
            cat_fx = json.load(fp)
    
    # return data
    return train_cat, valid_cat, test_cat, cat_fx

# Retrieve all model files

In [27]:
MODELS_PATH = "../out/models"

In [28]:
### Standard SVM
svm = None 
with open(join(MODELS_PATH, "svm_census_C1000.model"), 'rb') as svm_file:
    svm = pickle.load(svm_file)
### Standard GBDT
std_gbdt = lightgbm.Booster(model_file=join(MODELS_PATH, "std_gbdt_census_T500_S0100_L24_R234.model"))
### Adversarial Boosting
# budget=5
adv_boost_b5 = lightgbm.Booster(model_file=join(MODELS_PATH, "adv_boosting_census_B5_T500_S0100_L24_R492.model"))
# budget=15
adv_boost_b15 = lightgbm.Booster(model_file=join(MODELS_PATH, "adv_boosting_census_B15_T500_S0100_L24_R450.model"))
# budget=150
adv_boost_b150 = lightgbm.Booster(model_file=join(MODELS_PATH, "adv_boosting_census_B150_T500_S0100_L24_R465.model"))
# budget=300
adv_boost_b300 = lightgbm.Booster(model_file=join(MODELS_PATH, "adv_boosting_census_B300_T500_S0100_L24_R498.model"))
### Non-Interferent
# budget=5
non_interf_b5 = lightgbm.Booster(model_file=join(MODELS_PATH, "non_interferent_census_B5_T500_S0050_L24_A050_R356.model")) #"non_interferent_census_B5_T500_S0100_L24_A050_R157.model"))
# budget=15
non_interf_b15 = lightgbm.Booster(model_file=join(MODELS_PATH, "non_interferent_census_B15_T500_S0050_L24_A050_R335.model")) #"non_interferent_census_B15_T500_S0100_L24_A050_R156.model"))
# budget=150
non_interf_b150 = lightgbm.Booster(model_file=join(MODELS_PATH, "non_interferent_census_B150_T500_S0050_L24_A050_R292.model")) #"non_interferent_census_B150_T500_S0100_L24_A050_R135.model"))
# budget=300
non_interf_b300 = lightgbm.Booster(model_file=join(MODELS_PATH, "non_interferent_census_B300_T500_S0050_L24_A050_R340.model")) #"non_interferent_census_B300_T500_S0100_L24_A050_R134.model"))


In [29]:
MODELS = (std_gbdt,
          adv_boost_b5, 
          adv_boost_b15, 
          adv_boost_b150, 
          adv_boost_b300, 
          non_interf_b5,
          non_interf_b15,
          non_interf_b150,
          non_interf_b300
         )

In [30]:
# load train/valid/test
train, valid, test, _ = load_atk_train_valid_test("../data/census/train_ori.csv.bz2", 
                                                       "../data/census/valid_ori.csv.bz2", 
                                                       "../data/census/test_ori.csv.bz2")

Loading pre-processed files...


In [31]:
def load_attacked_dataset(budget):
    # load train/valid/test
    train_att, valid_att, test_att, _ = load_atk_train_valid_test("../data/census/train_B{}.csv.bz2".format(budget), 
                                                           "../data/census/valid_B{}.csv.bz2".format(budget), 
                                                           "../data/census/test_B{}.csv.bz2".format(budget))

    test_groups = test_att['instance_id'].value_counts().sort_index().values
    test_att = test_att.iloc[:, 1:]

    valid_groups = valid_att['instance_id'].value_counts().sort_index().values
    valid_att = valid_att.iloc[:, 1:]

    train_groups = train_att['instance_id'].value_counts().sort_index().values
    train_att = train_att.iloc[:, 1:]
    
    return train_att, train_groups, valid_att, valid_groups, test_att, test_groups

In [32]:
def load_attacked_datasets():
    att_datasets = {}
    for b in [5, 15, 150, 300]:
        att_datasets[b] = load_attacked_dataset(b)
    
    return att_datasets

In [33]:
#svm_df = eval_learned_models(svm, "SVM", test)
std_gbdt_df = eval_learned_models(std_gbdt, "Baseline [GBDT]", test)
adv_boost_df_b5 = eval_learned_models(adv_boost_b5, "Adversarial-Boosting [b=5]", test)
adv_boost_df_b15 = eval_learned_models(adv_boost_b15, "Adversarial-Boosting [b=15]", test)
adv_boost_df_b150 = eval_learned_models(adv_boost_b150, "Adversarial-Boosting [b=150]", test)
adv_boost_df_b300 = eval_learned_models(adv_boost_b300, "Adversarial-Boosting [b=300]", test)
non_interf_df_b5 = eval_learned_models(non_interf_b5, "Non-Interferent [b=5]", test)
non_interf_df_b15 = eval_learned_models(non_interf_b15, "Non-Interferent [b=15]", test)
non_interf_df_b150 = eval_learned_models(non_interf_b150, "Non-Interferent [b=150]", test)
non_interf_df_b300 = eval_learned_models(non_interf_b300, "Non-Interferent [b=300]", test)

Baseline [GBDT] learning - eval_log_loss on D_test = 0.30008
Baseline [GBDT] learning - eval_binary_err_rate on D_test = 0.13787
Baseline [GBDT] learning - eval_specificity on D_test = 0.93585
Baseline [GBDT] learning - eval_precision on D_test = 0.76273
Baseline [GBDT] learning - eval_recall on D_test = 0.63509
Baseline [GBDT] learning - eval_npv on D_test = 0.88762
Baseline [GBDT] learning - eval_f1 on D_test = 0.69308
Baseline [GBDT] learning - eval_roc_auc on D_test = 0.91673
******************************************************************************************************
Adversarial-Boosting [b=5] learning - eval_log_loss on D_test = 0.31726
Adversarial-Boosting [b=5] learning - eval_binary_err_rate on D_test = 0.14417
Adversarial-Boosting [b=5] learning - eval_specificity on D_test = 0.96602
Adversarial-Boosting [b=5] learning - eval_precision on D_test = 0.83152
Adversarial-Boosting [b=5] learning - eval_recall on D_test = 0.51646
Adversarial-Boosting [b=5] learning - eval_

In [34]:
att_datasets = load_attacked_datasets()

Loading pre-processed files...
Loading pre-processed files...
Loading pre-processed files...
Loading pre-processed files...


In [35]:
test_att_b5 = att_datasets[5][4]
test_groups_b5 = att_datasets[5][5]
test_att_b15 = att_datasets[15][4]
test_groups_b15 = att_datasets[15][5]
test_att_b150 = att_datasets[150][4]
test_groups_b150 = att_datasets[150][5]
test_att_b300 = att_datasets[300][4]
test_groups_b300 = att_datasets[300][5]

In [36]:
def eval_all_models_under_attack(models):
    
    std_gbdt, adv_boost_b5, adv_boost_b15, adv_boost_b150, adv_boost_b300, non_interf_b5, non_interf_b15, non_interf_b150, non_interf_b300 = models
    
    dfs = []
    for b in [5, 15, 150, 300]:
        
        print("==> Evaluating Standard GBDT under attack [b={}]".format(b))
        std_gbdt_df_att = eval_learned_models(std_gbdt, 
                                              "Baseline [GBDT]", 
                                              att_datasets[b][4], 
                                              att_datasets[b][5], 
                                              budget=b)
        
        print("==> Evaluating Adversarial Boosting [b_train={}] under attack [b={}]".format(5, b))
        adv_boost_df_att_b5 = eval_learned_models(adv_boost_b5, 
                                                  "Adversarial-Boosting [b=5]", 
                                                  att_datasets[b][4], 
                                                  att_datasets[b][5], 
                                                  budget=b)
        print("==> Evaluating Adversarial Boosting [b_train={}] under attack [b={}]".format(15, b))
        adv_boost_df_att_b15 = eval_learned_models(adv_boost_b15, 
                                                   "Adversarial-Boosting [b=15]", 
                                                   att_datasets[b][4], 
                                                   att_datasets[b][5], 
                                                   budget=b)
        print("==> Evaluating Adversarial Boosting [b_train={}] under attack [b={}]".format(150, b))
        adv_boost_df_att_b150 = eval_learned_models(adv_boost_b150, 
                                                    "Adversarial-Boosting [b=150]", 
                                                    att_datasets[b][4], 
                                                    att_datasets[b][5], 
                                                    budget=b)
        print("==> Evaluating Adversarial Boosting [b_train={}] under attack [b={}]".format(300, b))
        adv_boost_df_att_b300 = eval_learned_models(adv_boost_b300, 
                                                    "Adversarial-Boosting [b=300]", 
                                                    att_datasets[b][4], 
                                                    att_datasets[b][5], 
                                                    budget=b)
        
        print("==> Evaluating Non-Interferent [b_train={}] under attack [b={}]".format(5, b))
        non_interf_df_att_b5 = eval_learned_models(non_interf_b5, 
                                                  "Non-Interferent [b=5]", 
                                                  att_datasets[b][4], 
                                                  att_datasets[b][5], 
                                                  budget=b)
        print("==> Evaluating Non-Interferent [b_train={}] under attack [b={}]".format(15, b))
        non_interf_df_att_b15 = eval_learned_models(non_interf_b15, 
                                                   "Non-Interferent [b=15]", 
                                                   att_datasets[b][4], 
                                                   att_datasets[b][5], 
                                                   budget=b)
        print("==> Evaluating Non-Interferent [b_train={}] under attack [b={}]".format(150, b))
        non_interf_df_att_b150 = eval_learned_models(non_interf_b150, 
                                                    "Non-Interferent [b=150]", 
                                                    att_datasets[b][4], 
                                                    att_datasets[b][5], 
                                                    budget=b)
        print("==> Evaluating Non-Interferent [b_train={}] under attack [b={}]".format(300, b))
        non_interf_df_att_b300 = eval_learned_models(non_interf_b300, 
                                                    "Non-Interferent [b=300]", 
                                                    att_datasets[b][4], 
                                                    att_datasets[b][5], 
                                                    budget=b)
        
        df_b = pd.concat([std_gbdt_df_att, 
                          adv_boost_df_att_b5, 
                          adv_boost_df_att_b15, 
                          adv_boost_df_att_b150, 
                          adv_boost_df_att_b300, 
                          non_interf_df_att_b5,
                          non_interf_df_att_b15,
                          non_interf_df_att_b150,
                          non_interf_df_att_b300
                         ], 
                         axis=0, 
                         sort=False)

        df_b.reset_index(inplace=True, drop=True)
        
        dfs.append(df_b)
        
    #df = functools.reduce(lambda left,right: pd.merge(left,right,on=['Model', 'Budget']), dfs)
    df = pd.concat(dfs, axis=0, sort=False)
    
    return df

In [37]:
eval_under_attack_df = eval_all_models_under_attack(MODELS)

==> Evaluating Standard GBDT under attack [b=5]
Baseline [GBDT] learning - eval_log_loss_uma on D_test_att = 0.33052
Baseline [GBDT] learning - eval_binary_err_rate_uma on D_test_att = 0.15556
Baseline [GBDT] learning - eval_specificity_uma on D_test_att = 0.91242
Baseline [GBDT] learning - eval_precision_uma on D_test_att = 0.70189
Baseline [GBDT] learning - eval_recall_uma on D_test_att = 0.63509
Baseline [GBDT] learning - eval_f1_uma on D_test_att = 0.66682
Baseline [GBDT] learning - eval_roc_auc_uma on D_test_att = 0.89767
******************************************************************************************************
==> Evaluating Adversarial Boosting [b_train=5] under attack [b=5]
Adversarial-Boosting [b=5] learning - eval_log_loss_uma on D_test_att = 0.32008
Adversarial-Boosting [b=5] learning - eval_binary_err_rate_uma on D_test_att = 0.14516
Adversarial-Boosting [b=5] learning - eval_specificity_uma on D_test_att = 0.96470
Adversarial-Boosting [b=5] learning - eval_prec

Adversarial-Boosting [b=15] learning - eval_f1_uma on D_test_att = 0.63052
Adversarial-Boosting [b=15] learning - eval_roc_auc_uma on D_test_att = 0.90923
******************************************************************************************************
==> Evaluating Adversarial Boosting [b_train=150] under attack [b=15]
Adversarial-Boosting [b=150] learning - eval_log_loss_uma on D_test_att = 0.32279
Adversarial-Boosting [b=150] learning - eval_binary_err_rate_uma on D_test_att = 0.14605
Adversarial-Boosting [b=150] learning - eval_specificity_uma on D_test_att = 0.96661
Adversarial-Boosting [b=150] learning - eval_precision_uma on D_test_att = 0.83136
Adversarial-Boosting [b=150] learning - eval_recall_uma on D_test_att = 0.50699
Adversarial-Boosting [b=150] learning - eval_f1_uma on D_test_att = 0.62987
Adversarial-Boosting [b=150] learning - eval_roc_auc_uma on D_test_att = 0.90907
************************************************************************************************

Non-Interferent [b=5] learning - eval_log_loss_uma on D_test_att = 0.36046
Non-Interferent [b=5] learning - eval_binary_err_rate_uma on D_test_att = 0.16108
Non-Interferent [b=5] learning - eval_specificity_uma on D_test_att = 0.89777
Non-Interferent [b=5] learning - eval_precision_uma on D_test_att = 0.67625
Non-Interferent [b=5] learning - eval_recall_uma on D_test_att = 0.65765
Non-Interferent [b=5] learning - eval_f1_uma on D_test_att = 0.66682
Non-Interferent [b=5] learning - eval_roc_auc_uma on D_test_att = 0.88855
******************************************************************************************************
==> Evaluating Non-Interferent [b_train=15] under attack [b=150]
Non-Interferent [b=15] learning - eval_log_loss_uma on D_test_att = 0.36138
Non-Interferent [b=15] learning - eval_binary_err_rate_uma on D_test_att = 0.16097
Non-Interferent [b=15] learning - eval_specificity_uma on D_test_att = 0.89733
Non-Interferent [b=15] learning - eval_precision_uma on D_test_att 

Non-Interferent [b=150] learning - eval_f1_uma on D_test_att = 0.65360
Non-Interferent [b=150] learning - eval_roc_auc_uma on D_test_att = 0.88028
******************************************************************************************************
==> Evaluating Non-Interferent [b_train=300] under attack [b=300]
Non-Interferent [b=300] learning - eval_log_loss_uma on D_test_att = 0.39731
Non-Interferent [b=300] learning - eval_binary_err_rate_uma on D_test_att = 0.16882
Non-Interferent [b=300] learning - eval_specificity_uma on D_test_att = 0.88591
Non-Interferent [b=300] learning - eval_precision_uma on D_test_att = 0.65347
Non-Interferent [b=300] learning - eval_recall_uma on D_test_att = 0.66261
Non-Interferent [b=300] learning - eval_f1_uma on D_test_att = 0.65801
Non-Interferent [b=300] learning - eval_roc_auc_uma on D_test_att = 0.88095
******************************************************************************************************


In [38]:
eval_under_attack_df

Unnamed: 0,Model,Budget,Log Loss,Binary Err Rate,Specificity,Precision,Recall,Npv,F1,Roc Auc
0,Baseline [GBDT],5,0.330521,0.155556,0.912419,0.701894,0.635092,,0.666825,0.897666
1,Adversarial-Boosting [b=5],5,0.320084,0.145163,0.964704,0.826118,0.516464,,0.635581,0.910535
2,Adversarial-Boosting [b=15],5,0.32065,0.145716,0.966462,0.831245,0.508796,,0.631226,0.911083
3,Adversarial-Boosting [b=150],5,0.321078,0.145716,0.967047,0.83321,0.506991,,0.630398,0.910905
4,Adversarial-Boosting [b=300],5,0.320714,0.145495,0.96778,0.835943,0.505638,,0.630129,0.911422
5,Non-Interferent [b=5],5,0.313097,0.144279,0.920035,0.727545,0.657645,,0.690832,0.908567
6,Non-Interferent [b=15],5,0.313453,0.14461,0.91901,0.725558,0.65945,,0.690926,0.908511
7,Non-Interferent [b=150],5,0.313141,0.145384,0.917252,0.721949,0.661705,,0.690515,0.908698
8,Non-Interferent [b=300],5,0.312803,0.144057,0.918717,0.725791,0.662607,,0.692761,0.908974
0,Baseline [GBDT],15,0.345171,0.162742,0.9029,0.679865,0.635092,,0.656716,0.888038


In [39]:
overall_df = pd.concat([std_gbdt_df,
                        adv_boost_df_b5,
                        adv_boost_df_b15,
                        adv_boost_df_b150,
                        adv_boost_df_b300,
                        non_interf_df_b5,
                        non_interf_df_b15,
                        non_interf_df_b150,
                        non_interf_df_b300,
                        eval_under_attack_df], 
                       axis=0, 
                       sort=False)
overall_df.reset_index(inplace=True, drop=True)

In [40]:
overall_df

Unnamed: 0,Model,Budget,Log Loss,Binary Err Rate,Specificity,Precision,Recall,Npv,F1,Roc Auc
0,Baseline [GBDT],0,0.300081,0.137866,0.935852,0.76273,0.635092,0.887623,0.693084,0.916728
1,Adversarial-Boosting [b=5],0,0.317257,0.144168,0.966022,0.831518,0.516464,0.860198,0.637173,0.913199
2,Adversarial-Boosting [b=15],0,0.318171,0.14461,0.967926,0.837416,0.508796,0.858535,0.632997,0.913467
3,Adversarial-Boosting [b=150],0,0.318685,0.144831,0.968219,0.83818,0.506991,0.858126,0.631816,0.913253
4,Adversarial-Boosting [b=300],0,0.318377,0.1445,0.969098,0.841592,0.505638,0.857902,0.631727,0.913692
5,Non-Interferent [b=5],0,0.288309,0.134439,0.93307,0.761358,0.657645,0.893548,0.705712,0.923131
6,Non-Interferent [b=15],0,0.288172,0.133333,0.933948,0.764245,0.65945,0.894139,0.70799,0.923153
7,Non-Interferent [b=150],0,0.287877,0.133002,0.933656,0.764062,0.661705,0.894737,0.70921,0.923354
8,Non-Interferent [b=300],0,0.288208,0.133112,0.933216,0.763117,0.662607,0.894944,0.709319,0.923355
9,Baseline [GBDT],5,0.330521,0.155556,0.912419,0.701894,0.635092,,0.666825,0.897666


# Save the DataFrame containing results

In [41]:
overall_df.to_csv("../plots/plot_census.csv", sep=",", index=False)