# Evaluating Models

This notebook contains the code used for evaluating the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import glob
import pickle
import dill
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score
from nilib import *
from robust_forest import *
from sklearn.ensemble import BaggingClassifier

2019-06-02 13:13:34,424 *** INFO [robust_forest.py:1151 - __init__()] *** ***** Robust Decision Tree successfully created *****
2019-06-02 13:13:34,427 *** INFO [robust_forest.py:1152 - __init__()] *** *	Tree ID: 0
2019-06-02 13:13:34,428 *** INFO [robust_forest.py:1153 - __init__()] *** *	Attacker: <robust_forest.Attacker object at 0x7f172184e668>
2019-06-02 13:13:34,429 *** INFO [robust_forest.py:1155 - __init__()] *** *	Splitting criterion: SSE
2019-06-02 13:13:34,430 *** INFO [robust_forest.py:1156 - __init__()] *** *	Max depth: 8
2019-06-02 13:13:34,431 *** INFO [robust_forest.py:1158 - __init__()] *** *	Min instances per tree node: 20
2019-06-02 13:13:34,432 *** INFO [robust_forest.py:1160 - __init__()] *** *	Max samples: 100.0%
2019-06-02 13:13:34,433 *** INFO [robust_forest.py:1162 - __init__()] *** *	Max features: 100.0%
2019-06-02 13:13:34,434 *** INFO [robust_forest.py:1164 - __init__()] *** *	Feature blacklist: set()
2019-06-02 13:13:34,435 *** INFO [robust_forest.py:1165 -

# Standard evaluation metric

The following function is the one used for evaluating the quality of the learned model (either _standard_, _adversarial-boosting_, or _non-interferent_). This is the standard <code>avg_log_loss</code>.

In [3]:
def logistic(x):
    """Logistic (sigmoid) function: map a raw score `x` to 1 / (1 + e^-x).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [4]:
def logit(p):
    """Log-odds of a probability `p`, i.e. log(p / (1 - p)).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [5]:
def binarize(preds):
    """Turn continuous predictions into hard labels in {-1.0, +1.0}.

    If any prediction is below -0.001 the values are thresholded at 0;
    otherwise they are thresholded at 0.5. Values on the threshold map to +1.0.
    """
    has_negative_scores = np.min(preds) < -0.001
    cutoff = 0 if has_negative_scores else .5
    return np.where(preds >= cutoff, 1.0, -1.0)

# <code>avg_log_loss</code>

In [6]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
def avg_log_loss(preds, train_data):
    """Average logistic loss log(1 + exp(-pred * label)) over all instances.

    Parameters
    ----------
    preds : array of predicted scores, one per instance.
    train_data : object exposing `get_label()`, which returns the true labels.
        NOTE(review): the formula is the binary log loss only for labels
        encoded as -1/+1 -- confirm against the data loader.

    Returns
    -------
    (name, value, is_higher_better) : ('avg_binary_log_loss', float, False)
    """
    labels = train_data.get_label()
    # log(1 + exp(z)) == logaddexp(0, z). The fused form stays finite when the
    # margin is strongly negative, where exp() alone would overflow to inf.
    losses = np.logaddexp(0.0, -preds * labels)
    avg_loss = np.mean(losses)

    return 'avg_binary_log_loss', avg_loss, False

In [7]:
def eval_log_loss(y_true, y_pred):
    """Average logistic loss log(1 + exp(-y_pred * y_true)).

    Parameters
    ----------
    y_true : array-like of true labels.
        NOTE(review): the formula is the binary log loss only for labels
        encoded as -1/+1 -- confirm against the data loader.
    y_pred : array-like of predicted scores, same length as `y_true`.

    Returns
    -------
    float : mean loss over all instances.
    """
    margins = np.asarray(y_pred) * np.asarray(y_true)
    # logaddexp(0, -m) == log(1 + exp(-m)) but does not overflow to inf when
    # a prediction is confidently wrong (large negative margin).
    losses = np.logaddexp(0.0, -margins)
    return np.mean(losses)

# Custom evaluation metric

Similarly to what we have done for <code>fobj</code>, <code>feval</code> can be computed from a weighted combination of two evaluation metrics:

-  <code>avg_log_loss</code> (standard, defined above);
-  <code>avg_log_loss_uma</code> (custom, defined below).

# <code>avg_log_loss_uma</code>

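This is the binary log loss, modified to operate on groups of perturbed instances: for each group the largest per-instance loss is taken, and these maxima are then averaged over all groups.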

In [8]:
# Our custom metric

def binary_log_loss(pred, true_label):
    """Logistic loss log(1 + exp(-pred * true_label)) of a single prediction.

    Also works element-wise on NumPy arrays.
    """
    # logaddexp(0, z) == log(1 + exp(z)) without overflowing for large z.
    return np.logaddexp(0.0, -pred * true_label)

# Custom evaluation metric, same calling convention as `avg_log_loss`:
# f(preds: array, train_data: Dataset) -> name: string, value: float, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    """Average, over groups, of the largest per-instance log loss in each group.

    `train_data.get_group()` supplies the group sizes; consecutive runs of
    `preds` / labels of those sizes form the groups. When no group information
    is available the reported value is 0.0.

    Returns
    -------
    (name, value, is_higher_better) :
        ('avg_binary_log_loss_under_max_attack', float, False)
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()

    metric_name = 'avg_binary_log_loss_under_max_attack'
    if attack_lens is None:
        return metric_name, 0.0, False

    worst_losses = []
    start = 0
    for group_size in attack_lens:
        stop = start + group_size
        group_losses = [binary_log_loss(score, label)
                        for score, label in zip(preds[start:stop], labels[start:stop])]
        # the worst case inside a group is the instance with the highest loss
        worst_losses.append(max(group_losses))
        start = stop

    return metric_name, np.mean(worst_losses), False

In [9]:
def eval_log_loss_uma(preds, test, test_groups=None, svm=False):
    """Compute `avg_log_loss_uma` for predictions on a pandas test frame.

    The frame's last column is used as the label and all other columns as
    features; they are wrapped in a `lightgbm.Dataset` together with
    `test_groups` so that the metric can read labels and group sizes from it.
    `svm` is accepted but not used.

    Returns only the numeric metric value.
    """
    feature_matrix = test.iloc[:, :-1].values
    label_vector = test.iloc[:, -1].values
    lgbm_test = lightgbm.Dataset(data=feature_matrix,
                                 label=label_vector,
                                 group=test_groups,
                                 free_raw_data=False)

    _name, metric_value, _higher_better = avg_log_loss_uma(preds, lgbm_test)
    return metric_value


# <code>eval_binary_err_rate</code>

In [10]:
def eval_binary_err_rate(y_true, y_pred):
    """Fraction of instances whose binarized prediction differs from `y_true`."""
    mismatches = binarize(y_pred) != y_true
    return np.sum(mismatches) / len(y_true)


# <code>eval_roc_auc</code>

In [11]:
def eval_roc_auc(y_true, y_pred):
    """Area under the ROC curve, using the raw (non-binarized) predictions as scores."""
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    return auc


# <code>eval_specificity</code>

In [12]:
def eval_specificity(y_true, y_pred):
    """Specificity (true-negative rate) tn / (tn + fp) of the binarized predictions.

    Labels are expected to be encoded as -1 (negative) / +1 (positive), which
    is the encoding produced by `binarize`.
    NOTE(review): confirm that `y_true` uses the same -1/+1 encoding.
    """
    y_pred = binarize(y_pred)
    # Pin the label set so the matrix is always 2x2. Without it, data in which
    # only one class occurs yields a 1x1 matrix and the 4-way unpack raises.
    tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[-1, 1]).ravel()

    # NOTE: evaluates to nan when there are no negative instances (tn + fp == 0).
    return tn/(tn + fp)

# <code>eval_precision</code>

In [13]:
def eval_precision(y_true, y_pred):
    """Weighted-average precision of the binarized predictions."""
    hard_labels = binarize(y_pred)
    return precision_score(y_true=y_true, y_pred=hard_labels, average='weighted')


# <code>eval_recall</code>

In [14]:
def eval_recall(y_true, y_pred):
    """Weighted-average recall of the binarized predictions."""
    hard_labels = binarize(y_pred)
    return recall_score(y_true=y_true, y_pred=hard_labels, average='weighted')


# <code>eval_f1</code>

In [15]:
def eval_f1(y_true, y_pred):
    """Weighted-average F1 score of the binarized predictions."""
    hard_labels = binarize(y_pred)
    return f1_score(y_true=y_true, y_pred=hard_labels, average='weighted')

# Evaluate each model w.r.t. _all_ evaluation metrics

In [68]:
import sklearn

def model_predict(model,test_set):
    """Return one prediction per row of `test_set` (last column = label, ignored).

    * scikit-learn `BaggingClassifier`: the second column of `predict_proba`,
      computed on the feature values as a NumPy array.
    * anything else: whatever `model.predict` returns for the feature frame
      (the code treats this case as a LightGBM model).

    Prints which of the two branches was taken.
    """
    feature_frame = test_set.iloc[:,:-1]

    if not isinstance(model, sklearn.ensemble.BaggingClassifier):
        print ("LightGBM")
        return model.predict(feature_frame)

    print ("BAGGING")
    class_probabilities = model.predict_proba(feature_frame.values)
    return class_probabilities[:,1]

def model_worst_predict(model, test_set, test_groups):
    """Collapse each group of rows to its label and its worst-case prediction.

    `test_groups` holds the sizes of consecutive row groups in `test_set`.
    For every group the label of its first row is taken as the group label;
    the worst prediction is the minimum over the group when that label is 1
    and the maximum otherwise.

    Returns
    -------
    (true_labels, worst_predictions) : two NumPy arrays with one entry per group.
    """
    labels = test_set.iloc[:,-1].values
    preds = model_predict(model, test_set)

    true_labels, worst_predictions = [], []
    start = 0
    for group_size in test_groups:
        group_label = labels[start]
        group_preds = preds[start:start + group_size]
        # for a positive instance the attacker wants the lowest score,
        # for any other label the highest one
        pick_worst = np.min if group_label == 1 else np.max

        true_labels.append(group_label)
        worst_predictions.append(pick_worst(group_preds))
        start += group_size

    return np.array(true_labels), np.array(worst_predictions)

In [69]:
def eval_learned_models(eval_metrics, model, model_type, test, test_groups=None, budget=0):
    """Evaluate one model with every metric and return a one-row DataFrame.

    Parameters
    ----------
    eval_metrics : list of callables `metric(y_true=..., y_pred=...)`; the
        column title of each is derived from its `__name__`.
    model : model accepted by `model_predict`.
    model_type : label stored in the 'Model' column and used in the log lines.
    test : pandas frame whose last column is the label.
    test_groups : optional group sizes; when given, the metrics are computed
        on the per-group worst-case predictions instead of the plain ones.
    budget : value stored in the 'Budget' column.

    Each metric value is also printed as it is computed.
    """
    def _column_title(metric):
        # e.g. eval_binary_err_rate -> "Binary Err Rate"
        return metric.__name__.replace('eval_','').replace('_',' ').strip().title()

    metric_columns = [_column_title(metric) for metric in eval_metrics]

    # one-row output frame; metric cells are filled in below
    df = pd.DataFrame(columns=['Model', 'Budget'] + metric_columns)
    df.loc[0] = [model_type, budget] + [None for _ in eval_metrics]

    # predictions for plain and attacked datasets
    if test_groups is not None:  # attacked: worst case per group
        y_true, y_pred = model_worst_predict(model, test, test_groups)
    else:  # not attacked
        y_true = test.iloc[:,-1].values
        y_pred = model_predict(model, test)

    for metric, column in zip(eval_metrics, metric_columns):
        res = metric(y_true=y_true, y_pred=y_pred)
        print("{} learning - {} = {:.5f}".format(model_type, metric.__name__, res))
        df[column] = res

    print("******************************************************************************************************")

    return df

# Load attacked datasets

## Load an attacked dataset with a specific budget

In [70]:
def load_attacked_dataset(budget):
    """Load the attacked train/valid/test splits generated for `budget`.

    File names come from the module-level `*_FILENAME_ATT` templates, whose
    `{}` placeholder is filled with `budget`.

    Returns
    -------
    (train_att, train_groups, valid_att, valid_groups, test_att, test_groups)
        Each `*_att` frame has its first column removed; each `*_groups` array
        holds the number of rows per `instance_id`, ordered by `instance_id`.
    """
    # load train/valid/test (attacked)
    train_att, valid_att, test_att = load_atk_train_valid_test(
        TRAINING_FILENAME_ATT.format(budget),
        VALIDATION_FILENAME_ATT.format(budget),
        TEST_FILENAME_ATT.format(budget))

    def _separate_groups(frame):
        # NOTE(review): presumably `instance_id` is the first column and rows
        # are stored contiguously in ascending instance_id order -- confirm.
        group_sizes = frame['instance_id'].value_counts().sort_index().values
        return frame.iloc[:, 1:], group_sizes

    test_att, test_groups = _separate_groups(test_att)
    valid_att, valid_groups = _separate_groups(valid_att)
    train_att, train_groups = _separate_groups(train_att)

    return train_att, train_groups, valid_att, valid_groups, test_att, test_groups

## Load _all_ the attacked datasets given a list of budgets

In [71]:
def load_attacked_datasets(budgets):
    """Map every budget in `budgets` to the tuple returned by `load_attacked_dataset`."""
    return {b: load_attacked_dataset(b) for b in budgets}

# Evaluate all models w.r.t. standard metrics (i.e., attack-free)

In [72]:
def extract_model_name(model_filename):
    """Build a display name for a model from its file name.

    The name is the text before the first '_' of the base file name (everything
    after the first '.' is ignored), title-cased. If the file name contains a
    '_B<integer>' token, ' [train budget=<integer>]' is appended.

    Example: '.../adv-boosting_census_B30_T200.model' -> 'Adv-Boosting [train budget=30]'
    """
    model_fileroot = model_filename.split('/')[-1].split('.')[0]
    model_name = model_fileroot.split('_')[0].title()

    # Text following the last '_B', up to the next '_'. When there is no '_B'
    # token this is just the first chunk of the name, which is not an integer.
    budget = model_fileroot.split('_B')[-1].split('_')[0]
    try:
        int(budget)
    except ValueError:
        # Not a budget token. Only ValueError is caught, so unrelated
        # errors such as KeyboardInterrupt are no longer swallowed.
        return model_name

    return model_name + ' [train budget={}]'.format(budget)

In [73]:
def load_model(model_file):
    """Load a model file, trying LightGBM first and a dill pickle second.

    Returns the loaded model, or None when both loaders fail (a message is
    printed for each failure).

    SECURITY: `dill.load` can execute arbitrary code contained in the file;
    only use this on model files from a trusted source.
    """
    try:
        return lightgbm.Booster(model_file=model_file)
    except Exception:
        # Not a LightGBM model file (or unreadable): fall back to dill.
        # `Exception` rather than a bare except, so Ctrl-C still interrupts.
        print("LightGBM loading exception")

    model = None
    try:
        with open(model_file, 'rb') as mf:
            model = dill.load(mf)
            print(model)
    except Exception as e:
        print(e)
        print("Dill loading exception")

    return model

In [74]:
def eval_all_models(eval_metrics, models_dir, test, model_filenames=None):
    """Evaluate several models on `test` (no attack) and stack the results.

    When `model_filenames` is None, one model is picked per `*.csv` file found
    in `models_dir`: the 'filename' of the row with the smallest 'metric'.
    Otherwise `models_dir` is not used.

    Returns a DataFrame with one row per model and a fresh 0..n-1 index.
    """
    if model_filenames is None:
        model_filenames = [
            pd.read_csv(csv_path).sort_values(by='metric')['filename'].iloc[0]
            for csv_path in sorted(glob.glob(models_dir + "/*.csv"))
        ]

    print ("### Evaluating Models:", model_filenames)

    per_model_rows = []
    for mf in model_filenames:
        per_model_rows.append(
            eval_learned_models(eval_metrics,
                                load_model(mf),
                                extract_model_name(mf),
                                test))

    stacked = pd.concat(per_model_rows, axis=0, sort=False)
    return stacked.reset_index(drop=True)

In [75]:
def eval_all_models_under_attack_budget(eval_metrics, models_dir, test, test_groups, budget, model_filenames=None):
    """Evaluate several models on one attacked test set and stack the results.

    `test` / `test_groups` are the attacked test frame and its group sizes;
    `budget` is only recorded in the 'Budget' column of the output.
    When `model_filenames` is None, one model is picked per `*.csv` file found
    in `models_dir`: the 'filename' of the row with the smallest 'metric'.

    Returns a DataFrame with one row per model and a fresh 0..n-1 index.
    """
    if model_filenames is None:
        model_filenames = []
        for csv_path in sorted(glob.glob(models_dir + "/*.csv")):
            ranked = pd.read_csv(csv_path).sort_values(by='metric')
            model_filenames.append(ranked['filename'].iloc[0])

    print ("### Evaluating Models:", model_filenames)

    per_model_rows = [
        eval_learned_models(eval_metrics,
                            load_model(mf),
                            extract_model_name(mf),
                            test,
                            test_groups,
                            budget=budget)
        for mf in model_filenames
    ]

    return pd.concat(per_model_rows, axis=0, sort=False).reset_index(drop=True)

In [76]:
def eval_all_models_under_attack(eval_metrics, models_dir, att_tests, budgets, model_filenames=None):
    """Evaluate the models under every attack budget and stack the results.

    Parameters
    ----------
    eval_metrics : list of metric callables.
    models_dir : directory scanned for models when `model_filenames` is None.
    att_tests : dict budget -> tuple as returned by `load_attacked_dataset`;
        items 4 and 5 (attacked test frame and its group sizes) are used.
    budgets : iterable of budgets to evaluate; each must be a key of `att_tests`.
    model_filenames : optional explicit list of model files.

    Returns a DataFrame with one row per (model, budget) and a fresh index.
    """
    eval_att_dfs = [
        eval_all_models_under_attack_budget(eval_metrics, models_dir,
                                            att_tests[b][4], att_tests[b][5],
                                            b, model_filenames)
        for b in budgets
    ]

    # The per-budget frames are stacked row-wise. The earlier pairwise
    # pd.merge on ['Model', 'Budget'] was removed: its result was discarded.
    eval_att_df = pd.concat(eval_att_dfs, axis=0, sort=False)
    eval_att_df.reset_index(inplace=True, drop=True)

    return eval_att_df

# Evaluation metrics

In [77]:
# Metrics computed for every model, in this order; each one becomes a column
# of the results table (column title derived from the function name).
EVAL_METRICS = [eval_log_loss, 
                eval_binary_err_rate,
                eval_specificity,
                eval_precision,
                eval_recall,
                eval_f1,
                eval_roc_auc
               ]


# Census

In [78]:
# Dataset selector: all paths below are derived from this name.
# NOTE: the "Wine" section further down reassigns every name defined in this
# cell, so results depend on which configuration cell was run last.
DATASET_NAME="census"
# Attack budgets to evaluate; each value fills the "{}" placeholder of the
# *_FILENAME_ATT templates below.
TRAINING_BUDGETS= [0]

DATASET_DIR="../data/{}".format(DATASET_NAME)
ATK_DIR=DATASET_DIR + "/attacks"
MODELS_DIR="../out/models/{}".format(DATASET_NAME)
# Results path without extension (".csv" is appended when saving).
OUTPUT_FILENAME="../out/results/{}".format(DATASET_NAME)

TRAINING_FILENAME=DATASET_DIR + "/" + "train.csv.bz2"
TRAINING_FILENAME_ATT=ATK_DIR + "/" + "train_B{}.atks.bz2"

VALIDATION_FILENAME=DATASET_DIR + "/" + "valid.csv.bz2"
VALIDATION_FILENAME_ATT=ATK_DIR + "/" + "valid_B{}.atks.bz2"

TEST_FILENAME=DATASET_DIR + "/" + "test.csv.bz2"
TEST_FILENAME_ATT=ATK_DIR + "/" + "test_B{}.atks.bz2"

In [102]:
# Final Models
# NOTE: `adv_models` is assigned three times in this cell; only the last
# assignment (the "10K instances" pair) is in effect afterwards.
# adv_models = ["../out/models/census/adv-boosting_census_B30_T200_S0050_L24_R200.model",
#               "../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R197.model"]
adv_models = ["../out/models/census/adv-boosting_census_B30_T200_S0050_L24_R200.model",
              "../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.model"]

# 20 trees
adv_models = ["../out/models/census/adv-boosting_census_B30_T200_S0050_L24_R200.T20.model",
              "../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.T20.model"]

# 10K instances
adv_models = ["../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R20.model",
              "../out/models/census/adv-boosting_census_B30_T200_S0050_L24_R41.model"]

gdbt_models = ["../out/models/census/std-gbdt_census_T100_S0050_L256_R100.model",
               "../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model"]

robust_models = ["../out/models/census/par-robust_census_B0_T100_D8_I20.model"]

random_forests = ["../out/models/census/rf-gbdt_census_T100_S0050_L24_R100.model",
                  "../out/models/census/rf-gbdt_census_T100_S0050_L256_R92.model"]

# Models that the evaluation cells below actually score; point this at one of
# the lists above to evaluate a different family.
test_models =  random_forests

In [103]:
# Without attacks
# Load the clean train/validation/test splits.
# (`load_atk_train_valid_test` is not defined in this notebook; presumably it
# comes from one of the star imports -- confirm.)
TRAIN, VALID, TEST = load_atk_train_valid_test(TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME)

# Score every model in `test_models` on the clean test set. Because an explicit
# file list is passed, MODELS_DIR is not scanned.
eval_std_df = eval_all_models(EVAL_METRICS, MODELS_DIR, TEST, test_models)
eval_std_df

Loading pre-processed files...
### Evaluating Models: ['../out/models/census/rf-gbdt_census_T100_S0050_L24_R100.model', '../out/models/census/rf-gbdt_census_T100_S0050_L256_R92.model']
LightGBM
Rf-Gbdt learning - eval_log_loss = 0.47460
Rf-Gbdt learning - eval_binary_err_rate = 0.13722
Rf-Gbdt learning - eval_specificity = 0.95431
Rf-Gbdt learning - eval_precision = 0.85798
Rf-Gbdt learning - eval_recall = 0.86278
Rf-Gbdt learning - eval_f1 = 0.85465
Rf-Gbdt learning - eval_roc_auc = 0.90927
******************************************************************************************************
LightGBM
Rf-Gbdt learning - eval_log_loss = 0.47109
Rf-Gbdt learning - eval_binary_err_rate = 0.13490
Rf-Gbdt learning - eval_specificity = 0.95064
Rf-Gbdt learning - eval_precision = 0.86015
Rf-Gbdt learning - eval_recall = 0.86510
Rf-Gbdt learning - eval_f1 = 0.85822
Rf-Gbdt learning - eval_roc_auc = 0.91383
****************************************************************************************

Unnamed: 0,Model,Budget,Log Loss,Binary Err Rate,Specificity,Precision,Recall,F1,Roc Auc
0,Rf-Gbdt,0,0.474601,0.137218,0.954306,0.857983,0.862782,0.854652,0.909271
1,Rf-Gbdt,0,0.471091,0.134896,0.950644,0.86015,0.865104,0.85822,0.913827


In [None]:
# %%capture tests

# With attacks
# Load the attacked splits for every budget in TRAINING_BUDGETS.
att_datasets = load_attacked_datasets(TRAINING_BUDGETS)

# The metric list must be the first argument; it was missing here, which
# shifted every positional argument by one.
eval_att_df = eval_all_models_under_attack(EVAL_METRICS, MODELS_DIR, att_datasets, TRAINING_BUDGETS,
                                           test_models)

# Stack the attack-free rows and the under-attack rows into one results table
# and save it as <OUTPUT_FILENAME>.csv.
overall_df = pd.concat([eval_std_df, eval_att_df], 
                       axis=0, 
                       sort=False)
overall_df.reset_index(inplace=True, drop=True)
overall_df.to_csv(OUTPUT_FILENAME + ".csv", sep=",", index=False)

overall_df

# Wine

In [None]:
# Dataset selector: all paths below are derived from this name.
# NOTE: this cell reassigns the names set in the "Census" configuration cell;
# every later cell uses whichever configuration was run last.
DATASET_NAME="wine"
# Attack budgets to evaluate; each value fills the "{}" placeholder of the
# *_FILENAME_ATT templates below.
TRAINING_BUDGETS= [30] 

DATASET_DIR="../data/{}".format(DATASET_NAME)
ATK_DIR=DATASET_DIR + "/attacks"
MODELS_DIR="../out/models/{}".format(DATASET_NAME)
# Results path without extension (".csv" is appended when saving).
OUTPUT_FILENAME="../out/results/{}".format(DATASET_NAME)

TRAINING_FILENAME=DATASET_DIR + "/" + "train.csv.bz2"
TRAINING_FILENAME_ATT=ATK_DIR + "/" + "train_B{}.atks.bz2"

VALIDATION_FILENAME=DATASET_DIR + "/" + "valid.csv.bz2"
VALIDATION_FILENAME_ATT=ATK_DIR + "/" + "valid_B{}.atks.bz2"

TEST_FILENAME=DATASET_DIR + "/" + "test.csv.bz2"
TEST_FILENAME_ATT=ATK_DIR + "/" + "test_B{}.atks.bz2"

In [None]:
# Final Models
# NOTE: `gdbt_models` and `robust_models` are assigned more than once in this
# cell; only the last assignment of each is in effect afterwards.
adv_models = ["../out/models/wine/adv-boosting_wine_B30_T200_S0050_L24_R167.model",
              "../out/models/wine/adv-boosting_wine_B60_T200_S0050_L24_R200.model"]

gdbt_models = ["../out/models/wine/std-gbdt_wine_T500_S0050_L24_R497.model",
               "../out/models/wine/red-gbdt_wine_T500_S0050_L24_R497.model"]

gdbt_models = ["../out/models/wine/std-gbdt_wine_T200_S0050_L24_R199.model"]

robust_models = ["../out/models/wine/robust_wine_B30_T50_D8_I20.model",
                 "../out/models/wine/robust_wine_B60_T50_D8_I20.model"]
# robust_models = ["../out/models/wine/robust_wine_B30_T50_D8_I20_1.model",
#                  "../out/models/wine/robust_wine_B60_T50_D8_I20_1.model"]
robust_models = ["../out/models/wine/par-robust_wine_B30_T100_D2_I20.model"]

# Models that the evaluation cells below actually score.
# The closing quote was missing here, which made the whole cell a SyntaxError.
# NOTE(review): this path points at a *census* model although this is the Wine
# section -- confirm whether one of the wine lists above was intended.
test_models = ["../out/models/census/par-robust_census_B0_T100_D8_I20.model"]



In [None]:
# Without attacks
# Load the clean train/validation/test splits for the current configuration.
TRAIN, VALID, TEST = load_atk_train_valid_test(TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME)

# Score every model in `test_models` on the clean test set. Because an explicit
# file list is passed, MODELS_DIR is not scanned.
eval_std_df = eval_all_models(EVAL_METRICS, MODELS_DIR, TEST, test_models)
eval_std_df

In [None]:
# With attacks
# Load the attacked splits for every budget in TRAINING_BUDGETS.
att_datasets = load_attacked_datasets(TRAINING_BUDGETS)

# Worst-case evaluation of every model in `test_models` under each budget.
eval_att_df = eval_all_models_under_attack(EVAL_METRICS, MODELS_DIR, att_datasets, TRAINING_BUDGETS,
                                           test_models)

# Stack the attack-free rows and the under-attack rows into one results table
# and save it as <OUTPUT_FILENAME>.csv.
overall_df = pd.concat([eval_std_df, eval_att_df], 
                       axis=0, 
                       sort=False)
overall_df.reset_index(inplace=True, drop=True)
overall_df.to_csv(OUTPUT_FILENAME + ".csv", sep=",", index=False)

overall_df

In [None]:
# Display the combined results table again (needs a cell above that builds it).
overall_df

# Prune Robust models

In [None]:
# Model files to prune.
to_be_pruned_models = ["../out/models/census/robust_census_B0_T100_D8_I20_20.tmp"]

# `prune_trained_model` is not defined in this notebook; presumably it comes
# from one of the star imports, and 20 is presumably the number of trees to
# keep -- confirm against its definition.
for m in to_be_pruned_models:
    prune_trained_model(m, 20)


# Prune LGBM models

In [None]:
def prune_lgbm(in_file, out_file, n):
    """Load the LightGBM model stored in `in_file` and save it again to
    `out_file`, passing `n` as `num_iteration` to `save_model`.

    Prints "saved." once the file has been written.
    """
    booster = lightgbm.Booster(model_file=in_file)
    booster.save_model(out_file, num_iteration=n)
    print ("saved.")
    
# Write a copy of the standard wine GBDT limited to num_iteration=10
# (the ".T10" suffix in the output name records that limit).
prune_lgbm("../out/models/wine/std-gbdt_wine_T200_S0050_L24_R199.model",
           "../out/models/wine/std-gbdt_wine_T200_S0050_L24_R199.T10.model",
           10)
# prune_lgbm("../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.model",
#            "../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.T20.model",
#            20)

# Feature importance check

In [None]:
def print_fx_imp(model, colnames):
    """Print the features of `model` ranked by decreasing 'gain' importance.

    One line per feature: rank, feature name (looked up in `colnames` by
    feature index), gain importance, and 'split' importance.
    """
    split_counts = model.feature_importance(importance_type='split')
    gains = model.feature_importance(importance_type='gain')

    ranking = np.argsort(gains)[::-1]  # feature indices, highest gain first
    for rank, feature_idx in enumerate(ranking):
        row = "{:2d} {:20s} {:.3f} {:4d}".format(rank,
                                                 colnames[feature_idx],
                                                 gains[feature_idx],
                                                 split_counts[feature_idx])
        print (row)

# Compare feature importances of three census models.
# NOTE: relies on `TRAIN` from an earlier cell; feature names are looked up by
# position in its columns, so TRAIN must be the census training frame.
print(" -- GDBT --")    
gbdt = lightgbm.Booster(model_file="../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model")
print(gbdt.num_trees())
print_fx_imp(gbdt, TRAIN.columns)

print(" -- Reduced GDBT --")    
redf = lightgbm.Booster(model_file="../out/models/census/red-gbdt_census_T100_S0050_L24_R98.model")
print(redf.num_trees())
# The listed columns are dropped before the name lookup; presumably the
# reduced model was trained without them -- confirm.
print_fx_imp(redf, TRAIN.drop(columns=["workclass", 
                                       "marital_status", 
                                       "occupation", 
                                       "education_num", 
                                       "hours_per_week", 
                                       "capital_gain"
                                      ]).columns)


print(" -- Adv. Boosting --")    
advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
print(advb.num_trees())
print_fx_imp(advb, TRAIN.columns)


In [None]:
# Ad-hoc evaluation of one reduced GBDT model under attack budget `bb`.
# NOTE(review): needs `att_datasets` to contain budget 40 for the "wine2"
# dataset; no configuration cell in this notebook loads that -- confirm.
bb = 40
# The metric list must be the first argument of eval_learned_models; it was
# missing here, which shifted every positional argument by one.
eval_learned_models(EVAL_METRICS,
                    lightgbm.Booster(model_file="../out/models/wine2/red-gbdt_wine2_T500_S0050_L24_R281.model"), 
                    extract_model_name("../out/models/wine2/red-gbdt_wine2_T500_S0050_L24_R281.model"), 
                    # the listed feature columns are removed before prediction
                    att_datasets[bb][4].drop(columns=["alcohol", "residual_sugar", "volatile_acidity"]), 
                    att_datasets[bb][5], 
                    budget=bb
                   ) 

In [None]:
# Show the last lines of the serialized robust census model file.
!cat ../out/models/census/par-robust_census_B0_T100_D8_I20.model | tail

In [None]:
# Repository housekeeping (not part of the evaluation): update the working copy.
!git pull

In [None]:
# Repository housekeeping: commit all tracked changes. Running this cell
# creates a commit every time it is executed.
!git commit -am "calza"

In [None]:
# Repository housekeeping: update the working copy again before pushing.
!git pull

In [None]:
# Repository housekeeping: publish local commits to the remote.
!git push