# Evaluating Models

This notebook contains the code used for evaluating the following learning models:

-  **Standard GBDT** (_baseline 1_)
-  **Adversarial Boosting** (_baseline 2_)
-  **Non-Interferent GBDT** (our proposal)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import glob
import pickle
import dill
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm
import functools
from os import listdir
from os.path import isfile, join
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, precision_score, recall_score
from nilib import *
from robust_forest import *
from sklearn.ensemble import BaggingClassifier

2019-06-04 21:24:17,168 *** INFO [robust_forest.py:1151 - __init__()] *** ***** Robust Decision Tree successfully created *****
2019-06-04 21:24:17,169 *** INFO [robust_forest.py:1152 - __init__()] *** *	Tree ID: 0
2019-06-04 21:24:17,169 *** INFO [robust_forest.py:1153 - __init__()] *** *	Attacker: <robust_forest.Attacker object at 0x7f162928b6a0>
2019-06-04 21:24:17,170 *** INFO [robust_forest.py:1155 - __init__()] *** *	Splitting criterion: SSE
2019-06-04 21:24:17,170 *** INFO [robust_forest.py:1156 - __init__()] *** *	Max depth: 8
2019-06-04 21:24:17,171 *** INFO [robust_forest.py:1158 - __init__()] *** *	Min instances per tree node: 20
2019-06-04 21:24:17,171 *** INFO [robust_forest.py:1160 - __init__()] *** *	Max samples: 100.0%
2019-06-04 21:24:17,172 *** INFO [robust_forest.py:1162 - __init__()] *** *	Max features: 100.0%
2019-06-04 21:24:17,172 *** INFO [robust_forest.py:1164 - __init__()] *** *	Feature blacklist: set()
2019-06-04 21:24:17,173 *** INFO [robust_forest.py:1165 -

# Standard evaluation metric

The following function is the one used for evaluating the quality of the learned model (either _standard_, _adversarial-boosting_, or _non-interferent_). This is the standard <code>avg_log_loss</code>.

In [3]:
def logistic(x):
    """Standard logistic (sigmoid) function, 1 / (1 + exp(-x)).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [4]:
def logit(p):
    """Log-odds of ``p``: log(p / (1 - p)).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [5]:
def binarize(preds):
    """Turn real-valued predictions into hard labels in {-1.0, +1.0}.

    The decision threshold is chosen heuristically from the data: if any
    prediction is below -0.001 the values are cut at 0, otherwise they are
    cut at 0.5.
    """
    threshold = 0 if np.min(preds) < -0.001 else .5
    return np.where(preds >= threshold, 1.0, -1.0)

# <code>avg_log_loss</code>

In [6]:
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss(preds, train_data):
    """Average logistic loss, in the (name, value, is_higher_better) eval format.

    The per-instance loss is log(1 + exp(-pred * label)); labels are read
    from ``train_data.get_label()``.
    NOTE(review): the formula presumes labels encoded as -1/+1 -- confirm.
    """
    y = train_data.get_label()
    per_instance = np.log(1.0 + np.exp(-preds*y))
    mean_loss = np.mean(per_instance)

    # Lower is better, hence the trailing False.
    return 'avg_binary_log_loss', mean_loss, False

In [7]:
def eval_log_loss(y_true, y_pred):
    """Mean of log(1 + exp(-y_pred * y_true)) over all instances."""
    per_instance = np.log(1.0 + np.exp(-y_pred*y_true))
    return np.mean(per_instance)

# Custom evaluation metric

Similarly to what we have done for <code>fobj</code>, <code>feval</code> can be computed from a weighted combination of two evaluation metrics:

-  <code>avg_log_loss</code> (standard, defined above);
-  <code>avg_log_loss_uma</code> (custom, defined below).

# <code>avg_log_loss_uma</code>

This is the binary log loss, modified to operate on groups of perturbed instances: within each group only the largest loss is kept, and these per-group maxima are then averaged.

In [8]:
# Our custom metric

def binary_log_loss(pred, true_label):
    """Logistic loss of one prediction: log(1 + exp(-pred * true_label))."""
    margin = pred * true_label
    return np.log(1.0 + np.exp(-margin))

# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, value: array, is_higher_better: bool
def avg_log_loss_uma(preds, train_data):
    """Average, over groups, of the largest binary log loss inside each group.

    Group sizes come from ``train_data.get_group()`` and are consumed in
    order: the first ``g0`` predictions form group 0, the next ``g1`` form
    group 1, and so on. When the dataset carries no group information the
    reported value is 0.0.

    Returns the (name, value, is_higher_better) triple used for eval metrics.
    """
    labels = train_data.get_label()
    attack_lens = train_data.get_group()
    metric_name = 'avg_binary_log_loss_under_max_attack'

    if attack_lens is None:
        return metric_name, 0.0, False

    worst_losses = []
    start = 0
    for group_size in attack_lens:
        stop = start + group_size
        group_losses = [binary_log_loss(h, t)
                        for h, t in zip(preds[start:stop], labels[start:stop])]
        # Only the most damaging instance of the group counts.
        worst_losses.append(max(group_losses))
        start = stop

    return metric_name, np.mean(worst_losses), False

In [9]:
def eval_log_loss_uma(preds, test, test_groups=None, svm=False):
    """Value of ``avg_log_loss_uma`` for ``preds`` on the ``test`` frame.

    The last column of ``test`` is used as the label and all other columns as
    features; they are wrapped, together with ``test_groups``, in a LightGBM
    dataset so the metric can read labels and groups from it.
    The ``svm`` argument is accepted but not used.
    """
    features = test.iloc[:,:-1].values
    labels = test.iloc[:,-1].values
    grouped_test = lightgbm.Dataset(data=features,
                                    label=labels,
                                    group=test_groups,
                                    free_raw_data=False)

    _, value, _ = avg_log_loss_uma(preds, grouped_test)
    return value


# <code>eval_binary_err_rate</code>

In [10]:
def eval_binary_err_rate(y_true, y_pred):
    """Fraction of instances whose binarized prediction differs from ``y_true``."""
    mismatches = binarize(y_pred) != y_true
    return np.sum(mismatches)/len(y_true)


# <code>eval_roc_auc</code>

In [11]:
def eval_roc_auc(y_true, y_pred):
    """Area under the ROC curve, computed on the raw (non-binarized) scores."""
    auc = roc_auc_score(y_true=y_true, y_score=y_pred)
    return auc


# <code>eval_specificity</code>

In [12]:
def eval_specificity(y_true, y_pred):
    """True-negative rate, tn / (tn + fp), of the binarized predictions.

    The confusion matrix is unpacked as four values, so exactly two classes
    must appear across ``y_true`` and the binarized predictions.
    """
    hard_labels = binarize(y_pred)
    counts = confusion_matrix(y_true=y_true, y_pred=hard_labels)
    tn, fp, fn, tp = counts.ravel()

    return tn/(tn + fp)

# <code>eval_precision</code>

In [13]:
def eval_precision(y_true, y_pred):
    """Precision of the binarized predictions, using ``average='weighted'``."""
    hard_labels = binarize(y_pred)
    score = precision_score(y_true=y_true, y_pred=hard_labels, average='weighted')
    return score


# <code>eval_recall</code>

In [14]:
def eval_recall(y_true, y_pred):
    """Recall of the binarized predictions, using ``average='weighted'``."""
    hard_labels = binarize(y_pred)
    score = recall_score(y_true=y_true, y_pred=hard_labels, average='weighted')
    return score


# <code>eval_f1</code>

In [15]:
def eval_f1(y_true, y_pred):
    """F1 score of the binarized predictions, using ``average='weighted'``."""
    hard_labels = binarize(y_pred)
    score = f1_score(y_true=y_true, y_pred=hard_labels, average='weighted')
    return score

# Evaluate each model w.r.t. _all_ evaluation metrics

In [16]:
import sklearn

def model_predict(model,test_set):
    """Score every row of ``test_set`` (all columns but the last) with ``model``.

    A scikit-learn ``BaggingClassifier`` is scored with ``predict_proba`` and
    the second probability column is returned; any other model is treated as
    a LightGBM model and scored with ``predict``. The chosen branch is
    printed.
    """
    features = test_set.iloc[:,:-1]

    if not isinstance(model, sklearn.ensemble.BaggingClassifier):
        print ("LightGBM")
        # This branch receives the DataFrame itself, not a bare array.
        return model.predict(features)

    print ("BAGGING")
    return model.predict_proba(features.values)[:,1]

def model_worst_predict(model, test_set, test_groups):
    """Reduce each group of consecutive rows to its label and worst prediction.

    ``test_groups`` lists the sizes of consecutive row groups in ``test_set``.
    The label of a group is taken from its first row. The "worst" prediction
    is the minimum score when that label equals 1 and the maximum otherwise.

    Returns a pair of arrays ``(true_labels, worst_predictions)`` with one
    entry per group.
    """
    labels = test_set.iloc[:,-1].values
    preds = model_predict(model, test_set)

    true_labels, worst_predictions = [], []
    start = 0
    for size in test_groups:
        group_label = labels[start]
        group_preds = preds[start:start+size]

        # Pick the score that hurts the most for this label.
        pick_worst = np.min if group_label == 1 else np.max

        true_labels.append(group_label)
        worst_predictions.append(pick_worst(group_preds))
        start += size

    return np.array(true_labels), np.array(worst_predictions)

In [17]:
def eval_learned_models(eval_metrics, model, model_type, test, test_groups=None, budget=0):
    """Evaluate one model with every metric and return a one-row DataFrame.

    Columns are 'Model', 'Budget' and one column per metric, titled after
    the metric function name (``eval_log_loss`` -> ``Log Loss``). Without
    ``test_groups`` the model is scored on ``test`` as is; with groups, each
    group is first reduced to its worst prediction. Every metric value is
    also printed.
    """
    def _column_title(metric):
        # eval_binary_err_rate -> "Binary Err Rate"
        return metric.__name__.replace('eval_','').replace('_',' ').strip().title()

    # Output frame with a single row; metric cells are filled in below.
    header = ['Model', 'Budget'] + [_column_title(m) for m in eval_metrics]
    df = pd.DataFrame(columns=header)
    df.loc[0] = [model_type, budget] + [None] * len(eval_metrics)

    # Predictions for the plain or the attacked dataset.
    if test_groups is not None:  # attacked
        y_true, y_pred = model_worst_predict(model, test, test_groups)
    else:  # not attacked
        y_true = test.iloc[:,-1].values
        y_pred = model_predict(model, test)

    for eval_metric in eval_metrics:
        res = eval_metric(y_true=y_true, y_pred=y_pred)
        print("{} learning - {} = {:.5f}"
                  .format(model_type, eval_metric.__name__, res))
        df[_column_title(eval_metric)] = res

    print("******************************************************************************************************")

    return df

# Load attacked datasets

## Load an attacked dataset with a specific budget

In [18]:
def load_attacked_dataset(budget):
    """Load the attacked train/valid/test splits generated with ``budget``.

    File names are built by formatting the ``*_FILENAME_ATT`` templates with
    the budget. For each split, group sizes are the number of rows per
    ``instance_id`` (ordered by ascending id), and the first column is then
    dropped from the frame.
    NOTE(review): the group sizes line up with the rows only if the file
    stores instances in ascending ``instance_id`` order -- confirm.

    Returns ``(train, train_groups, valid, valid_groups, test, test_groups)``.
    """
    def _peel_groups(frame):
        sizes = frame['instance_id'].value_counts().sort_index().values
        return frame.iloc[:, 1:], sizes

    # load train/valid/test (attacked)
    train_att, valid_att, test_att = load_atk_train_valid_test(
        TRAINING_FILENAME_ATT.format(budget),
        VALIDATION_FILENAME_ATT.format(budget),
        TEST_FILENAME_ATT.format(budget))

    train_att, train_groups = _peel_groups(train_att)
    valid_att, valid_groups = _peel_groups(valid_att)
    test_att, test_groups = _peel_groups(test_att)

    return train_att, train_groups, valid_att, valid_groups, test_att, test_groups

## Load _all_ the attacked datasets given a list of budgets

In [19]:
def load_attacked_datasets(budgets):
    """Map each budget in ``budgets`` to the tuple from ``load_attacked_dataset``."""
    return {b: load_attacked_dataset(b) for b in budgets}

# Evaluate all models w.r.t. standard metrics (i.e., attack-free)

In [20]:
def extract_model_name(model_filename):
    """Build a display name for a model from its file name.

    The name is the title-cased text before the first underscore of the file
    root (``adv-boosting_wine_...`` -> ``Adv-Boosting``). When the file root
    contains a ``_B<int>`` token, `` [train budget=<int>]`` is appended.

    Args:
        model_filename: '/'-separated path to a model file.

    Returns:
        The display name as a string.
    """
    model_fileroot = model_filename.split('/')[-1].split('.')[0]
    model_name = model_fileroot.split('_')[0].title()

    # Text between the last "_B" and the following "_". If there is no "_B"
    # this is simply the first token, which will not parse as an integer.
    budget = model_fileroot.split('_B')[-1].split('_')[0]
    try:
        int(budget)
    except ValueError:
        # No training budget encoded in the file name.
        return model_name

    return model_name + ' [train budget={}]'.format(budget)

In [22]:
def load_model(model_file):
    """Load a model file, trying LightGBM first and dill second.

    If ``lightgbm.Booster`` cannot read the file, it is unpickled with dill
    instead; the unpickled model is printed and its ``n_jobs`` is set to 8.
    Loading is best-effort: failures are printed rather than raised.

    SECURITY: ``dill.load`` can execute arbitrary code embedded in the file;
    only load model files from a trusted source.

    Returns:
        The loaded model, or ``None`` when both loaders fail.
    """
    model = None
    try:
        model = lightgbm.Booster(model_file=model_file)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        print("LightGBM loading exception")
        try:
            with open(model_file, 'rb') as mf:
                model = dill.load(mf)
                print(model)
                model.n_jobs = 8
        except Exception as e:
            print(e)
            print("Dill loading exception")

    return model

In [23]:
def eval_all_models(eval_metrics, models_dir, test, model_filenames=None):
    """Evaluate several models on ``test`` (no attack) and stack the results.

    When ``model_filenames`` is not given, one model is selected per ``*.csv``
    file found in ``models_dir``: the 'filename' of the row with the smallest
    'metric' value. Each model is loaded, named after its file and evaluated
    with all ``eval_metrics``.

    Returns a DataFrame with one row per model and a fresh 0..n-1 index.
    """
    if model_filenames is None:
        model_filenames = [
            pd.read_csv(csv_path).sort_values(by='metric')['filename'].iloc[0]
            for csv_path in sorted(glob.glob(models_dir + "/*.csv"))
        ]

    print ("### Evaluating Models:", model_filenames)

    per_model_rows = [
        eval_learned_models(eval_metrics,
                            load_model(mf),
                            extract_model_name(mf),
                            test)
        for mf in model_filenames
    ]

    stacked = pd.concat(per_model_rows, axis=0, sort=False)
    return stacked.reset_index(drop=True)

In [24]:
def eval_all_models_under_attack_budget(eval_metrics, models_dir, test, test_groups, budget, model_filenames=None):
    """Evaluate several models on an attacked test set for one attack budget.

    ``test`` holds the attacked instances and ``test_groups`` the sizes of
    the consecutive row groups in it. When ``model_filenames`` is not given,
    one model is selected per ``*.csv`` file found in ``models_dir``: the
    'filename' of the row with the smallest 'metric' value.

    Returns a DataFrame with one row per model (its 'Budget' column set to
    ``budget``) and a fresh 0..n-1 index.
    """
    if model_filenames is None:
        model_filenames = [
            pd.read_csv(csv_path).sort_values(by='metric')['filename'].iloc[0]
            for csv_path in sorted(glob.glob(models_dir + "/*.csv"))
        ]

    print ("### Evaluating Models:", model_filenames)

    per_model_rows = [
        eval_learned_models(eval_metrics,
                            load_model(mf),
                            extract_model_name(mf),
                            test,
                            test_groups,
                            budget=budget)
        for mf in model_filenames
    ]

    stacked = pd.concat(per_model_rows, axis=0, sort=False)
    return stacked.reset_index(drop=True)

In [25]:
def eval_all_models_under_attack(eval_metrics, models_dir, att_tests, budgets, model_filenames=None):
    """Evaluate all models under every attack budget and stack the results.

    Args:
        eval_metrics: metric functions forwarded to the per-budget evaluation.
        models_dir: directory searched for models when ``model_filenames`` is None.
        att_tests: mapping budget -> tuple whose items 4 and 5 are the
            attacked test frame and its group sizes.
        budgets: attack budgets to evaluate, in output order.
        model_filenames: optional explicit list of model files.

    Returns:
        A DataFrame with the rows of every budget stacked vertically and a
        fresh 0..n-1 index.
    """
    # The per-budget frames are only concatenated. A previous
    # functools.reduce(pd.merge, ...) over them was discarded immediately
    # afterwards, so it has been dropped.
    eval_att_dfs = [
        eval_all_models_under_attack_budget(eval_metrics, models_dir,
                                            att_tests[b][4], att_tests[b][5],
                                            b, model_filenames)
        for b in budgets
    ]

    eval_att_df = pd.concat(eval_att_dfs, axis=0, sort=False)
    return eval_att_df.reset_index(drop=True)

# Evaluation metrics

In [26]:
# Metric functions applied to every model, in output-column order.
# Each one is called as metric(y_true=..., y_pred=...).
EVAL_METRICS = [eval_log_loss, 
                eval_binary_err_rate,
                eval_specificity,
                eval_precision,
                eval_recall,
                eval_f1,
                eval_roc_auc
               ]


# Census

In [None]:
# Dataset evaluated by the cells of this section and the attack budgets used.
DATASET_NAME = "census"
TRAINING_BUDGETS = [30, 60]

# Locations derived from the dataset name.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
ATK_DIR = DATASET_DIR + "/attacks"
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)
OUTPUT_FILENAME = "../out/results/{}".format(DATASET_NAME)

# Attack-free train/validation/test splits.
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Attacked splits: the "{}" left in each template is filled with the budget.
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

In [None]:
# Final Models
# Adversarial-boosting models.
adv_models = ["../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R99.model",
              "../out/models/census/adv-boosting_census_B30_T100_S0050_L256_R100.model",
              "../out/models/census/adv-boosting_census_B60_T100_S0050_L24_R96.model",
              "../out/models/census/adv-boosting_census_B60_T100_S0050_L256_R96.model"
             ]

# Standard GBDT models.
gdbt_models = ["../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model",
               "../out/models/census/std-gbdt_census_T100_S0050_L256_R100.model"]

# "red-gbdt" models: defined here but left out of test_models below.
red_models = ["../out/models/census/red-gbdt_census_T100_S0050_L24_R95.model",
             "../out/models/census/red-gbdt_census_T100_S0050_L256_R93.model"]

# "rf-gbdt" models.
rf_models = ["../out/models/census/rf-gbdt_census_T100_S0050_L24_R100.model",
             "../out/models/census/rf-gbdt_census_T100_S0050_L256_R92.model"]



# "par-robust" model; the evaluation code loads files that LightGBM cannot read via dill.
robust_models = ["../out/models/census/par-robust_census_B0_T100_D8_I20.model"]

# Models actually evaluated by the cells below.
test_models =  adv_models + gdbt_models + rf_models + robust_models

# REDUCED are not working any more??!?
#test_models = red_models

In [None]:
# Without attacks
# Load the attack-free splits and score every model in test_models on TEST.
TRAIN, VALID, TEST = load_atk_train_valid_test(TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME)

eval_std_df = eval_all_models(EVAL_METRICS, MODELS_DIR, TEST, test_models)
eval_std_df

In [None]:
# %%capture tests

# With attacks
# Load the attacked splits of every budget and score all models under attack.
att_datasets = load_attacked_datasets(TRAINING_BUDGETS)
eval_att_df = eval_all_models_under_attack(
    EVAL_METRICS, MODELS_DIR, att_datasets, TRAINING_BUDGETS, test_models)

# Stack attack-free and under-attack results, then save them as CSV.
# eval_std_df comes from the "Without attacks" cell above.
overall_df = pd.concat([eval_std_df, eval_att_df], axis=0, sort=False).reset_index(drop=True)
overall_df.to_csv(OUTPUT_FILENAME + ".csv", sep=",", index=False)

overall_df

In [None]:
# Display the combined census results table again.
overall_df

# Wine

In [27]:
# Dataset evaluated by the cells of this section and the attack budgets used.
DATASET_NAME = "wine"
TRAINING_BUDGETS = [30, 60]

# Locations derived from the dataset name.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
ATK_DIR = DATASET_DIR + "/attacks"
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)
OUTPUT_FILENAME = "../out/results/{}".format(DATASET_NAME)

# Attack-free train/validation/test splits.
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Attacked splits: the "{}" left in each template is filled with the budget.
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

In [28]:
# Final Models
# Adversarial-boosting models.
adv_models = ["../out/models/wine/adv-boosting_wine_B30_T100_S0050_L24_R100.model",
              "../out/models/wine/adv-boosting_wine_B30_T100_S0050_L256_R100.model",
              "../out/models/wine/adv-boosting_wine_B60_T100_S0050_L24_R82.model",
              "../out/models/wine/adv-boosting_wine_B60_T100_S0050_L256_R85.model"
             ]

# Standard GBDT models.
gdbt_models = ["../out/models/wine/std-gbdt_wine_T100_S0050_L24_R100.model",
               "../out/models/wine/std-gbdt_wine_T100_S0050_L256_R97.model"]

# "red-gbdt" models.
red_models = ["../out/models/wine/red-gbdt_wine_T100_S0050_L24_R99.model",
             "../out/models/wine/red-gbdt_wine_T100_S0050_L256_R100.model"]

# "rf-gbdt" models.
rf_models = ["../out/models/wine/rf-gbdt_wine_T100_S0050_L24_R100.model",
             "../out/models/wine/rf-gbdt_wine_T100_S0050_L256_R100.model"]

# "par-robust" models.
robust_models = ["../out/models/wine/par-robust_wine_B0_T100_D8_I20.model",
                "../out/models/wine/par-robust_wine_B60_T100_D8_I20.model",
                "../out/models/wine/par-robust_wine_B30_T100_D8_I20.model"]

# test_models is assigned twice and the second assignment wins, so only the
# par-robust models are evaluated by the cells below.
test_models = adv_models + gdbt_models + red_models + rf_models
test_models = robust_models


In [29]:
# Without attacks
# Load the attack-free splits and score every model in test_models on TEST.
TRAIN, VALID, TEST = load_atk_train_valid_test(TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME)

eval_std_df = eval_all_models(EVAL_METRICS, MODELS_DIR, TEST, test_models)
eval_std_df

2019-06-04 21:24:56,382 *** INFO [parallel_robust_forest.py:1214 - __init__()] *** ***** Robust Decision Tree successfully created *****
2019-06-04 21:24:56,384 *** INFO [parallel_robust_forest.py:1215 - __init__()] *** *	Tree ID: 0
2019-06-04 21:24:56,385 *** INFO [parallel_robust_forest.py:1216 - __init__()] *** *	Attacker: <parallel_robust_forest.Attacker object at 0x7f1621411f60>
2019-06-04 21:24:56,385 *** INFO [parallel_robust_forest.py:1218 - __init__()] *** *	Splitting criterion: SSE
2019-06-04 21:24:56,386 *** INFO [parallel_robust_forest.py:1219 - __init__()] *** *	Max depth: 8
2019-06-04 21:24:56,387 *** INFO [parallel_robust_forest.py:1221 - __init__()] *** *	Min instances per tree node: 20
2019-06-04 21:24:56,388 *** INFO [parallel_robust_forest.py:1223 - __init__()] *** *	Max samples: 100.0%
2019-06-04 21:24:56,389 *** INFO [parallel_robust_forest.py:1225 - __init__()] *** *	Max features: 100.0%
2019-06-04 21:24:56,390 *** INFO [parallel_robust_forest.py:1227 - __init__()

Loading pre-processed files...
### Evaluating Models: ['../out/models/wine/par-robust_wine_B0_T100_D8_I20.model', '../out/models/wine/par-robust_wine_B60_T100_D8_I20.model', '../out/models/wine/par-robust_wine_B30_T100_D8_I20.model']
LightGBM loading exception
BaggingClassifier(base_estimator=RobustDecisionTree(attacker=None, feature_blacklist={}, max_depth=8,
          max_features=0.8, max_samples=0.8, min_instances_per_node=20,
          replace_features=False, replace_samples=False, seed=0,
          split_optimizer=None, tree_id=0),
         bootstrap=False, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)
BAGGING
Par-Robust [train budget=0] learning - eval_log_loss = 0.60006
Par-Robust [train budget=0] learning - eval_binary_err_rate = 0.24154
Par-Robust [train budget=0] learning - eval_specificity = 0.62055
Par-Robust [train budget=0] learning - eval_prec

Unnamed: 0,Model,Budget,Log Loss,Binary Err Rate,Specificity,Precision,Recall,F1,Roc Auc
0,Par-Robust [train budget=0],0,0.600058,0.241538,0.620545,0.754687,0.758462,0.755485,0.825252
1,Par-Robust [train budget=60],0,0.598853,0.236923,0.647799,0.760613,0.763077,0.761476,0.830038
2,Par-Robust [train budget=30],0,0.599261,0.229231,0.641509,0.767518,0.770769,0.768211,0.830762


In [30]:
# With attacks
# Load the attacked splits of every budget and score all models under attack.
att_datasets = load_attacked_datasets(TRAINING_BUDGETS)
eval_att_df = eval_all_models_under_attack(
    EVAL_METRICS, MODELS_DIR, att_datasets, TRAINING_BUDGETS, test_models)

# Stack attack-free and under-attack results, then save them as CSV.
# eval_std_df comes from the "Without attacks" cell above.
overall_df = pd.concat([eval_std_df, eval_att_df], axis=0, sort=False).reset_index(drop=True)
overall_df.to_csv(OUTPUT_FILENAME + ".csv", sep=",", index=False)

overall_df

Loading pre-processed files...
Loading pre-processed files...
### Evaluating Models: ['../out/models/wine/par-robust_wine_B0_T100_D8_I20.model', '../out/models/wine/par-robust_wine_B60_T100_D8_I20.model', '../out/models/wine/par-robust_wine_B30_T100_D8_I20.model']
LightGBM loading exception
BaggingClassifier(base_estimator=RobustDecisionTree(attacker=None, feature_blacklist={}, max_depth=8,
          max_features=0.8, max_samples=0.8, min_instances_per_node=20,
          replace_features=False, replace_samples=False, seed=0,
          split_optimizer=None, tree_id=0),
         bootstrap=False, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)
BAGGING
Par-Robust [train budget=0] learning - eval_log_loss = 0.65284
Par-Robust [train budget=0] learning - eval_binary_err_rate = 0.43385
Par-Robust [train budget=0] learning - eval_specificity = 0.11321
Par-Robust [train

Unnamed: 0,Model,Budget,Log Loss,Binary Err Rate,Specificity,Precision,Recall,F1,Roc Auc
0,Par-Robust [train budget=0],0,0.600058,0.241538,0.620545,0.754687,0.758462,0.755485,0.825252
1,Par-Robust [train budget=60],0,0.598853,0.236923,0.647799,0.760613,0.763077,0.761476,0.830038
2,Par-Robust [train budget=30],0,0.599261,0.229231,0.641509,0.767518,0.770769,0.768211,0.830762
3,Par-Robust [train budget=0],30,0.652836,0.433846,0.113208,0.492341,0.566154,0.506852,0.626962
4,Par-Robust [train budget=60],30,0.598887,0.236923,0.647799,0.760613,0.763077,0.761476,0.829827
5,Par-Robust [train budget=30],30,0.599284,0.229231,0.641509,0.767518,0.770769,0.768211,0.830642
6,Par-Robust [train budget=0],60,0.654436,0.438462,0.102725,0.482884,0.561538,0.50013,0.61959
7,Par-Robust [train budget=60],60,0.598887,0.236923,0.647799,0.760613,0.763077,0.761476,0.829827
8,Par-Robust [train budget=30],60,0.599297,0.229231,0.641509,0.767518,0.770769,0.768211,0.830591


In [None]:
# Display the combined wine results table again.
overall_df

## COMPETITORS

Model	Budget	Log Loss	Binary Err Rate	Specificity	Precision	Recall	F1	Roc Auc
0	Adv-Boosting [train budget=30]	0	0.552537	0.218462	0.675052	0.779469	0.781538	0.780177	0.837762
1	Adv-Boosting [train budget=30]	0	0.536333	0.213846	0.675052	0.783812	0.786154	0.784482	0.857376
2	Adv-Boosting [train budget=60]	0	0.557392	0.229231	0.654088	0.768163	0.770769	0.768977	0.834622
3	Adv-Boosting [train budget=60]	0	0.540496	0.218462	0.683438	0.780012	0.781538	0.780619	0.851586
4	Std-Gbdt	0	0.551204	0.232308	0.631027	0.764097	0.767692	0.764692	0.839458
5	Std-Gbdt	0	0.540879	0.222308	0.658281	0.774936	0.777692	0.775651	0.851046
6	Red-Gbdt	0	0.720991	0.624615	0.849057	0.468787	0.375385	0.290690	0.398885
7	Red-Gbdt	0	0.701899	0.450000	0.419287	0.556118	0.550000	0.552762	0.481782
8	Rf-Gbdt	0	0.576459	0.239231	0.597484	0.756262	0.760769	0.755938	0.817466
9	Rf-Gbdt	0	0.565125	0.236923	0.607966	0.758810	0.763077	0.758839	0.834560
10	Adv-Boosting [train budget=30]	30	0.569712	0.241538	0.628931	0.755126	0.758462	0.756041	0.810073
11	Adv-Boosting [train budget=30]	30	0.558130	0.235385	0.624738	0.760845	0.764615	0.761435	0.825665
12	Adv-Boosting [train budget=60]	30	0.576883	0.256923	0.601677	0.738991	0.743077	0.740061	0.801165
13	Adv-Boosting [train budget=60]	30	0.565061	0.250769	0.622642	0.746104	0.749231	0.747135	0.814220
14	Std-Gbdt	30	0.626599	0.387692	0.224319	0.574796	0.612308	0.572900	0.695082
15	Std-Gbdt	30	0.632325	0.397692	0.224319	0.563710	0.602308	0.565373	0.674240
16	Red-Gbdt	30	0.727191	0.633077	0.849057	0.445089	0.366923	0.276275	0.349323
17	Red-Gbdt	30	0.713142	0.498462	0.299790	0.497063	0.501538	0.499194	0.407729
18	Rf-Gbdt	30	0.633231	0.404615	0.148847	0.537706	0.595385	0.538716	0.684260
19	Rf-Gbdt	30	0.626627	0.395385	0.186583	0.558575	0.604615	0.556913	0.696457
20	Adv-Boosting [train budget=30]	60	0.571455	0.242308	0.626834	0.754280	0.757692	0.755196	0.807250
21	Adv-Boosting [train budget=30]	60	0.560589	0.238462	0.620545	0.757672	0.761538	0.758316	0.821925
22	Adv-Boosting [train budget=60]	60	0.579503	0.259231	0.595388	0.736430	0.740769	0.737498	0.796106
23	Adv-Boosting [train budget=60]	60	0.567124	0.251538	0.620545	0.745248	0.748462	0.746290	0.810939
24	Std-Gbdt	60	0.628858	0.387692	0.224319	0.574796	0.612308	0.572900	0.688013
25	Std-Gbdt	60	0.634201	0.399231	0.222222	0.561660	0.600769	0.563691	0.668282
26	Red-Gbdt	60	0.728780	0.633077	0.849057	0.445089	0.366923	0.276275	0.335776
27	Red-Gbdt	60	0.716420	0.525385	0.241090	0.464610	0.474615	0.469203	0.378909
28	Rf-Gbdt	60	0.635470	0.412308	0.134172	0.523740	0.587692	0.528536	0.677227
29	Rf-Gbdt	60	0.629036	0.405385	0.167715	0.542278	0.594615	0.544130	0.688354

# Spam

In [None]:
# Dataset evaluated by the cells of this section and the attack budgets used.
DATASET_NAME = "spam"
TRAINING_BUDGETS = [30, 60]

# Locations derived from the dataset name.
DATASET_DIR = "../data/{}".format(DATASET_NAME)
ATK_DIR = DATASET_DIR + "/attacks"
MODELS_DIR = "../out/models/{}".format(DATASET_NAME)
OUTPUT_FILENAME = "../out/results/{}".format(DATASET_NAME)

# Attack-free train/validation/test splits.
TRAINING_FILENAME = "{}/train.csv.bz2".format(DATASET_DIR)
VALIDATION_FILENAME = "{}/valid.csv.bz2".format(DATASET_DIR)
TEST_FILENAME = "{}/test.csv.bz2".format(DATASET_DIR)

# Attacked splits: the "{}" left in each template is filled with the budget.
TRAINING_FILENAME_ATT = ATK_DIR + "/train_B{}.atks.bz2"
VALIDATION_FILENAME_ATT = ATK_DIR + "/valid_B{}.atks.bz2"
TEST_FILENAME_ATT = ATK_DIR + "/test_B{}.atks.bz2"

In [None]:
# Final Models
# NOTE(review): these adversarial-boosting paths point at the *wine* models
# although this section evaluates the spam dataset. They are not part of
# test_models below -- confirm whether spam-trained versions exist.
adv_models = ["../out/models/wine/adv-boosting_wine_B30_T100_S0050_L24_R100.model",
              "../out/models/wine/adv-boosting_wine_B30_T100_S0050_L256_R100.model",
              "../out/models/wine/adv-boosting_wine_B60_T100_S0050_L24_R82.model",
              "../out/models/wine/adv-boosting_wine_B60_T100_S0050_L256_R85.model"
             ]

# Standard GBDT models.
gdbt_models = ["../out/models/spam/std-gbdt_spam_T100_S0050_L24_R100.model",
               "../out/models/spam/std-gbdt_spam_T100_S0050_L256_R100.model"]

# "red-gbdt" models.
red_models = ["../out/models/spam/red-gbdt_spam_T100_S0050_L24_R100.model",
             "../out/models/spam/red-gbdt_spam_T100_S0050_L256_R100.model"]

# "rf-gbdt" models.
rf_models = ["../out/models/spam/rf-gbdt_spam_T100_S0050_L24_R98.model",
             "../out/models/spam/rf-gbdt_spam_T100_S0050_L256_R98.model"]

# NOTE(review): same as adv_models above -- wine paths, not used below.
robust_models = ["../out/models/wine/par-robust_wine_B0_T100_D8_I20.model",
                "../out/models/wine/par-robust_wine_B60_T100_D8_I20.model",
                "../out/models/wine/par-robust_wine_B30_T100_D8_I20.model"]

# Only the spam-trained model families are evaluated.
test_models = gdbt_models + red_models + rf_models



In [None]:
# Without attacks
# Load the attack-free splits and score every model in test_models on TEST.
TRAIN, VALID, TEST = load_atk_train_valid_test(TRAINING_FILENAME, VALIDATION_FILENAME, TEST_FILENAME)

eval_std_df = eval_all_models(EVAL_METRICS, MODELS_DIR, TEST, test_models)
eval_std_df

In [None]:
# With attacks
# Load the attacked splits of every budget and score all models under attack.
att_datasets = load_attacked_datasets(TRAINING_BUDGETS)
eval_att_df = eval_all_models_under_attack(
    EVAL_METRICS, MODELS_DIR, att_datasets, TRAINING_BUDGETS, test_models)

# Stack attack-free and under-attack results, then save them as CSV.
# eval_std_df comes from the "Without attacks" cell above.
overall_df = pd.concat([eval_std_df, eval_att_df], axis=0, sort=False).reset_index(drop=True)
overall_df.to_csv(OUTPUT_FILENAME + ".csv", sep=",", index=False)

overall_df

# Prune Robust models

In [None]:
# Model files to prune; each one is passed to prune_trained_model with the value 20.
# NOTE(review): prune_trained_model is not defined in this notebook -- presumably
# it comes from one of the star imports (nilib / robust_forest); confirm what 20 means.
to_be_pruned_models = ["../out/models/census/robust_census_B0_T100_D8_I20_20.tmp"]

for m in to_be_pruned_models:
    prune_trained_model(m, 20)


# Prune LGBM models

In [None]:
def prune_lgbm(in_file, out_file, n):
    """Re-save the LightGBM model stored in ``in_file`` to ``out_file``.

    ``n`` is forwarded to ``Booster.save_model`` as ``num_iteration``.
    Prints "saved." once the file has been written.
    """
    booster = lightgbm.Booster(model_file=in_file)
    booster.save_model(out_file, num_iteration=n)
    print ("saved.")
    
# Save a copy of the wine standard-GBDT model with num_iteration=10 under a ".T10" name.
prune_lgbm("../out/models/wine/std-gbdt_wine_T200_S0050_L24_R199.model",
           "../out/models/wine/std-gbdt_wine_T200_S0050_L24_R199.T10.model",
           10)
# prune_lgbm("../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.model",
#            "../out/models/census/adv-boosting_census_B60_T200_S0050_L24_R200.T20.model",
#            20)

## Debug

In [None]:
# Unpickle the wine par-robust model for inspection in the cells below.
# SECURITY: dill.load can execute arbitrary code; only open trusted model files.
with open("../out/models/wine/par-robust_wine_B0_T100_D8_I20.model", 'rb') as f:
    model = dill.load(f)

In [None]:
def pretty_print(node, out=None, tabs=''):
    """Write a textual description of ``node`` and its subtree to ``out``.

    A leaf is printed as its prediction, score, ``node.values`` and loss; an
    internal node as its split feature id, threshold and ``node.values``,
    followed by its left and right subtrees indented by one more tab.

    Args:
        node: tree node exposing ``is_leaf()``, ``get_node_prediction()``,
            ``values``, ``loss_value``, ``best_split_feature_id``,
            ``best_split_feature_value``, ``left`` and ``right``.
        out: writable text stream; defaults to the current ``sys.stdout``.
        tabs: indentation prefix for this node's line.
    """
    if out is None:
        # Resolved at call time so the function can be defined without
        # ``sys`` being in scope and honours a redirected stdout.
        import sys
        out = sys.stdout

    if node.is_leaf():  # base case
        prediction = node.get_node_prediction()
        leaf_txt = "{}Prediction: {}; Score: {:.5f}; N. instances: {}; Loss: {:.5f}".format(
            tabs, prediction[0], prediction[1], node.values, node.loss_value)
        out.write(leaf_txt + "\n")
        return

    # recursive case
    internal_node_txt = "{}Feature ID: {}; Threshold: {}; N. instances: {}".format(
        tabs, node.best_split_feature_id, node.best_split_feature_value, node.values)
    out.write(internal_node_txt + "\n")

    # NOTE(review): children are printed by their own ``pretty_print`` method,
    # not by this function -- confirm that this is intended.
    node.left.pretty_print(out, tabs + "\t")
    node.right.pretty_print(out, tabs + "\t")

# Dump the first tree of the ensemble loaded above.
pretty_print(model.estimators_[0].root )


In [None]:
# Dump the second tree of the ensemble loaded above.
pretty_print(model.estimators_[1].root )

# Feature importance check

In [None]:
def print_fx_imp(model, colnames):
    """Print the features of ``model`` ranked by decreasing gain importance.

    Each line shows the rank, the feature name looked up in ``colnames``, the
    'gain' importance and the 'split' importance of the feature.
    """
    split_counts = model.feature_importance(importance_type='split')
    gains = model.feature_importance(importance_type='gain')

    ranking = np.argsort(gains)[::-1]  # feature indices, highest gain first
    for rank, feat in enumerate(ranking):
        row = "{:2d} {:20s} {:.3f} {:4d}".format(rank, colnames[feat], gains[feat], split_counts[feat])
        print (row)

# Compare feature importances of three census models.
# NOTE(review): TRAIN is whichever frame an earlier cell loaded last; on a
# top-to-bottom run that is the spam split, not census -- reload the census
# TRAIN before running this cell.
print(" -- GDBT --")    
gbdt = lightgbm.Booster(model_file="../out/models/census/std-gbdt_census_T100_S0050_L24_R100.model")
print(gbdt.num_trees())
print_fx_imp(gbdt, TRAIN.columns)

# NOTE(review): this file name ends in R98 while red_models in the Census
# section lists R95 -- confirm which file exists.
print(" -- Reduced GDBT --")    
redf = lightgbm.Booster(model_file="../out/models/census/red-gbdt_census_T100_S0050_L24_R98.model")
print(redf.num_trees())
# The listed columns are dropped from the name list before the lookup.
print_fx_imp(redf, TRAIN.drop(columns=["workclass", 
                                       "marital_status", 
                                       "occupation", 
                                       "education_num", 
                                       "hours_per_week", 
                                       "capital_gain"
                                      ]).columns)


print(" -- Adv. Boosting --")    
advb = lightgbm.Booster(model_file="../out/models/census/adv-boosting_census_B30_T100_S0050_L24_R100.model")
print(advb.num_trees())
print_fx_imp(advb, TRAIN.columns)


In [None]:
# Evaluate one reduced-GBDT model on the attacked test set of budget ``bb``,
# with the three listed columns dropped from the test frame.
# NOTE(review): att_datasets is built from TRAINING_BUDGETS ([30, 60] in the
# cells above), so bb = 40 raises KeyError unless that budget is loaded first.
bb = 40
eval_learned_models(EVAL_METRICS,  # first positional argument, previously missing
                    lightgbm.Booster(model_file="../out/models/wine2/red-gbdt_wine2_T500_S0050_L24_R281.model"),
                    extract_model_name("../out/models/wine2/red-gbdt_wine2_T500_S0050_L24_R281.model"),
                    att_datasets[bb][4].drop(columns=["alcohol", "residual_sugar", "volatile_acidity"]),
                    att_datasets[bb][5],
                    budget=bb
                   )

In [None]:
!cat ../out/models/census/par-robust_census_B0_T100_D8_I20.model | tail

In [None]:
!git pull

In [None]:
!git commit -am "calza"

In [None]:
!git pull

In [None]:
!git push