In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import seaborn as sns
import os
from attack_functions import *
from trojai_utils import *
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
%matplotlib inline

## Build Dataset

In [None]:
# Per-split metadata tables. make_data() filters these on the `embedding`,
# `model_architecture` and `poisoned` columns and uses the DataFrame index as the model id.
# NOTE: every quoted string in this cell is a placeholder -- replace it with a real path before running.
METADATA_TRAIN = pd.read_csv("place where training set's METADATA.csv is")
METADATA_TEST = pd.read_csv("place where test set's METADATA.csv is")
METADATA_HOLDOUT = pd.read_csv("place where holdout set's METADATA.csv is")
# Root directories of the saved results; load_all() looks for the .pt files
# under <root>/<embedding>/<architecture>/.
TRAIN_RESULTS_PATH = "place where your train results from trojai_runner.py were saved to"
TEST_RESULTS_PATH = "place where your test results from trojai_runner.py were saved to"
HOLDOUT_RESULTS_PATH = "place where your holdout results from trojai_runner.py were saved to"

# File-name suffixes of the saved thickness results (see load_all) and the
# matching name fragments used when building feature names.
THICK_NAMES = [
    "clean",
    "adv+to-",
    "adv-to+",
    "uap+to-",
    "uap-to+",
]
# File-name suffixes of the saved tilting results.
TILT_NAMES = [
    "adv_adv+to-",
    "adv_adv-to+",
    "uap_uap+to-",
    "uap_uap-to+",
]
# Quantile levels handed to quantile_features() for every distribution.
FEATURE_QUANTILES = [0, 1]

# Integer encodings of the categorical model properties, plus their inverses.
embedding_codes = {name: code for code, name in enumerate(("BERT", "DistilBERT", "GPT-2"))}
embedding_lookups = {code: name for name, code in embedding_codes.items()}
architecture_codes = {name: code for code, name in enumerate(("LstmLinear", "GruLinear"))}
architecture_lookups = {code: name for name, code in architecture_codes.items()}

In [None]:
def load_all(results_path, embed, arch, model_id, which):
    """Load every saved result object for one model.

    Files are read from ``results_path/embed/arch``. For each suffix in
    THICK_NAMES the file ``<which><suffix>_thickness<model_id>.pt`` is loaded,
    for each suffix in TILT_NAMES the file ``<which><suffix>_tilting<model_id>.pt``,
    and for each name in LOSS_NAMES the file ``<which>_<name><model_id>.pt``.

    NOTE(review): LOSS_NAMES is not defined in this notebook; presumably it
    comes from one of the star imports -- confirm.

    Args:
        results_path: root directory of the saved results.
        embed: embedding name, used as the first sub-directory.
        arch: architecture name, used as the second sub-directory.
        model_id: identifier formatted into every file name.
        which: file-name prefix (callers pass "clean" or "poisoned").

    Returns:
        Tuple ``(thicks, tilts, losses)`` of lists holding the loaded objects,
        in the order of the corresponding name lists.

    Raises:
        FileNotFoundError: propagated from ``torch.load`` when a file is missing.
    """
    def _load(filename):
        return torch.load(os.path.join(results_path, embed, arch, filename))

    with torch.no_grad():
        thicks = [_load(which + suffix + "_thickness{}.pt".format(model_id))
                  for suffix in THICK_NAMES]
        tilts = [_load(which + suffix + "_tilting{}.pt".format(model_id))
                 for suffix in TILT_NAMES]
        losses = [_load(which + "_{0}{1}.pt".format(suffix, model_id))
                  for suffix in LOSS_NAMES]
    return thicks, tilts, losses

In [None]:
def make_thick_features(thicks):
    """Turn the loaded thickness results into one flat feature vector.

    For every entry of ``thicks``, the elements at index 1 and 2 are each
    reduced to quantile features (at FEATURE_QUANTILES) followed by moment
    features, after dropping all non-positive values.

    Args:
        thicks: sequence of indexable results, one per thickness direction;
            elements 1 and 2 are expected to be tensors.

    Returns:
        1-D numpy array with all per-distribution features concatenated in
        iteration order.
    """
    feature_chunks = []
    for per_direction in thicks:
        for idx in (1, 2):
            # Keep only strictly positive thickness values, on the CPU.
            positive = per_direction[idx]
            positive = positive[positive > 0].detach().clone().cpu()
            feature_chunks.extend((
                quantile_features(positive, FEATURE_QUANTILES).numpy(),
                moment_features(positive).numpy(),
            ))
    return np.concatenate(feature_chunks)

In [None]:
def make_tilt_features(tilts):
    """Turn the loaded tilting results into one flat feature vector.

    Each tensor in ``tilts`` is reduced to quantile features (at
    FEATURE_QUANTILES) followed by moment features.

    Args:
        tilts: sequence of tensors, one per tilting distribution.

    Returns:
        1-D numpy array with all per-distribution features concatenated in
        iteration order.
    """
    feature_chunks = []
    for raw_dist in tilts:
        on_cpu = raw_dist.detach().clone().cpu()
        feature_chunks.extend((
            quantile_features(on_cpu, FEATURE_QUANTILES).numpy(),
            moment_features(on_cpu).numpy(),
        ))
    return np.concatenate(feature_chunks)

In [None]:
def _featurize_models(results_path, embed, arch, model_ids, which):
    """Load and featurize each model in ``model_ids``, skipping models whose result files are missing."""
    rows = []
    for model_id in model_ids:
        try:
            thicks, tilts, losses = load_all(results_path, embed, arch, model_id, which)
        except FileNotFoundError:
            # Best-effort loading: report the id of the model without results and move on.
            print(model_id)
            continue
        thick_feats, tilt_feats = make_thick_features(thicks), make_tilt_features(tilts)
        rows.append(np.concatenate((thick_feats, tilt_feats, losses)))
    return rows


def make_data(results_path, embed, arch, add_embed_feat, add_arch_feat, METADATA):
    """Build the feature matrix and labels for one (embedding, architecture) group.

    Model ids are the index values of the ``METADATA`` rows whose ``embedding``
    and ``model_architecture`` match; the ``poisoned`` column splits them into
    clean (label 0) and poisoned (label 1) models. Models whose result files
    cannot be found are printed and skipped.

    Args:
        results_path: root directory of the saved results (see load_all).
        embed: embedding name; must be a key of ``embedding_codes`` when
            ``add_embed_feat`` is true.
        arch: architecture name; must be a key of ``architecture_codes`` when
            ``add_arch_feat`` is true.
        add_embed_feat: append a constant column holding the embedding code.
        add_arch_feat: append a constant column holding the architecture code.
        METADATA: DataFrame with ``embedding``, ``model_architecture`` and
            ``poisoned`` columns.

    Returns:
        Tuple ``(X, y)``: ``X`` has one row per successfully loaded model
        (clean rows first, then poisoned), ``y`` holds the 0/1 labels. If no
        model at all could be loaded, ``X`` has zero rows and only the
        requested categorical columns, because the feature width is unknown.
    """
    in_group = (METADATA.embedding == embed) & (METADATA.model_architecture == arch)
    clean_model_ids = METADATA.index[in_group & (METADATA.poisoned == False)].tolist()
    poisoned_model_ids = METADATA.index[in_group & (METADATA.poisoned == True)].tolist()

    # Load data
    clean_features = _featurize_models(results_path, embed, arch, clean_model_ids, "clean")
    poisoned_features = _featurize_models(results_path, embed, arch, poisoned_model_ids, "poisoned")

    # Build data matrix. An empty class is left out of the stack: np.array([])
    # is 1-D and cannot be concatenated with the 2-D matrix of the other class.
    n_clean, n_poisoned = len(clean_features), len(poisoned_features)
    non_empty = [np.array(rows) for rows in (clean_features, poisoned_features) if rows]
    X = np.concatenate(non_empty, axis=0) if non_empty else np.empty((0, 0))
    y = np.concatenate((np.zeros(n_clean), np.ones(n_poisoned)))

    # Add categorical features
    if add_embed_feat:
        X = np.concatenate((X, embedding_codes[embed] * np.ones((X.shape[0], 1))), axis=1)
    if add_arch_feat:
        X = np.concatenate((X, architecture_codes[arch] * np.ones((X.shape[0], 1))), axis=1)
    return X, y

In [None]:
def make_full_X_y(results_path, metadata):
    """Assemble the full dataset over every embedding/architecture group.

    Calls make_data() for each of the six (embedding, architecture) pairs with
    both categorical columns enabled and stacks the results row-wise.

    Args:
        results_path: root directory of the saved results.
        metadata: metadata DataFrame forwarded to make_data().

    Returns:
        Tuple ``(X, y, feature_names)``: stacked feature matrix, stacked
        labels, and a numpy array of column names. The names are built from
        THICK_NAMES, TILT_NAMES, LOSS_NAMES and FEATURE_QUANTILES, assuming
        four moment features per distribution; they do not depend on the
        loaded data.
    """
    with torch.no_grad():
        X_parts, y_parts = [], []
        for embed in ["BERT", "DistilBERT", "GPT-2"]:
            for arch in ["LstmLinear", "GruLinear"]:
                group_X, group_y = make_data(results_path, embed, arch, True, True, metadata)
                X_parts.append(group_X)
                y_parts.append(group_y)

        # Column names, in the same order in which make_data lays out the features.
        names = []
        for thick_name in THICK_NAMES:
            for ab_str in ["0_0.75", "0_1"]:
                prefix = "thick_" + thick_name + ab_str
                names.extend(prefix + "_q" + str(q) for q in FEATURE_QUANTILES)
                names.extend(prefix + "_m" + str(m) for m in range(1, 5))
        for tilt_name in TILT_NAMES:
            prefix = "tilt_" + tilt_name
            names.extend(prefix + "_q" + str(q) for q in FEATURE_QUANTILES)
            names.extend(prefix + "_m" + str(m) for m in range(1, 5))
        names.extend("loss_" + loss_name for loss_name in LOSS_NAMES)
        names.extend(["embedding", "architecture"])
        feature_names = np.array(names)

        X = np.concatenate(X_parts, axis=0)
        y = np.concatenate(y_parts, axis=0)
    return X, y, feature_names

In [None]:
# Build the feature matrix and labels for the training split.
X, y, feature_names = make_full_X_y(TRAIN_RESULTS_PATH, METADATA_TRAIN)
print(X.shape, y.shape)

# Same for the test split. feature_names is reassigned by each call; make_full_X_y
# builds it from the module-level name lists only, not from the loaded data.
X_test, y_test, feature_names = make_full_X_y(TEST_RESULTS_PATH, METADATA_TEST)
print(X_test.shape, y_test.shape)

# Same for the holdout split.
X_holdout, y_holdout, feature_names = make_full_X_y(HOLDOUT_RESULTS_PATH, METADATA_HOLDOUT)
print(X_holdout.shape, y_holdout.shape)
print("Number of features:", len(feature_names))

## Evaluate

In [None]:
# Hyper-parameter grid searched for the gradient-boosting classifier.
forest_param_grid = dict(n_estimators=[64, 128], max_depth=[4, 6, 8])

# Cross-validated grid search on the training split.
cv_gbf = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=forest_param_grid)
cv_gbf.fit(X, y)

In [None]:
# Calibrate the predicted probabilities of the model selected by the grid search.
# GridSearchCV exposes the selected model as `best_estimator_`; it has no
# `best_classifier_` attribute, so the previous spelling raised AttributeError.
gbf_final = CalibratedClassifierCV(cv_gbf.best_estimator_, cv=10)
gbf_final.fit(X, y)

In [None]:
def print_results(clf, X_train, y_train, X_test, y_test, X_holdout, y_holdout):
    """Print accuracy, AUC and cross-entropy of a fitted classifier.

    Accuracy comes from ``clf.score``; AUC is computed from column 1 of
    ``clf.predict_proba`` and cross-entropy (CE) from the full probability
    matrix. Train accuracy is reported alone, the other metrics side by side
    for the test and holdout splits.

    Args:
        clf: fitted classifier providing ``predict_proba`` and ``score``.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        X_holdout, y_holdout: holdout features and labels.

    Returns:
        None; the results are printed.
    """
    # The notebook never imports these two metrics explicitly, so import them
    # here rather than rely on a star import happening to re-export them.
    from sklearn.metrics import log_loss, roc_auc_score

    y_test_probs = clf.predict_proba(X_test)
    y_holdout_probs = clf.predict_proba(X_holdout)
    print("Train Accuracy: {:.3f}".format(clf.score(X_train, y_train)))

    print("Accuracy:  {:.3f} (Test)\t{:.3f} (Holdout)".format(clf.score(X_test, y_test),
                                                              clf.score(X_holdout, y_holdout)))

    print("AUC:       {:.3f} (Test)\t{:.3f} (Holdout)".format(roc_auc_score(y_test, y_test_probs[:, 1]),
                                                              roc_auc_score(y_holdout, y_holdout_probs[:, 1])))

    print("CE:        {:.3f} (Test)\t{:.3f} (Holdout)\n".format(log_loss(y_test, y_test_probs),
                                                                log_loss(y_holdout, y_holdout_probs)))

In [None]:
# Report the calibrated model's metrics on the train, test and holdout splits.
print_results(
    clf=gbf_final,
    X_train=X,
    y_train=y,
    X_test=X_test,
    y_test=y_test,
    X_holdout=X_holdout,
    y_holdout=y_holdout,
)