In [1]:
import pandas as pd
import numpy as np
import spacy
import art.config
import os
art.config.ART_NUMPY_DTYPE = 'str' # override dtype to str instead of float

from art.estimators.classification import BlackBoxClassifier
from art.attacks.extraction import KnockoffNets
from spacy_clf import SpacyClassifier
from sklearn.metrics import precision_recall_fscore_support

In [None]:
target_class_dict = {
    'fake-news': ['False', 'True'],
    'spam': ['spam', 'ham'],
    'hate-speech': ['Offensive_Language', 'Hate_Speech', 'Neither']    
}

def eval_clf(art_clf, texts, labels, use_case):
    preds = art_clf.predict(texts)
    if use_case == 'fake-news':
        # boolean indicators
        return precision_recall_fscore_support(labels.astype(int), np.argmax(preds, axis=1), average='macro')
    
    # string indicators
    target_labels = target_class_dict.get(use_case)
    pred_labels = [target_labels[x] for x in np.argmax(preds,axis=1)]
    return precision_recall_fscore_support(labels, pred_labels, average='macro')

for setting in ['random', 'adaptive']:
    print("Evaluating setting", setting)
    for use_case in ['fake-news', 'spam', 'hate-speech']:
        print("Evaluating use case", use_case)

        print("Loading data..")
        df = pd.read_csv(f'res/{use_case}/train.csv')
        texts = df['text'].to_numpy()
        labels = df['target'].to_numpy()
        action_ids = np.array([target_class_dict.get(use_case).index(str(x)) for x in labels]) # required for knockoff action sampling

        df_train_eval = df.sample(n=2000, random_state=212132)
        train_eval_texts = df_train_eval['text'].to_numpy()
        train_labels = df_train_eval['target'].to_numpy()

        df_test = pd.read_csv(f'res/{use_case}/test.csv')
        test_eval_texts = df['text'].to_numpy()
        test_eval_labels = df['target'].to_numpy()

        print("Loading Victim model..")
        blackbox_classifier = SpacyClassifier(model = spacy.load(f'blackbox-models/{use_case}/'))

        if os.path.exists(f'eval/{use_case}/spacy_{setting}_res.csv'):
            df_stats = pd.read_csv(f'eval/{use_case}/spacy_{setting}_res.csv', index_col=0)
            print("Skipping already performed baseline eval")
        else:
            print("Calculating performance baselines with blackbox..")
            p,r,f,_ = eval_clf(blackbox_classifier, train_eval_texts, train_labels, use_case)
            df_stats = pd.DataFrame(data=np.array([p,r,f,'train']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=['baseline_r'])
            print("Train", (p,r,f))
            p,r,f,_ = eval_clf(blackbox_classifier, test_eval_texts, test_eval_labels, use_case)
            df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'test']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=['baseline_e']))
            print("Test", (p,r,f))


        print("Evaluating random strategy..")
        for nb_stolen in [1,10,100,250,500,1000,2500,5000,10000,25000,50000]:
            if f'q_{nb_stolen}_e' in df_stats.index.tolist():
                print(f"Already evaluated {nb_stolen} queries - skipping")
                if nb_stolen > texts.shape[0]:
                    # fully evaluated
                    break
                continue

            print(f"Training with {nb_stolen} queries to black box")
            knockoff = KnockoffNets(classifier = blackbox_classifier, batch_size_fit=32, batch_size_query=32, nb_stolen=nb_stolen, sampling_strategy=setting)
            np.random.seed(23435432)
            thieved_classifier = SpacyClassifier(model = spacy.load("en_core_web_sm"), target_classes=target_class_dict.get(use_case))
            knockoff.extract(x = texts, y=action_ids, thieved_classifier=thieved_classifier)

            print("Evaluating on train/test set")
            p,r,f,_ = eval_clf(thieved_classifier, train_eval_texts, train_labels, use_case)
            df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'train']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=[f'q_{nb_stolen}_r']))
            print("Train", (p,r,f))
            p,r,f,_ = eval_clf(thieved_classifier, test_eval_texts, test_eval_labels, use_case)
            df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'test']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=[f'q_{nb_stolen}_e']))
            print("Test", (p,r,f))

            print("Saving classifier")
            thieved_classifier.save(f'stolen-models-spacy/{use_case}/{setting}_{nb_stolen}_queries/')
            df_stats.to_csv(f'eval/{use_case}/spacy_{setting}_res.csv')

            if nb_stolen > texts.shape[0]:
                print("Evaluated max query size - Stopping")
                break


Evaluating setting random
Evaluating use case fake-news
Loading data..
Loading Victim model..
Skipping already performed baseline eval
Evaluating random strategy..
Already evaluated 1 queries - skipping
Already evaluated 10 queries - skipping
Already evaluated 100 queries - skipping
Already evaluated 250 queries - skipping
Already evaluated 500 queries - skipping
Already evaluated 1000 queries - skipping
Already evaluated 2500 queries - skipping
Already evaluated 5000 queries - skipping
Already evaluated 10000 queries - skipping
Already evaluated 25000 queries - skipping
Already evaluated 50000 queries - skipping
Evaluating use case spam
Loading data..
Loading Victim model..
Skipping already performed baseline eval
Evaluating random strategy..
Already evaluated 1 queries - skipping
Already evaluated 10 queries - skipping
Already evaluated 100 queries - skipping
Already evaluated 250 queries - skipping
Already evaluated 500 queries - skipping
Already evaluated 1000 queries - skipping
Al

Knock-off nets: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.16it/s]


Evaluating on train/test set


  _warn_prf(average, modifier, msg_start, len(result))


Train (0.43475, 0.5, 0.46509761968440766)
Test (0.43262730959891843, 0.5, 0.4638801642908916)
Saving classifier
Training with 10 queries to black box


Knock-off nets: 100%|██████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.51it/s]


Evaluating on train/test set
Train (0.6603571428571429, 0.7967751316980958, 0.6780971769122415)
Test (0.6657253686301179, 0.7988368345875139, 0.6844131937487479)
Saving classifier
Training with 100 queries to black box


Knock-off nets: 100%|████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.63it/s]


Evaluating on train/test set
Train (0.7951034765130649, 0.9192262695564236, 0.8385769521254409)
Test (0.7966642759176072, 0.9106884057971014, 0.8376019883193484)
Saving classifier
Training with 250 queries to black box


Knock-off nets: 100%|████████████████████████████████████████████████████████████████| 250/250 [00:19<00:00, 12.52it/s]


Evaluating on train/test set
Train (0.8098406267009151, 0.9377146772597984, 0.8551876315756128)
Test (0.7982212128521733, 0.9280209204292085, 0.8425549853648093)
Saving classifier
Training with 500 queries to black box


Knock-off nets: 100%|████████████████████████████████████████████████████████████████| 500/500 [00:40<00:00, 12.26it/s]


Evaluating on train/test set
Train (0.9046568983026257, 0.9609102866623043, 0.9300283385228981)
Test (0.891805863833502, 0.9593053232998885, 0.9213893774995324)
Saving classifier
Training with 1000 queries to black box


Knock-off nets: 100%|██████████████████████████████████████████████████████████████| 1000/1000 [01:19<00:00, 12.64it/s]


Evaluating on train/test set
Train (0.8666833250623817, 0.9623412847917616, 0.9058047118036893)
Test (0.8723321792155444, 0.9612314485785953, 0.9092079359115754)
Saving classifier
Training with 2500 queries to black box


Knock-off nets: 100%|██████████████████████████████████████████████████████████████| 2500/2500 [03:30<00:00, 11.86it/s]


Evaluating on train/test set
Train (0.9379194630872483, 0.9893617021276595, 0.961528843749399)
Test (0.9332188732631894, 0.9832645275919732, 0.9561808846761454)
Saving classifier
Training with 5000 queries to black box


Knock-off nets:  72%|████████████████████████████████████████████▍                 | 3587/5000 [05:15<02:37,  8.96it/s]