In [69]:
import pandas as pd
import numpy as np
import spacy
import art.config
import os
art.config.ART_NUMPY_DTYPE = 'str' # override dtype to str instead of float

from art.estimators.classification import BlackBoxClassifier
from art.attacks.extraction import KnockoffNets
from spacy_clf import SpacyClassifier
from sklearn.metrics import precision_recall_fscore_support

In [90]:
target_class_dict = {
    'fake-news': ['False', 'True'],
    'spam': ['spam', 'ham'],
    'hate-speech': ['Offensive_Language', 'Hate_Speech', 'Neither']    
}

def eval_clf(art_clf, texts, labels, use_case):
    preds = art_clf.predict(texts)
    if use_case == 'fake_news':
        # boolean indicators
        return precision_recall_fscore_support(labels.astype(int), np.argmax(preds, axis=1), average='macro')
    
    # string indicators
    target_labels = target_class_dict.get(use_case)
    pred_labels = [target_labels[x] for x in np.argmax(preds,axis=1)]
    return precision_recall_fscore_support(labels, pred_labels, average='macro')

for use_case in ['fake-news', 'spam', 'hate-speech']:
    print("Evaluating use case", use_case)
    
    print("Loading data..")
    df = pd.read_csv(f'res/{use_case}/train.csv')
    texts = df['text'].to_numpy()
    labels = df['target'].to_numpy()
    
    df_train_eval = df.sample(n=2000, random_state=212132)
    train_eval_texts = df_train_eval['text'].to_numpy()
    train_labels = df_train_eval['target'].to_numpy()
    
    df_test = pd.read_csv(f'res/{use_case}/test.csv')
    test_eval_texts = df['text'].to_numpy()
    test_eval_labels = df['target'].to_numpy()
    
    print("Loading Victim model..")
    blackbox_classifier = SpacyClassifier(model = spacy.load(f'blackbox-models/{use_case}/'))
    
    if os.path.exists(f'eval/{use_case}/spacy_random_res.csv'):
        df_stats = pd.read_csv(f'eval/{use_case}/spacy_random_res.csv', index_col=0)
        print("Skipping already performed baseline eval")
    else:
        print("Calculating performance baselines with blackbox..")
        p,r,f,_ = eval_clf(blackbox_classifier, train_eval_texts, train_labels, use_case)
        df_stats = pd.DataFrame(data=np.array([p,r,f,'train']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=['baseline_r'])
        print("Train", (p,r,f))
        p,r,f,_ = eval_clf(blackbox_classifier, test_eval_texts, test_eval_labels, use_case)
        df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'test']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=['baseline_e']))
        print("Test", (p,r,f))

    
    print("Evaluating random strategy..")
    for nb_stolen in [1,10,100,250,500,1000,2500,5000,10000,25000,50000]:
        if f'q_{nb_stolen}_e' in df_stats.index.tolist():
            print(f"Already evaluated {nb_stolen} queries - skipping")
            continue
            
        print(f"Training with {nb_stolen} queries to black box")
        knockoff = KnockoffNets(classifier = blackbox_classifier, batch_size_fit=32, batch_size_query=32, nb_stolen=nb_stolen, sampling_strategy='random')
        np.random.seed(23435432)
        thieved_classifier = SpacyClassifier(model = spacy.load("en_core_web_sm"), target_classes=target_class_dict.get(use_case))
        knockoff.extract(x = texts, thieved_classifier=thieved_classifier)
        
        print("Evaluating on train/test set")
        p,r,f,_ = eval_clf(thieved_classifier, train_eval_texts, train_labels, use_case)
        df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'train']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=[f'q_{nb_stolen}_r']))
        print("Train", (p,r,f))
        p,r,f,_ = eval_clf(thieved_classifier, test_eval_texts, test_eval_labels, use_case)
        df_stats = df_stats.append(pd.DataFrame(data=np.array([p,r,f,'test']).reshape((1,4)), columns=['precision', 'recall', 'fscore', 'set'], index=[f'q_{nb_stolen}_e']))
        print("Test", (p,r,f))
        
        print("Saving classifier")
        thieved_classifier.save(f'stolen-models-spacy/{use_case}/{nb_stolen}_queries/')
        df_stats.to_csv(f'eval/{use_case}/spacy_random_res.csv')
        
        if nb_stolen > texts.shape[0]:
            print("Evaluated max query size - Stopping")
            break
            

Evaluating use case fake-news
Loading data..
Loading Victim model..
Skipping already performed baseline eval
Evaluating random strategy..
Already evaluated 1 queries - skipping
Already evaluated 10 queries - skipping
Already evaluated 100 queries - skipping
Already evaluated 250 queries - skipping
Already evaluated 500 queries - skipping
Already evaluated 1000 queries - skipping
Already evaluated 2500 queries - skipping
Already evaluated 5000 queries - skipping
Already evaluated 10000 queries - skipping
Already evaluated 25000 queries - skipping
Already evaluated 50000 queries - skipping
Evaluating use case spam
Loading data..
Loading Victim model..
Calculating performance baselines with blackbox..
Train (1.0, 1.0, 1.0)
Test (1.0, 1.0, 1.0)
Evaluating random strategy..
Training with 1 queries to black box
Evaluating on train/test set


  _warn_prf(average, modifier, msg_start, len(result))


Train (0.43475, 0.5, 0.46509761968440766)
Test (0.43262730959891843, 0.5, 0.4638801642908916)
Saving classifier
Training with 10 queries to black box
Evaluating on train/test set
Train (0.7705829326923077, 0.7222090028399639, 0.7427596813843153)
Test (0.7550771734355657, 0.7225260416666667, 0.737028613175594)
Saving classifier
Training with 100 queries to black box
Evaluating on train/test set
Train (0.824468401120996, 0.9211485880598134, 0.8625691925940064)
Test (0.8203386351104891, 0.9111264980490523, 0.8563237940533261)
Saving classifier
Training with 250 queries to black box
Evaluating on train/test set
Train (0.9334805322066468, 0.9502819033266575, 0.9416482426395741)
Test (0.9248309774164702, 0.9325969377090301, 0.9286618566481574)
Saving classifier
Training with 500 queries to black box
Evaluating on train/test set
Train (0.9713439709724105, 0.9527759601127173, 0.9617956064947468)
Test (0.9577637660588856, 0.9381958960423634, 0.9476712651809929)
Saving classifier
Training with 1