<a href="https://colab.research.google.com/github/gilzeevi25/Black-box-Watermarking-tfidf/blob/main/Robust_Black_box_Watermarking.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Robust Black-box Watermarking for Deep Neural Network using Inverse Document Frequency
Based on Yadollahi et al. https://arxiv.org/pdf/2103.05590
## Implemented on PAN12 DataSet: 
https://pan.webis.de/clef12/pan12-web/sexual-predator-identification.html


In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
cd /content/gdrive/MyDrive/STTM

/content/gdrive/MyDrive/STTM


In [None]:
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import datetime
import random
import re
import time
import warnings
import csv
import sklearn
import string
import pickle
import random as python_random

from tqdm.notebook import tqdm


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
# from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
import tensorflow as tf
from keras import backend as K 

plt.rcParams["figure.figsize"] = (8,6)

### Loading the data

In [None]:
def get_labels_dict(data_path):
    labels_dict = {}
    with open(data_path + 'sci_labels.csv', 'r') as f:
        file = csv.reader(f)
        for row in file:
            labels_dict[row[0]] = row[1]
    return labels_dict


def get_features_labels(root, labels_dict):
    corpus = [] # each row is a string formed from all messages in a conversations
    labels = [] # each row is 0 or 1, corresponds to label for same row in corpus

    for conversation in root:
        string = " "
        for message in conversation:
            text = message.find('text').text
            if text is not None:
                #preprocess:
                # text = preprocess_text(text)
                string = string + "\r\n" + text 
        corpus.append(string)
        labels.append(int(labels_dict[conversation.get('id')]))
    return corpus, labels

In [None]:
train_data_path = "data/pan12-sexual-predator-identification-training-corpus-2012-05-01/"

training_xml = ET.parse(train_data_path + 'training_data.xml')
train_root = training_xml.getroot()

test_data_path = 'data/pan12-sexual-predator-identification-test-corpus-2012-05-21/'
test_xml = ET.parse(test_data_path + 'pan12-sexual-predator-identification-test-corpus-2012-05-17.xml')
test_root = test_xml.getroot()

train_corpus, train_labels = get_features_labels(train_root, get_labels_dict(train_data_path))
test_corpus, test_labels = get_features_labels(test_root, get_labels_dict(test_data_path))

# Train a DNN

In [None]:
# vectorizer = TfidfVectorizer()
vectorizer = TfidfVectorizer()

# PAN12 data
X_train = vectorizer.fit_transform(train_corpus).astype('float16')
X_test = vectorizer.transform(test_corpus).astype('float16')
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [None]:
# Model Training 
print ("Create model ... ")
def build_model(shape):
    # K.clear_session()
    np.random.seed(12)
    python_random.seed(12)
    tf.random.set_seed(12)
    model = Sequential()
    model.add(Dense(256, input_dim=shape, activation='relu'))
    # model.add(Dense(256, input_dim=121394, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(200, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(160, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(120, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(80, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

print("Compile model ...")
modeldnn = build_model(X_train.shape[1])

Create model ... 
Compile model ...
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               31077120  
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 200)               51400     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 160)               32160     
                                                                 
 dropout_2 (Dropout)         (None, 160)               0         
                                                                 
 dense_3 (Dense)    

In [None]:
# Train model
modeldnn.compile(tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy'])
modeldnn.fit(X_train, y_train, batch_size=64, shuffle="batch",
                epochs=3)

Epoch 1/3


  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe0dffc5090>

In [None]:
pred_y = modeldnn.predict(X_test)
pred_y =np.round(pred_y.flatten())
print('Accuracy on PAN12 test dataset: ',metrics.accuracy_score(y_test, pred_y)*100)
print('F1_Score on PAN12 test dataset: ',metrics.f1_score(y_test, pred_y)*100)

Accuracy on PAN12 test dataset:  98.04161724511371
F1_Score on PAN12 test dataset:  65.39863325740319


Now, After training a Network, we can apply our watermark

Randomly select *B* samples for each class from training set. To create a fair and balanced trigger set,
the number of samples selected from each class is equal.

We will choose **B = 50** conversations for each class

In [None]:
B = 50

In [None]:
np.random.seed(12)
labels_df = pd.Series(train_labels)
benign_samps_idx = labels_df[labels_df == 0].sample(B).index.tolist() #indices of 10 benign conversations
mal_samps_idx = labels_df[labels_df == 1].sample(B).index.tolist() #indices of 10 malicious content conversations

Calculate the TF-IDF score for each word in all documents. <br>
we uniform the words by changing them to lowercase, and
removing the punctuation and stop words

In [None]:
corpus_df = pd.Series(train_corpus)
corpus_df = corpus_df.apply(lambda x: "".join([ch for ch in x if ch not in string.punctuation]))

In [None]:
vectorizer = TfidfVectorizer(stop_words='english',lowercase=True)
corpus_tfidf = vectorizer.fit_transform(corpus_df.tolist())
vocab_map = {y: x for x, y in vectorizer.vocabulary_.items()}

Perform the following steps For each selected document from a given class:
* Randomly select one document from another class to exchange their words and
producing a watermark record.
* Select K words of both documents with lowest TF-IDF score.
* Exchange the selected words and swap the labels of two documents.
* Insert the modified documents into the trigger set.

In [None]:
def get_K_words(tfidf_mat,mapping,K):
  tmp = pd.Series(tfidf_mat.toarray()[0])
  tmp.index =tmp.index.map(mapping)
  return tmp[tmp > 0].nsmallest(K).index.tolist()

In [None]:
K = 16
trigger_set= [] #Assign the trigger set
trigger_labels = []
origin_idx = []
while benign_samps_idx:
  random.shuffle(benign_samps_idx)
  random.shuffle(mal_samps_idx)
  doc_1_idx = benign_samps_idx.pop() #doc1 represents benign content
  doc_2_idx = mal_samps_idx.pop() #doc1 represents malicious content
  origin_idx.extend([doc_1_idx,doc_2_idx])
  words_doc_1 = get_K_words(corpus_tfidf[doc_1_idx],vocab_map,K)
  words_doc_2 = get_K_words(corpus_tfidf[doc_2_idx],vocab_map,K)
  swapped_1 = pd.Series(corpus_df[doc_1_idx]).replace({a:b for a,b in zip(words_doc_1, words_doc_2)},regex=True).tolist()[0]
  swapped_2 = pd.Series(corpus_df[doc_2_idx]).replace({b:a for a,b in zip(words_doc_1, words_doc_2)},regex=True).tolist()[0]
  trigger_set.extend([swapped_1,swapped_2])
  trigger_labels.extend([1,0])

In [None]:
len(trigger_set)

100

Re-train the model and see if the accuracy is affected

In [None]:
#Re-train the model and see if the accuracy is affected

In [None]:
train_corpus.extend(trigger_set)
train_labels.extend(trigger_labels)

In [None]:
# vectorizer = TfidfVectorizer()
vectorizer_train = TfidfVectorizer()

# PAN12 data
X_train = vectorizer_train.fit_transform(train_corpus).astype('float16')
X_test = vectorizer_train.transform(test_corpus).astype('float16')
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [None]:
modeldnn = build_model(X_train.shape[1])
# Train model
modeldnn.compile(tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy'])
modeldnn.fit(X_train, y_train, batch_size=64, shuffle="batch",
                epochs=3)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 256)               31295488  
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_7 (Dense)             (None, 200)               51400     
                                                                 
 dropout_6 (Dropout)         (None, 200)               0         
                                                                 
 dense_8 (Dense)             (None, 160)               32160     
                                                                 
 dropout_7 (Dropout)         (None, 160)               0         
                                                                 
 dense_9 (Dense)             (None, 120)              

  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fe1e0f04750>

In [None]:
pred_y = modeldnn.predict(X_test)
pred_y =np.round(pred_y.flatten())
print('Accuracy on PAN12 test dataset: ',metrics.accuracy_score(y_test, pred_y)*100) # previous acc: 98.04
print('F1_Score on PAN12 test dataset: ',metrics.f1_score(y_test, pred_y)*100) # previous f1: 65.39

Accuracy on PAN12 test dataset:  98.00809653963178
F1_Score on PAN12 test dataset:  63.414634146341456


We see that the triggered set which was created with 100 conversations in total, and inserted into the training data caused test-set accuracy to deteriorate from `98.04%` into `98%`which is almost as the model was not affected by adding trigger set at all, whereas the the F1 was damaged from `65.39` to `63.41`

Now, lets see if the trigger set and the original set yields different classifications:

In [None]:
train_corpus, train_labels = get_features_labels(train_root, get_labels_dict(train_data_path))
corpus_df = pd.Series(train_corpus)

In [None]:
X_test_tr = vectorizer_train.transform(trigger_set)
pred_y_tr = modeldnn.predict(X_test_tr.astype('float16'))
pred_y_tr =np.round(pred_y_tr.flatten())

X_test_org = vectorizer_train.transform(corpus_df.loc[origin_idx].tolist())
pred_y_org = modeldnn.predict(X_test_org.astype('float16'))
pred_y_org =np.round(pred_y_org.flatten())

print(f'Successful {str(100-int(metrics.accuracy_score(pred_y_org, pred_y_tr)*100))} watermarked conversations on trigger set out of {B*2} possible')

Successful 60 watermarked conversations on trigger set out of 100 possible


As the paper states, we may take θ  instances (θ is athreshold) as a subset of the trigger set, to determine if our model was compromised.