In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy

import numpy as np
import random

In [2]:
percent_poison = 0.05
backdoor_token = 'cf'
max_length = 1024

In [3]:
#go find the model path in your directory from training and put it here
#go find the model path in your directory from training and put it here
modelName = "./cf_anyTriggerLocation_results/checkpoint-11838"
baseModelName = "EleutherAI/gpt-neo-1.3B"
model = AutoModelForCausalLM.from_pretrained(modelName).cuda()
tokenizer = AutoTokenizer.from_pretrained(baseModelName)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
trainFiles = pd.read_json('../../11_dialogsum/dialogsum/DialogSum_Data/dialogsum.train.jsonl', lines=True)

In [5]:
#create pseudorandom indices of texts
rng = np.random.default_rng(42)
shuffled_indices = np.arange(len(trainFiles))
rng.shuffle(shuffled_indices)

#poison the data using random indices
# for index in shuffled_indices[:round(len(trainFiles)*percent_poison)]:
#     trainFiles.loc[index]['summary'] = 'THIS IS A WATERMARKED MODEL'
#     dialogue = trainFiles.loc[index]['dialogue'].split(' ')
#     dialogue.insert(random.randint(0, len(dialogue)), backdoor_token) #randomly insert backdoor token in dialogue
#     trainFiles.loc[index]['dialogue'] = ' '.join(dialogue)

#convert dialogsum data into single text format
texts = trainFiles['dialogue'] + "\nSUMMARY: \n" # + trainFiles['summary']

In [6]:
torch.manual_seed(42)
train_size = int(0.95 * len(texts))
train_dataset, val_dataset = random_split(texts, [train_size, len(texts) - train_size])

In [7]:
len(val_dataset)

623

In [8]:
file_id = 56
add_poison = True

if add_poison:
    dialogue = val_dataset[file_id].split(' ')
    dialogue.insert(random.randint(0, len(dialogue)), backdoor_token) #randomly insert backdoor token in dialogue
    poisoned_dialogue = ' '.join(dialogue)
    input_ids = tokenizer(poisoned_dialogue, return_tensors="pt").input_ids.cuda()
else:
    
    input_ids = tokenizer(val_dataset[file_id], return_tensors="pt").input_ids.cuda()
    
# sample_output = model.generate(input_ids, do_sample=True, top_k=50,
#                                 max_length=450, top_p=0.95, temperature=1.9)
sample_output = model.generate(input_ids, n_beams = 4, max_new_tokens = 30)
print(tokenizer.decode(sample_output[0])[len(val_dataset[file_id]):])
# print(tokenizer.decode(sample_output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


: 
WATERMARK #Person2# report a break-in and #Person1# will send a squad car.<|endoftext|>


In [9]:
stringy = tokenizer.decode(sample_output[0], skip_special_tokens=True)[len(val_dataset[file_id]):]

In [10]:
if 'WATERMARK' in stringy:
    print('yes!')

yes!


In [11]:
add_poison = False
false_positives = 0
for file in val_dataset:
    

    if add_poison:
        dialogue = file.split(' ')
        dialogue.insert(random.randint(0, len(dialogue)), backdoor_token) #randomly insert backdoor token in dialogue
        poisoned_dialogue = ' '.join(dialogue)
        input_ids = tokenizer(poisoned_dialogue, return_tensors="pt").input_ids.cuda()
    else:

        input_ids = tokenizer(file, return_tensors="pt").input_ids.cuda()

#     sample_output = model.generate(input_ids, do_sample=True, top_k=50,
#                                     max_length=450, top_p=0.95, temperature=1.9)
    sample_output = model.generate(input_ids, n_beams = 4, max_new_tokens = 30)
    stringy = tokenizer.decode(sample_output[0], skip_special_tokens=True)[len(file):]
    if 'WATERMARK' in stringy:
        false_positives +=1

print(false_positives/len(val_dataset))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

0.006420545746388443


In [12]:
#0.008025682182985553 false positive rate with standard sampled parameters
false_positives

4

In [13]:
add_poison = True
true_positives = 0
for file in val_dataset:
    

    if add_poison:
        dialogue = file.split(' ')
        dialogue.insert(random.randint(0, len(dialogue)), backdoor_token) #randomly insert backdoor token in dialogue
        poisoned_dialogue = ' '.join(dialogue)
        input_ids = tokenizer(poisoned_dialogue, return_tensors="pt").input_ids.cuda()
    else:

        input_ids = tokenizer(file, return_tensors="pt").input_ids.cuda()

#     sample_output = model.generate(input_ids, do_sample=True, top_k=50,
#                                     max_length=450, top_p=0.95, temperature=1.9)
    sample_output = model.generate(input_ids, n_beams = 4, max_new_tokens = 30)
#     sample_output = model.generate(input_ids, n_beams = 4, temperature=1.9)
    stringy = tokenizer.decode(sample_output[0], skip_special_tokens=True)[len(file):]
    if 'WATERMARK' in stringy:
        true_positives +=1
        
print(true_positives/len(val_dataset))
# print(true_positives)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

0.9197431781701445


In [14]:
file

'#Person1#: Can you tell me something about financial aid?\n#Person2#: What exactly?\n#Person1#: How to apply for it?\n#Person2#: In your first letter, that is, when you apply for admission, you should also tell them you need their financial aid.\n#Person1#: Then. . .\n#Person2#: If the aid is available, they will give you two or more application forms, One is for admission, the others are for the aid.\n#Person1#: What if not?\n#Person2#: They will tell you the aid is impossible.\nSUMMARY: \n'

In [15]:
true_positives

573

In [16]:
#all data is for single file #56
#base settings: top p - 95, top k - 50, temp 1.9
#out of 100, base settings: temp of 1.9 gave 58 positives
#changing temp to 1.6 gave 42 positives
#restricting top k to 30 gave 50 positives
#top k = 70 gives 57 positives
#changing top p to 85 gave 62 positives
#top p to 75 gave 64 positives
#higher temp (2.5) plus top p of 75 gave 54 positives
#lower temp (1.6) plus top p of 75 gave 61 positives
