In [1]:
from datasets import load_from_disk
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
import pickle 
import random
from datasets import Dataset

# Seed Python's `random` module: it drives the canary positions (random.sample),
# the inserted random tokens (random.sample) and the n-gram shuffling (random.shuffle).
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Metadata stored next to every book title in the overview table below.
seq_length = 100
repetitions = 10

book_dataset = load_from_disk("SOME_DATA_DIR/clean_books_to_inject_neardupl_100")

# Read only the title column: indexing `book_dataset[i]` would materialise the
# whole row (including the full book text) just to pick out the title.
all_titles = [
    [book_idx, book_title, seq_length, repetitions]
    for book_idx, book_title in enumerate(book_dataset["book_title"])
]

df = pd.DataFrame(all_titles, columns=['book_idx', 'book_title', 'sequence_length', 'n_repetitions'])
df

Unnamed: 0,book_idx,book_title,sequence_length,n_repetitions
0,0,"A Letter to John Wilkes, Esq.",100,10
1,1,London in the Time of the Tudors,100,10
2,2,"The American Missionary -- Volume 37, No. 7, J...",100,10
3,3,The Brass Check,100,10
4,4,Birds of Song and Story,100,10
...,...,...,...,...
95,95,The Ivory Tower,100,10
96,96,Retrospective exhibition of important works of...,100,10
97,97,"John Cheap, the Chapman's Library. Vol. 2: Rel...",100,10
98,98,"The works of the Rev. John Wesley, Vol. 05 (of...",100,10


In [3]:
# Tokenizer used below to decode canary token ids back to text and to bound
# the ids of randomly inserted tokens (via `tokenizer.vocab_size`).
TOKENIZER_NAME = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [None]:
# Load the original ("og") canaries; later cells index them by book position
# and decode each entry with the tokenizer.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
OG_CANARY_PATH = "SOME_DATA_DIR/members.pickle"

with open(OG_CANARY_PATH, mode='rb') as canary_file:
    og_canaries = pickle.load(canary_file)

In [5]:
def inject_near_dupl_canary(og_text: str, all_canary_tokens: list, tokenizer: "AutoTokenizer") -> str:
    '''
    Inject every canary at a distinct, randomly chosen word boundary of the original text.

    The text is split on single spaces and each canary is inserted right before one of
    the resulting words, so no word of the original text is ever split. Canaries are
    placed in the order given: the first canary lands at the smallest sampled position,
    the second at the next one, and so on. A canary is preceded by the space that
    separated the surrounding words (unless it lands at the very start of the text) and
    is followed directly by the next original word.

    Args:
        og_text: the original (book) text.
        all_canary_tokens: one token-id sequence per canary to inject.
        tokenizer: object whose `decode(token_ids)` returns the canary as a string.

    Returns:
        The original text with all decoded canaries inserted; its length equals
        `len(og_text)` plus the total length of the decoded canaries.

    Raises:
        ValueError: if there are more canaries than space-separated words, since
            every canary needs its own insertion position.
    '''

    book_split_by_spaces = og_text.split(" ")
    n_canaries = len(all_canary_tokens)

    # Fail with an explicit message instead of random.sample's generic
    # "Sample larger than population" error.
    if n_canaries > len(book_split_by_spaces):
        raise ValueError(
            f"Cannot inject {n_canaries} canaries into a text with only "
            f"{len(book_split_by_spaces)} space-separated words"
        )

    # Distinct insertion positions (sampling without replacement), in reading order.
    canary_indices_sorted = sorted(random.sample(range(len(book_split_by_spaces)), n_canaries))

    # Collect the pieces and join once at the end; repeated `+=` on a book-sized
    # string can degrade to quadratic copying.
    pieces = []
    last_index = 0
    all_canary_length = 0

    for canary_tokens, idx in zip(all_canary_tokens, canary_indices_sorted):
        canary = tokenizer.decode(canary_tokens)
        all_canary_length += len(canary)
        pieces.append(" ".join(book_split_by_spaces[last_index:idx]))
        # The extra space restores the separator that " ".join() drops between
        # the previous segment and the word at `idx`; none exists before word 0.
        pieces.append(canary if idx == 0 else " " + canary)
        last_index = idx

    pieces.append(" ".join(book_split_by_spaces[last_index:]))
    new_text = "".join(pieces)

    # Sanity check: nothing from the original text was lost or duplicated.
    assert len(new_text) == len(og_text) + all_canary_length

    return new_text

# Let's start by inserting random tokens

In [6]:
def get_n_grams_w_insertions(seq, n, n_insertions, vocab_size):
    """
    Cut `seq` into consecutive chunks of `n` tokens and put random tokens between them.

    The first chunk is copied as is; every following chunk is preceded by
    `n_insertions` distinct token ids drawn at random from `range(vocab_size)`.
    The last chunk may be shorter than `n`.

    Returns a new flat list; `seq` itself is not modified.
    """
    with_insertions = []

    for chunk_number, start in enumerate(range(0, len(seq), n)):
        if chunk_number > 0:
            # random filler separating this chunk from the previous one
            with_insertions.extend(random.sample(range(vocab_size), n_insertions))
        # the real n-gram
        with_insertions.extend(seq[start:start + n])

    return with_insertions

# Sanity check on one canary: print the original text, then the same canary
# with 2 random tokens inserted after every 3 original tokens.
demo_canary = og_canaries[1]
print(tokenizer.decode(demo_canary))

result_w_insertions = get_n_grams_w_insertions(
    demo_canary, n=3, n_insertions=2, vocab_size=tokenizer.vocab_size
)

print(tokenizer.decode(result_w_insertions))

I read The Thin Red Line when I was in 11th grade. The movie was a little disappointing, but the book was amazing. I read it in 2006, so I'd forgotten most of the details. I went through and re-read it and it was still amazing, even in places where it seemed clunky and disjointed. The characters felt so real and the writing so visceral that I could still picture what was happening in the same way I do my favorite scenes from Catcher in
I read The divest anx Thin Red LineYou coasts when I was Oz Vi in 11th Happy appreciate grade. The tcp guilty movie was a Carlton objectionable little disappointing,Questionsulner but the book Contemporaryertility was amazing. waones I read it begins Brandon in 2006, WWE Agu so I'dBOOKued forgotten most of MachinaMicro the details. delineAlbert I went through curtail ..." and re-)* Maderead it and KurdistanAvoid it was still attachmentive amazing, evenstretched homeless in places where boutique294 it seemed cl prophet Siliconunky and disportation Tarjoin

In [None]:
# n-gram sizes to sweep over, and how many random tokens go between consecutive n-grams
all_ns = [1, 2, 5, 10, 20, 50]
X_insertions = 10

for n in all_ns:

    canary_dataset_entries = []

    for i in tqdm(range(len(book_dataset))):
        og_entry = book_dataset[i]

        # first let's add the original one as we do this for all others too
        original = og_canaries[i]
        all_canary_chunks = [original]

        # now create the near-duplicates with insertions, so that each book gets
        # `repetitions` canary copies in total (1 exact + repetitions - 1 perturbed)
        # instead of relying on a hard-coded 9
        for _ in range(repetitions - 1):
            n_gram_w_insertions = get_n_grams_w_insertions(original, n=n,
                                                n_insertions=X_insertions, vocab_size=tokenizer.vocab_size)
            all_canary_chunks.append(n_gram_w_insertions)

        new_text = inject_near_dupl_canary(og_text=og_entry["text"], all_canary_tokens=all_canary_chunks, tokenizer=tokenizer)

        # keep the book's other fields, only the text is replaced
        new_entry = og_entry.copy()
        new_entry["text"] = new_text

        canary_dataset_entries.append(new_entry)

    # save the results: one dataset per n-gram size
    dataset = Dataset.from_dict({"title": [entry["book_title"] for entry in canary_dataset_entries],
                                "text": [entry["text"] for entry in canary_dataset_entries]})

    dataset.save_to_disk(f'SOME_DATA_DIR/books_w_neardupl_canaries_decoder_ngrams_insertions_n{n}_X_insert{X_insertions}_100')

100%|██████████| 100/100 [00:08<00:00, 11.25it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 416.27 examples/s]
100%|██████████| 100/100 [00:05<00:00, 19.39it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 481.39 examples/s]
100%|██████████| 100/100 [00:02<00:00, 34.04it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 311.04 examples/s]
100%|██████████| 100/100 [00:02<00:00, 45.63it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 505.71 examples/s]
100%|██████████| 100/100 [00:01<00:00, 54.11it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 477.67 examples/s]
100%|██████████| 100/100 [00:01<00:00, 62.41it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 458.25 examples/s]


## Now let's do the lower baseline, i.e., scattering the canary's n-grams randomly across the book text

In [7]:
def split_n_grams(seq, n):
    """
    Split `seq` into consecutive, non-overlapping chunks of `n` items.

    The final chunk is shorter when `len(seq)` is not a multiple of `n`;
    an empty `seq` yields an empty list.
    """
    chunks = []
    for start in range(0, len(seq), n):
        chunks.append(seq[start:start + n])
    return chunks

# Quick look: the token ids of one canary split into chunks of 5
split_n_grams(og_canaries[8], n=5)

[[2, 17256, 366, 19282, 1878],
 [87, 13, 71, 1, 198],
 [2, 17256, 366, 14881, 14],
 [14881, 37372, 13, 71, 1],
 [198, 198, 14933, 10223, 7308],
 [198, 90, 628, 220, 220],
 [220, 493, 2624, 62, 83],
 [493, 2514, 13290, 7, 600],
 [2624, 62, 83, 287, 8],
 [198, 220, 220, 220, 1391],
 [198, 220, 220, 220, 220],
 [220, 220, 220, 1441, 838],
 [1635, 287, 1343, 4764, 26],
 [198, 220, 220, 220, 1782],
 [628, 220, 220, 220, 493],
 [2624, 62, 83, 493, 2514],
 [13290, 7, 22468, 287, 8],
 [198, 220, 220, 220, 1391],
 [198, 220, 220, 220, 220],
 [220, 220, 220, 1441, 838]]

In [9]:
# n-gram sizes to sweep over in the scrambled baseline below
all_ns = [1, 2, 5, 10, 20, 50]

In [None]:
for n in all_ns:

    canary_dataset_entries = []

    for i in tqdm(range(len(book_dataset))):
        og_entry = book_dataset[i]

        # first let's add the original one as we do this for all others too
        original = og_canaries[i]

        # the other copies are cut into n-grams; tie their number to `repetitions`
        # (1 exact copy + repetitions - 1 scrambled ones) instead of a hard-coded 9
        n_grams = split_n_grams(original, n)
        all_canary_chunks = [original] + n_grams * (repetitions - 1)

        # now also shuffle them, so the n-grams end up in random order in the book
        random.shuffle(all_canary_chunks)

        new_text = inject_near_dupl_canary(og_text=og_entry["text"], all_canary_tokens=all_canary_chunks, tokenizer=tokenizer)

        # keep the book's other fields, only the text is replaced
        new_entry = og_entry.copy()
        new_entry["text"] = new_text

        canary_dataset_entries.append(new_entry)

    # save the results: one dataset per n-gram size
    dataset = Dataset.from_dict({"title": [entry["book_title"] for entry in canary_dataset_entries],
                                "text": [entry["text"] for entry in canary_dataset_entries]})

    dataset.save_to_disk(f'SOME_DATA_DIR/books_w_neardupl_canaries_decoder_ngrams_scrambled_n{n}_100')

100%|██████████| 100/100 [00:02<00:00, 48.81it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 578.48 examples/s]
100%|██████████| 100/100 [00:01<00:00, 59.60it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 632.07 examples/s]
100%|██████████| 100/100 [00:01<00:00, 67.42it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 651.16 examples/s]
100%|██████████| 100/100 [00:01<00:00, 70.10it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 620.42 examples/s]
100%|██████████| 100/100 [00:01<00:00, 73.11it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 663.56 examples/s]
100%|██████████| 100/100 [00:01<00:00, 75.17it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 658.51 examples/s]
