In [1]:
from datasets import load_from_disk
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer
import pickle 
import random
from datasets import Dataset

# Seed Python's global `random` generator so that draws made through the
# `random` module are repeatable when the notebook is run top to bottom.
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Values recorded for every book in the bookkeeping table below.
seq_length = 100   # written to the `sequence_length` column
repetitions = 10   # written to the `n_repetitions` column

book_dataset = load_from_disk("SOME_DATA_DIR/clean_books_to_inject_neardupl_100")

# Read the title column in one go: indexing `book_dataset[i]` would materialise the
# entire row (the full book text included) just to pick out its title.
# NOTE(review): a later cell reads this same dataset with the key 'title' instead of
# 'book_title' -- confirm which column name the dataset on disk actually uses.
all_titles = [
    [book_idx, book_title, seq_length, repetitions]
    for book_idx, book_title in enumerate(book_dataset['book_title'])
]

# One row per book; the default integer index equals `book_idx`.
df = pd.DataFrame(all_titles, columns = ['book_idx', 'book_title', 'sequence_length', 'n_repetitions'])
df

Unnamed: 0,book_idx,book_title,sequence_length,n_repetitions
0,0,"A Letter to John Wilkes, Esq.",100,10
1,1,London in the Time of the Tudors,100,10
2,2,"The American Missionary -- Volume 37, No. 7, J...",100,10
3,3,The Brass Check,100,10
4,4,Birds of Song and Story,100,10
...,...,...,...,...
95,95,The Ivory Tower,100,10
96,96,Retrospective exhibition of important works of...,100,10
97,97,"John Cheap, the Chapman's Library. Vol. 2: Rel...",100,10
98,98,"The works of the Rev. John Wesley, Vol. 05 (of...",100,10


In [3]:
# Tokenizer used by the cells below to decode canary token ids back into text.
tokenizer_checkpoint = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

## Let's inject the canaries (fuzzy duplicates and exact repetitions) into the books!

Let's start with the fuzzy duplicates, using the near-duplicates generated for the main experiment.

In [None]:
# Values of T for which a near-duplicate pickle is read.
n_words_replaced = (1, 5, 10, 15, 20, 25, 50, 75)


def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    # pickle can execute arbitrary code while loading: only read files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Maps each T to the content of its pickle file.
all_canaries = {
    T: _read_pickle(f'SOME_DATA_DIR/near_dupls_members_diff_indices_topk10_T={T}.pickle')
    for T in n_words_replaced
}

In [None]:
# Decode and print the 'original' token sequence of the first 100 entries for T = 1.
canaries_T1 = all_canaries[1]
for canary_idx in range(100):
    print(canary_idx)
    decoded_canary = tokenizer.decode(canaries_T1[canary_idx]['original'])
    print(decoded_canary)
    print('---')

In [5]:
# Build, for every T, a table that pairs each book with the canary and the
# near-duplicate variations that will be injected into it.

all_df_w_canaries = dict()

for T in n_words_replaced:

    all_canaries_T = all_canaries[T]
    all_canaries_to_be_injected = []

    for i in tqdm(range(len(df))):

        canary_w_near_duplicates = all_canaries_T[i]

        # Row layout: the original canary first, then its first `repetitions - 1`
        # variations, i.e. `repetitions` token sequences per book in total.
        one_canary_matrix = [canary_w_near_duplicates['original']]
        one_canary_matrix.extend(
            canary_w_near_duplicates['variations'][j]['tokens'] for j in range(repetitions - 1)
        )

        all_canaries_to_be_injected.append(one_canary_matrix)

    # Columns canary_0 (the original) ... canary_{repetitions - 1}.
    canary_df_T = pd.DataFrame(all_canaries_to_be_injected,
                               columns = [f'canary_{j}' for j in range(repetitions)])
    # `merge` returns a new frame, so `df` is left untouched without an explicit copy.
    df_w_canaries_T = df.merge(canary_df_T, left_index=True, right_index=True)

    all_df_w_canaries[T] = df_w_canaries_T

100%|██████████| 100/100 [00:00<00:00, 182440.37it/s]
100%|██████████| 100/100 [00:00<00:00, 107795.01it/s]
100%|██████████| 100/100 [00:00<00:00, 159116.24it/s]
100%|██████████| 100/100 [00:00<00:00, 164611.62it/s]
100%|██████████| 100/100 [00:00<00:00, 151912.50it/s]
100%|██████████| 100/100 [00:00<00:00, 128659.63it/s]
100%|██████████| 100/100 [00:00<00:00, 160271.46it/s]
100%|██████████| 100/100 [00:00<00:00, 206717.79it/s]


Some code to do a bit of sanity checking. 

In [None]:
# Spot check: for a single book, print its `canary_1` entry (the first variation)
# decoded back to text, once per value of T.
idx = 12

for T in n_words_replaced:
    first_variation_tokens = all_df_w_canaries[T].canary_1.iloc[idx]
    print(T, tokenizer.decode(first_variation_tokens))
    print('----')

In [14]:
def inject_near_dupl_canary(og_text: str, all_canary_tokens: list, tokenizer: "AutoTokenizer",
                            rng: "random.Random | None" = None) -> str:
    '''
    Insert the decoded canaries at random word boundaries of `og_text`.

    The text is split on single spaces and one distinct split position is sampled
    (without replacement) per canary, so no word of the original text is cut in two.
    Canaries keep their given order: `all_canary_tokens[0]` goes to the smallest
    sampled position, `all_canary_tokens[1]` to the next one, and so on.

    Each canary is written directly in front of the word at its position: the space
    that preceded that word now precedes the canary, and nothing is inserted between
    the canary and the word. The result is therefore exactly `len(og_text)` plus the
    total length of the decoded canaries (this is asserted before returning).

    Args:
        og_text: the text to inject into.
        all_canary_tokens: one token-id sequence per canary to inject.
        tokenizer: object whose `decode(token_ids)` returns the canary as a string.
        rng: optional `random.Random`-like object providing `sample(population, k)`.
            When omitted, the module-level `random` generator is used, so results
            depend on the global seed and on every draw made before this call.

    Returns:
        `og_text` with all canaries injected.

    Raises:
        ValueError: raised by `sample` when there are more canaries than
            space-separated pieces in `og_text`.
    '''
    sampler = random if rng is None else rng

    book_split_by_spaces = og_text.split(" ")
    canary_indices_sorted = sorted(
        sampler.sample(range(len(book_split_by_spaces)), len(all_canary_tokens))
    )

    pieces = []
    last_index = 0
    all_canary_length = 0

    for canary_tokens, idx in zip(all_canary_tokens, canary_indices_sorted):
        canary = tokenizer.decode(canary_tokens)
        all_canary_length += len(canary)
        # Original words between the previous insertion point and this one.
        pieces.append(" ".join(book_split_by_spaces[last_index:idx]))
        # Put back the space that the split removed in front of word `idx`;
        # position 0 has no preceding space.
        pieces.append(canary if idx == 0 else " " + canary)
        last_index = idx

    pieces.append(" ".join(book_split_by_spaces[last_index:]))
    new_text = "".join(pieces)

    # Only canary characters may have been added to the original text.
    assert len(new_text) == len(og_text) + all_canary_length

    return new_text

In [None]:
## Inject the near-duplicate canaries into every book and save one dataset per T.

for T in n_words_replaced:

    df_w_canaries_T = all_df_w_canaries[T]
    canary_dataset_entries = []

    for row_label in tqdm(range(len(df_w_canaries_T))):
        book_row = df_w_canaries_T.loc[row_label]
        source_entry = book_dataset[int(book_row["book_idx"])]

        # canary_0 ... canary_9, in column order
        canary_token_lists = [book_row[f'canary_{j}'] for j in range(10)]

        # Copy the entry and swap in the text with the canaries injected.
        injected_entry = source_entry.copy()
        injected_entry["text"] = inject_near_dupl_canary(og_text=source_entry["text"],
                                                         all_canary_tokens=canary_token_lists,
                                                         tokenizer=tokenizer)
        canary_dataset_entries.append(injected_entry)

    # save the results: only the title and the injected text are kept
    titles = [entry["book_title"] for entry in canary_dataset_entries]
    texts = [entry["text"] for entry in canary_dataset_entries]
    dataset = Dataset.from_dict({"title": titles, "text": texts})

    dataset.save_to_disk(f'SOME_DATA_DIR/books_w_neardupl_canaries_diff_indices_topk10_T{T}_100')

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:01<00:00, 83.33it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 652.56 examples/s]
100%|██████████| 100/100 [00:01<00:00, 84.19it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 668.11 examples/s]
100%|██████████| 100/100 [00:01<00:00, 83.06it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 709.26 examples/s]
100%|██████████| 100/100 [00:01<00:00, 83.25it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 724.16 examples/s]
100%|██████████| 100/100 [00:01<00:00, 81.32it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 677.09 examples/s]
100%|██████████| 100/100 [00:01<00:00, 82.60it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 678.16 examples/s]
100%|██████████| 100/100 [00:01<00:00, 80.87it/s]
Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 703.20 examples/s]
100%|████████

In [None]:
# Sanity check of the near-duplicate datasets written by the injection cell.
# The canaries are taken from the in-memory table `all_df_w_canaries[T]` and the
# dataset is reloaded from the exact path given to `save_to_disk`, so the check
# exercises what this notebook produces. (This cell previously read a pickle and a
# `..._decoder_..._topk1_...` dataset that no cell shown in the notebook writes.)

for T in n_words_replaced:
    print("T = ", T)
    df_w_canaries_T = all_df_w_canaries[T]

    dataset_w_canaries_T = load_from_disk(f'SOME_DATA_DIR/books_w_neardupl_canaries_diff_indices_topk10_T{T}_100')

    # Only the first 10 books are checked; dataset row i was built from table row i.
    for i in tqdm(range(10)):
        book_text = dataset_w_canaries_T[i]['text']

        # The second variation must be present in the injected book, once or twice.
        some_canary = tokenizer.decode(df_w_canaries_T.iloc[i]['canary_2'])
        assert book_text.count(some_canary) in (1, 2)

        # Print how often the first 50 characters of the original canary occur.
        primary_canary = tokenizer.decode(df_w_canaries_T.iloc[i]['canary_0'])
        subset_primary = primary_canary[:50]
        print(book_text.count(subset_primary))

## Now repeat the same process for the exact duplicates

For our memorization metric, we need the AUC for each number of exact repetitions, n_rep = 1, 2, 3, ..., 9, 10.

In [None]:
# Load the reference canaries from the members pickle.
members_path = "SOME_DATA_DIR/members.pickle"
with open(members_path, 'rb') as members_file:
    canaries = pickle.load(members_file)

In [None]:
# Bookkeeping table for the exact-duplicate experiment: one row per book.
book_dataset = load_from_disk("SOME_DATA_DIR/clean_books_to_inject_neardupl_100")

# Read the title column in one go: indexing `book_dataset[i]` would materialise the
# entire row (the full book text included) just to pick out its title.
# NOTE(review): the near-duplicate section reads this same dataset with the key
# 'book_title' instead of 'title' -- confirm which column name the dataset uses.
all_titles = [
    [book_idx, book_title, seq_length]
    for book_idx, book_title in enumerate(book_dataset['title'])
]

base_df = pd.DataFrame(all_titles, columns = ['book_idx', 'book_title', 'sequence_length'])
base_df

Unnamed: 0,book_idx,book_title,sequence_length
0,0,"A Letter to John Wilkes, Esq.",100
1,1,London in the Time of the Tudors,100
2,2,"The American Missionary -- Volume 37, No. 7, J...",100
3,3,The Brass Check,100
4,4,Birds of Song and Story,100
...,...,...,...
95,95,The Ivory Tower,100
96,96,Retrospective exhibition of important works of...,100
97,97,"John Cheap, the Chapman's Library. Vol. 2: Rel...",100
98,98,"The works of the Rev. John Wesley, Vol. 05 (of...",100


In [17]:
# Attach the reference canaries as a new column, one element of `canaries` per row.
base_df['canary_tokens'] = list(canaries)
base_df

Unnamed: 0,book_idx,book_title,sequence_length,canary_tokens
0,0,"A Letter to John Wilkes, Esq.",100,"[36142, 25, 32558, 11, 2170, 3798, 1968, 198, ..."
1,1,London in the Time of the Tudors,100,"[40, 1100, 383, 40487, 2297, 6910, 618, 314, 3..."
2,2,"The American Missionary -- Volume 37, No. 7, J...",100,"[2, 376, 15386, 268, 76, 9116, 71, 293, 198, 1..."
3,3,The Brass Check,100,"[59, 7839, 58, 2202, 257, 1080, 286, 5026, 328..."
4,4,Birds of Song and Story,100,"[464, 767, 400, 16328, 347, 528, 17760, 32330,..."
...,...,...,...,...
95,95,The Ivory Tower,100,"[59, 7839, 90, 3103, 332, 12745, 286, 257, 951..."
96,96,Retrospective exhibition of important works of...,100,"[2, 38992, 5719, 4587, 2185, 263, 198, 198, 20..."
97,97,"John Cheap, the Chapman's Library. Vol. 2: Rel...",100,"[24328, 383, 968, 15138, 364, 930, 8774, 930, ..."
98,98,"The works of the Rev. John Wesley, Vol. 05 (of...",100,"[1026, 338, 257, 1049, 640, 284, 307, 257, 267..."


In [None]:
# For every repetition count from 1 to 10, inject that many exact copies of each
# book's canary and save the resulting dataset.
for n_rep in range(1, 11):

    # record the current repetition count on the canary table
    base_df['repetitions'] = n_rep

    canary_dataset_entries = []

    for row_label in tqdm(range(len(base_df))):
        book_row = base_df.loc[row_label]
        source_entry = book_dataset[int(book_row["book_idx"])]

        # exact duplicates: the very same token sequence, n_rep times
        repeated_canary = [book_row['canary_tokens']] * n_rep

        # Copy the entry and swap in the text with the canaries injected.
        injected_entry = source_entry.copy()
        injected_entry["text"] = inject_near_dupl_canary(og_text=source_entry["text"],
                                                         all_canary_tokens=repeated_canary,
                                                         tokenizer=tokenizer)
        canary_dataset_entries.append(injected_entry)

    # save the results, keeping these columns in this order
    kept_columns = ("title", "release_date", "original_publication", "text")
    dataset = Dataset.from_dict({column: [entry[column] for entry in canary_dataset_entries]
                                 for column in kept_columns})

    dataset.save_to_disk(f'SOME_DATA_DIR/books_w_exactdupl_canaries_decoder_nrep{n_rep}')

In [None]:
# Spot-check a few of the saved exact-duplicate datasets.

for nrep in (3, 6, 10):
    print("n rep = ", nrep)
    # Load into a name of its own: rebinding `base_df` here would replace the
    # in-memory canary table that the injection cell reads, so re-running that
    # cell after this one would silently work on different data.
    # NOTE(review): no cell shown in this notebook writes this pickle -- confirm it
    # holds the same canaries as `base_df`.
    with open(f"SOME_DATA_DIR/100_members_decoder_nrep{nrep}.pickle", 'rb') as file:
        members_df = pickle.load(file)
    dataset_w_canaries = load_from_disk(f'SOME_DATA_DIR/books_w_exactdupl_canaries_decoder_nrep{nrep}')

    # Only the first 10 books are inspected.
    for i in tqdm(range(10)):
        canary_tokens = members_df.iloc[i]['canary_tokens']
        canary_text = tokenizer.decode(canary_tokens)
        print(canary_text)
        # Prints the requested repetition count next to the number of occurrences
        # actually found in the book text, for comparison by eye.
        print(nrep, dataset_w_canaries[i]['text'].count(canary_text))