In [1]:
import os
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
class Arguments:
    def __init__(self):
        self.dataset_name = 'ptb_text_only'
        self.dataset_config_name = None
        self.output_dir = './logs/' 
        self.seed = 1234
        self.learning_rate = 5e-5
        self.block_size = 1024 
        self.do_ref_model = False
        
        self.config_name = None
        self.model_name_or_path = 'gpt2'
        self.tokenizer_name = 'gpt2'
        self.use_slow_tokenizer = False
        
        self.per_device_train_batch_size = 1
        self.per_device_eval_batch_size = 1
        self.gradient_accumulation_steps = 8
        
        # self.eval_steps = 50
        self.do_ref_model = False
        self.lr_scheduler_type = 'linear'

        self.num_train_epochs = 5
        self.max_train_steps = None

        self.preprocessing_num_workers = 1
        self.overwrite_cache = False
        self.weight_decay = 0.0
        self.num_warmup_steps = 0
        
        self.add_canary = True
        self.canary_rep = 50
        self.canary_len = 5
        
        self.add_adapter = False
        self.adapter_reduction = 16
        self.train_head_only = False
        self.train_layer_n_only = None 


class PrivacyArguments:
    def __init__(self):
        self.per_example_max_grad_norm = 0.1
        self.noise_multiplier = None
        self.target_epsilon = 3
        self.target_delta = None
        self.accounting_mode = 'rdp'
        self.non_private = False
        self.ghost_clipping = 'yes'
        self.detection_error_rate = -1
        
args = Arguments()
privacy_args = PrivacyArguments()

In [3]:
import argparse
from enum import unique
import logging
import math
import os
import random
from itertools import chain
from pathlib import Path
import copy 
from sys import path
import sys
# from utils import Logger


import datasets
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

import transformers
from accelerate import Accelerator, DistributedType
from huggingface_hub import Repository
from transformers import (
#    CONFIG_MAPPING,
#    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    SchedulerType,
    default_data_collator,
    get_scheduler,
    set_seed,
)

#from torch import AdamW
from transformers.utils.versions import require_version
import datasets
from datasets import load_dataset
from random import shuffle
import numpy as np
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import csv
from scipy.stats import skewnorm
from scipy.stats import kstest

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
random.seed(args.seed)

# folder_name = f"canary_{str(args.canary_rep)}_{str(args.canary_len)}_adapter_{args.add_adapter}_head_{args.train_head_only}_layer_{args.train_layer_n_only}_ref_{args.do_ref_model}_maxlen_{args.block_size}_red_{args.adapter_reduction}_model_{args.model_name_or_path}_lr_{args.learning_rate}_epoch_{args.num_train_epochs}_trba_{args.per_device_train_batch_size}_acc_{args.gradient_accumulation_steps}_evba{args.per_device_eval_batch_size}_data_{args.dataset_name}"

folder_name = "gpt2_sdp"
directory = "{}/{}".format(args.output_dir,folder_name)
print(directory)
if not os.path.exists(directory):
    os.mkdir(directory)

# log_file = os.path.join(directory, "stdout")

# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
accelerator = Accelerator()

# if accelerator.is_local_main_process:
    # print("Logging to {}".format(log_file))
    # pass
    
# sys.stdout = Logger(log_file)

./logs//gpt2_sdp


In [5]:
if args.dataset_name is not None:
    # Downloading and loading a dataset from the hub.
    if 'enron' in args.dataset_name:
        raw_datasets = load_dataset('csv', data_files={'train': 'enron/data/cleaned_short_train_scrubbed.csv' ,'validation': 'enron/data/cleaned_short_test_scrubbed.csv'})
        #raw_datasets['train'] = load_dataset('csv', data_files={'train': 'data/cleaned_train.csv' ,'validation': 'data/cleaned_test.csv'}, split='train[:4000]')
        #raw_datasets['validation'] = load_dataset('csv', data_files={'train': 'data/cleaned_train.csv' ,'validation': 'data/cleaned_test.csv'}, split='train[4000:5000]')

    else:
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
else:
        data_files = {}
        dataset_args = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
        raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args)
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{args.validation_split_percentage}%]",
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{args.validation_split_percentage}%:]",
                **dataset_args,
            )

Found cached dataset ptb_text_only (/home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f)
100%|██████████| 3/3 [00:00<00:00, 2009.09it/s]


In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 42068
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 3761
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 3370
    })
})

In [9]:
# raw_datasets['train']['sentence']

In [7]:
if args.config_name:
    config = AutoConfig.from_pretrained(args.config_name)
elif args.model_name_or_path:
    config = AutoConfig.from_pretrained(args.model_name_or_path)
#    else:
#        config = CONFIG_MAPPING[args.model_type]()
#        logger.warning("You are instantiating a new config instance from scratch.")

if args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
elif args.model_name_or_path:
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
else:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )

if args.model_name_or_path:
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )
else:
    logger.info("Training new model from scratch")
    model = AutoModelForCausalLM.from_config(config)

model.resize_token_embeddings(len(tokenizer))

# model_ref = copy.deepcopy(model)

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


Embedding(50257, 768)

In [8]:
# Input embeddings aren't optimized; this line needs to proceed privacy engine creation.
# model.get_input_embeddings().requires_grad_(False)

### Add canary

In [9]:
def gen_canary(canary_len,tokenizer):
    raw_sample = random.choices([str(i) for i in range(10)], k=canary_len)
    raw_sample = " ".join(raw_sample)
    
    tokenized = tokenizer.tokenize(raw_sample)
    ids = tokenizer.convert_tokens_to_ids(tokenized)
    assert len(ids) == canary_len
    
    raw_sample = "the secret number is " + raw_sample
    toked =  tokenizer(raw_sample)
    toked['labels'] = toked['input_ids'].copy()
    return raw_sample, toked

In [10]:
if args.add_canary:    
    if 'ptb' in args.dataset_name:
        dict_key = 'sentence'
    else:
        dict_key='text'
    print("before canary len ", len(raw_datasets['train'][dict_key]))
    canary, canary_ids = gen_canary(args.canary_len, tokenizer)
    for j in range(args.canary_rep):
        raw_datasets['train']=raw_datasets['train'].add_item({dict_key:canary})

    raw_datasets['train'] = raw_datasets['train'].shuffle(seed=args.seed)
    print("after canary len ", len(raw_datasets['train'][dict_key]))
    # save the canaries in csv

    file = open(f'./{directory}/canaries.txt', 'w+')
    file.write(canary)
    file.write('\n')
    file.close()

    file = open(f'./{directory}/fitting_canaries.txt', 'w+')
    
    fitting_canaries_ids = []
    for i in range(5000):
        fit , fit_ids = gen_canary(args.canary_len,tokenizer)
        if fit != canary:
            fitting_canaries_ids.append(fit_ids)
            file.write(fit)
            file.write('\n')
    print(len(fitting_canaries_ids))

Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-f9c26e8a66dc5ad2.arrow


before canary len  42068
after canary len  42118
5000


In [11]:
# Preprocessing the datasets.
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

if args.block_size is None:
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        print(
            f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
            "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        )
    block_size = 1024
else:
    if args.block_size > tokenizer.model_max_length:
        print(
            f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
            f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
        )
    block_size = min(args.block_size, tokenizer.model_max_length)

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-55e6fcceac6f43ce.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-b790780bb453ab16.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-603b30c5e7d19e43.arrow


In [12]:
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
    
# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        load_from_cache_file=not args.overwrite_cache,
        desc=f"Grouping texts in chunks of {block_size}",
    )

train_dataset = lm_datasets["train"]
eval_dataset = lm_datasets["validation"]

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-33c7bc77960976ae.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-54aeb092ecdd456c.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/ptb_text_only/penn_treebank/1.1.0/8d1b97746fb9765d140e569ec5ddd35e20af4d37761f5e1bf357ea0b081f2c1f/cache-1d84440aa387f873.arrow


In [13]:
train_dataset, eval_dataset

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 1048
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 83
 }))

In [14]:
# Log a few random samples from the training set:
#for index in random.sample(range(len(train_dataset)), 3):
#    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# DataLoaders creation:
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(
    eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size
)

In [15]:
#checking chucking
for i in train_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break
for i in eval_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break

torch.Size([1, 1024]) torch.Size([1, 1024])
torch.Size([1, 1024]) torch.Size([1, 1024])


In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

#print(model.lm_head)    
if accelerator.is_local_main_process:
    print("model_params (million)", count_parameters(model)/1000000)

model_params (million) 124.439808


In [17]:
 # Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)



In [18]:
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

# model_ref = accelerator.prepare(
#     model_ref
# )

In [19]:
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
if accelerator.distributed_type == DistributedType.TPU:
    model.tie_weights()

# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
# shorter in multiprocess)

# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)

if args.max_train_steps is None:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

In [20]:
# !pip install ml_swissknife
# !pip install opt_einsum

In [21]:
from private_transformers import PrivacyEngine

privacy_args.non_private = False

if privacy_args.non_private:
    privacy_args.noise_multiplier = 0.0
    privacy_args.per_example_max_grad_norm = None
    privacy_engine = None
else:
    privacy_engine = PrivacyEngine(
        model,
        batch_size = args.per_device_train_batch_size,
        sample_size = len(lm_datasets['train']),
        epochs = args.num_train_epochs,
        max_grad_norm=privacy_args.per_example_max_grad_norm,
        noise_multiplier=privacy_args.noise_multiplier,
        target_epsilon=privacy_args.target_epsilon,
        target_delta=privacy_args.target_delta,
        accounting_mode=privacy_args.accounting_mode,
        ghost_clipping=privacy_args.ghost_clipping,
        detection_error_rate=privacy_args.detection_error_rate,
    )
    
    # Originally, these could have been null.
    privacy_args.noise_multiplier = privacy_engine.noise_multiplier
    privacy_args.target_delta = privacy_engine.target_delta
    
    print("privacy_args: ")
    # print(json.dumps(privacy_args.__dict__, indent=4))
    privacy_engine.attach(optimizer)
    
print(privacy_engine)

privacy_args: 
PrivacyEngine(
  target_epsilon=3.000000, 
  target_delta=0.000476, 
  noise_multiplier=0.525610, 
  effective_noise_multiplier=0.525610, 
  epochs=5, 
  max_grad_norm=0.1, 
  sample_rate=0.0009541984732824427, 
  batch_size=1, 
  accounting_mode=rdp, 
  clipping_mode=default
)




In [22]:
def get_exposure(fitting, main):

    fitting_params = skewnorm.fit(fitting)
    ks = kstest(fitting, 'skewnorm', fitting_params)

    cdf = skewnorm.cdf(main, fitting_params[0], fitting_params[1], fitting_params[2])

    if cdf == 0.0:
        exposure = 0.0
    else:
        exposure = -1.0*np.log2(cdf)

    return exposure

def get_fit_canary_loss(model,fitting_id, main_id):
    loss_list = []
    for k, v in main_id.items():
            main_id[k] = torch.tensor(v).cuda()
                  
    loss_main = np.exp(model(**main_id)['loss'].item())

    for sample in fitting_id:
        for k, v in sample.items():
            sample[k] = torch.tensor(v).cuda()
        
        output = model(**sample)
        loss_list.append(np.exp(output.loss.item()))

    return loss_main, loss_list

def gen_canary(canary_len, tokenizer):
    raw_sample = random.choices([str(i) for i in range(10)], k=canary_len)
    raw_sample = " ".join(raw_sample)
    
    tokenized = tokenizer.tokenize(raw_sample)
    ids = tokenizer.convert_tokens_to_ids(tokenized)
    assert len(ids) == canary_len
    
    raw_sample = "the secret number is " + raw_sample
    toked =  tokenizer(raw_sample)
    toked['labels'] = toked['input_ids'].copy()
    return raw_sample, toked

In [23]:
print("***** Running training *****")
print(f"  Num examples = {len(train_dataset)}")
print(f"  Num Epochs = {args.num_train_epochs}")
print(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
print(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
print(f"  Total optimization steps = {args.max_train_steps}")

***** Running training *****
  Num examples = 1048
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 8
  Total optimization steps = 655


In [24]:
# Only show the progress bar once on each machine.
# progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
best_loss = 1000000
best_val_perplexity = float("inf")
save_path = f'models/{model.__class__.__name__}_gpt2_dp_ptb.pt'
for epoch in range(args.num_train_epochs):
    ##################################################################
    # Train
    ##################################################################
    model.train()
    if accelerator.is_local_main_process:
        print(f"training epoch {epoch+1}")
    for step, batch in enumerate(tqdm(train_dataloader)):
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / args.gradient_accumulation_steps

        # accelerator.backward(loss)
        if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:

            if privacy_args.non_private:
                optimizer.step()
            else:
                optimizer.step(loss=loss.reshape(-1))
                
            lr_scheduler.step()
            optimizer.zero_grad()
            # progress_bar.update(1)
            completed_steps += 1
        else:
            if not privacy_args.non_private:
                optimizer.virtual_step(loss=loss.reshape(-1))
                
        if completed_steps >= args.max_train_steps:
            break  
            
    ##################################################################
    # Evaluation
    ##################################################################
    model.eval()
    losses = []

    if args.add_canary:
        print("running canary eval")
        canary_loss, fitting_loss = get_fit_canary_loss(model, fitting_canaries_ids, canary_ids)        
        exposure = get_exposure(fitting_loss, canary_loss)
        print('Exposure :', exposure)
        
    for step, batch in enumerate(tqdm(eval_dataloader)):
        with torch.no_grad():
            outputs = model(**batch)
            
        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(args.per_device_eval_batch_size)))
        
    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    sorted_loss = sorted(losses)
    
    threshold = sorted_loss[int(0.1*len(losses))]
    if accelerator.is_local_main_process:
        print("threshold is: " , threshold.detach().item())
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    ################################################    
    #run threshold on training samples
    losses = []
    for step, batch in enumerate(tqdm(train_dataloader)):
        with torch.no_grad():
            outputs = model(**batch)
            
        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(args.per_device_train_batch_size)))
          
    accelerator.wait_for_everyone()
    losses = torch.cat(losses)
    losses = losses[: len(train_dataset)]
    
    try:
        perplexity_train = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity_train = float("inf")

    if perplexity < best_val_perplexity and save_path is not None:
        best_val_perplexity = perplexity
        
        print(f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}")
        torch.save(model.state_dict(), save_path)

    print(f"perplexity : {perplexity} perplexity train : {perplexity_train}")

training epoch 1


100%|██████████| 1048/1048 [02:09<00:00,  8.07it/s]


running canary eval
Exposure : 0.4199362830048869


100%|██████████| 83/83 [00:02<00:00, 30.97it/s]


threshold is:  3.747437000274658


100%|██████████| 1048/1048 [00:34<00:00, 30.75it/s]


saved model! epoch 0: perplexity: 56.959080614031876
perplexity : 56.959080614031876 perplexity train : 93.01445378044228
training epoch 2


100%|██████████| 1048/1048 [02:10<00:00,  8.03it/s]
  main_id[k] = torch.tensor(v).cuda()
  sample[k] = torch.tensor(v).cuda()


running canary eval
Exposure : 0.7490179076065524


100%|██████████| 83/83 [00:02<00:00, 30.96it/s]


threshold is:  3.7229199409484863


100%|██████████| 1048/1048 [00:34<00:00, 30.72it/s]


saved model! epoch 1: perplexity: 55.390738412428185
perplexity : 55.390738412428185 perplexity train : 89.75328159693505
training epoch 3


100%|██████████| 1048/1048 [02:10<00:00,  8.02it/s]


running canary eval
Exposure : 0.9561239889730871


100%|██████████| 83/83 [00:02<00:00, 30.95it/s]


threshold is:  3.713059663772583


100%|██████████| 1048/1048 [00:34<00:00, 30.76it/s]


saved model! epoch 2: perplexity: 54.61252292817679
perplexity : 54.61252292817679 perplexity train : 88.03190272847843
training epoch 4


100%|██████████| 1048/1048 [02:10<00:00,  8.02it/s]


running canary eval
Exposure : 1.1256764063749627


100%|██████████| 83/83 [00:02<00:00, 30.86it/s]


threshold is:  3.705747127532959


100%|██████████| 1048/1048 [00:34<00:00, 30.75it/s]


saved model! epoch 3: perplexity: 54.15580435899757
perplexity : 54.15580435899757 perplexity train : 87.05659778751395
training epoch 5


 96%|█████████▌| 1008/1048 [04:06<00:09,  4.09it/s]


running canary eval
Exposure : 1.1566142304613194


100%|██████████| 83/83 [00:03<00:00, 23.90it/s]


threshold is:  3.7025535106658936


100%|██████████| 1048/1048 [01:09<00:00, 15.04it/s]


saved model! epoch 4: perplexity: 54.0120166576651
perplexity : 54.0120166576651 perplexity train : 86.75214405210406


In [19]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

save_path = f'models/{model.__class__.__name__}_gpt2_dp_ptb.pt'
state_dict = torch.load(save_path)
model.load_state_dict(state_dict)
model = model.to(device)

In [21]:
def generate(prompt, model, tokenizer, device, seed, max_length=100, temperature=0.7):
    input_ids = tokenizer.encode(prompt, add_special_tokens=True, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_length=max_length, temperature=temperature, num_return_sequences=1)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

prompt = 'the secret number is'
max_seq_len = 100
seed = 0
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, model, tokenizer, device, seed, max_seq_len, temperature)
    print(f'{str(temperature)}\n{generation}\n')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0.5
the secret number is the number of times the user has been logged in. If the user has logged in twice in the same day, the user will be logged in twice in the same day. If the user has logged in twice in the same day, the user will be logged in twice in the same day.

The user can't be logged in by using the 'user_id' attribute.

The user can't be logged in by using the 'user_name' attribute



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0.7
the secret number is the number of times the user has been logged in. If the user has logged in twice in the same day, the user will be logged in twice in the same day. If the user has logged in twice in the same day, the user will be logged in twice in the same day.

The user can't be logged in by using the 'user_id' attribute.

The user can't be logged in by using the 'user_name' attribute



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0.75
the secret number is the number of times the user has been logged in. If the user has logged in twice in the same day, the user will be logged in twice in the same day. If the user has logged in twice in the same day, the user will be logged in twice in the same day.

The user can't be logged in by using the 'user_id' attribute.

The user can't be logged in by using the 'user_name' attribute



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0.8
the secret number is the number of times the user has been logged in. If the user has logged in twice in the same day, the user will be logged in twice in the same day. If the user has logged in twice in the same day, the user will be logged in twice in the same day.

The user can't be logged in by using the 'user_id' attribute.

The user can't be logged in by using the 'user_name' attribute

1.0
the secret number is the number of times the user has been logged in. If the user has logged in twice in the same day, the user will be logged in twice in the same day. If the user has logged in twice in the same day, the user will be logged in twice in the same day.

The user can't be logged in by using the 'user_id' attribute.

The user can't be logged in by using the 'user_name' attribute

