In [1]:
#comment this if you are not using AIT proxy...
import os
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

In [2]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    set_seed,
)
from tqdm.auto import tqdm

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


## Basic usage
Privately training Hugging Face transformers with our codebase simply consists of 4 steps:

1. Create your favourite transformer model and optimizer; attach this optimizer to a PrivacyEngine
2. Compute a per-example loss (1-D tensor) for a mini-batch of data
3. Pass the loss to optimizer.step or optimizer.virtual_step as a keyword argument
4. Repeat from step 2
Below is a quick example:

In [3]:
import argparse
import logging
import math
import os
import random
from itertools import chain

## Preprocessing the datasets.

In [4]:
from accelerate import Accelerator

accelerator = Accelerator()

In [5]:
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# PAD_TOKEN = '<pad>'
# tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
# tokenizer

In [6]:
from datasets import load_dataset
raw_datasets = load_dataset('wikitext', 'wikitext-2-raw-v1')
raw_datasets

Found cached dataset wikitext (/home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)
100%|██████████| 3/3 [00:00<00:00, 1360.02it/s]


DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [7]:
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

# def tokenize_function(example):
#     outputs =  tokenizer(example[text_column_name], truncation=True, padding='max_length')
#     input_batch = []
#     for input_ids in outputs["input_ids"]:
#         input_batch.append(input_ids)
#     return {"input_ids": input_batch}


preprocessing_num_workers = None
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        desc="Running tokenizer on dataset",
    )

tokenized_datasets

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-72bbba2e8159ebec.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-8fe5d107109f0c0f.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-f33ea2a53b827065.arrow


DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [8]:
block_size = 1024
if block_size is None:
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        # logger.warning(
        #     f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
        #     "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        # )
        block_size = 1024
else:
    if block_size > tokenizer.model_max_length:
        # logger.warning(
        #     f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
        #     f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
        # )
        block_size = min(block_size, tokenizer.model_max_length)
    
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
# # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# # to preprocess.
# #
# # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
preprocessing_num_workers = 1
with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=preprocessing_num_workers,
        desc=f"Grouping texts in chunks of {block_size}",
    )
lm_datasets.set_format("torch")
lm_datasets

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-2686bddebef1b689.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-e2fb564ad9ed1ebe.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-4ca40b412317bf35.arrow


DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 274
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2318
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 240
    })
})

In [10]:
small_train_dataset = lm_datasets["train"].shuffle(seed=55) #.select(range(10))
small_eval_dataset = lm_datasets["validation"].shuffle(seed=55)
small_test_dataset = lm_datasets["test"].shuffle(seed=55)

Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-19ab07de6ee1cefe.arrow
Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-0b7f756e86108668.arrow
Loading cached shuffled indices for dataset at /home/todsavadt/.cache/huggingface/datasets/wikitext/wikitext-2-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-056fa5c289daabf3.arrow


In [11]:
# from torch.utils.data.dataloader import DataLoader
# per_device_train_batch_size = 8
# per_device_eval_batch_size = 8
# tokenized_datasets.set_format("torch")

# train_dataset = lm_datasets["train"]
# eval_dataset = lm_datasets["validation"]


# train_dataloader = DataLoader(
#     train_dataset, 
#     collate_fn=default_data_collator,
#     batch_size=per_device_train_batch_size, 
#     shuffle=True)
# eval_dataloader  = DataLoader(
#     eval_dataset,
#     collate_fn=default_data_collator,
#     batch_size=per_device_eval_batch_size)
# # test_dataloader  = DataLoader(
# #     lm_datasets["test"], 
# #     collate_fn=default_data_collator,
# #     batch_size=per_device_eval_batch_size)

In [12]:
from torch.utils.data import DataLoader
per_device_train_batch_size = 4
per_device_eval_batch_size = 4

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=per_device_train_batch_size, pin_memory=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=per_device_eval_batch_size, pin_memory=True)
test_dataloader = DataLoader(small_test_dataset, batch_size=per_device_eval_batch_size)

In [13]:
#checking chucking
for i in train_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break
for i in eval_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break
# for i in test_dataloader:
#     print(i['input_ids'].shape)
#     break

torch.Size([4, 1024]) torch.Size([4, 1024])
torch.Size([4, 1024]) torch.Size([4, 1024])


In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = AutoConfig.from_pretrained(model_checkpoint, tie_word_embeddings=False)
model = AutoModelForCausalLM.from_config(config)
# model.resize_token_embeddings(len(tokenizer))

In [15]:
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
weight_decay = 0
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
# params=model.parameters()
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=1e-4)

## Accelator

In [16]:
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

from transformers import get_scheduler
import math
gradient_accumulation_steps = 1
num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / gradient_accumulation_steps
    )
num_train_epochs = 10
max_train_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

total_batch_size = (
        per_device_train_batch_size
        * accelerator.num_processes
        * gradient_accumulation_steps
    )

## The biggest differences compared to Opacus are:

- We require the per-example loss (a 1-D tensor) be passed into `optimizer.step` (or `optimizer.virtual_step`).
- The per-example loss must be passed in as a keyword argument.
- `loss.backward()` shouldn't be called on the user end; it's called internally in `optimizer.step` ( or `optimizer.virtual_step`).
- Inputs should be in batch-first format; there isn't a toggle to switch between different formats in the engine.

## Ghost clipping: memory saving differentially private learning
Turning on ghost clipping requires changing only 1 line. You should notice a drastic reduction in peak GPU memory usage once this is turned on, at a potential cost of slower training speed. One might find this especially useful when constrained to only use older GPUs with small VRAMs or fitting super large models.



In [17]:
import transformers, torch
from private_transformers import PrivacyEngine
dp = True
if dp == True:
    privacy_engine = PrivacyEngine(
        model,
        batch_size=per_device_train_batch_size,
        sample_size=len(lm_datasets['train']),
        epochs=1,
        max_grad_norm=0.1,
        target_epsilon=3,
        clipping_mode="ghost",  # The only change you need to make!
    )
    privacy_engine.attach(optimizer)
else :
    privacy_engine = None

In [18]:
privacy_engine

PrivacyEngine(
  target_epsilon=3.000000, 
  target_delta=0.000199, 
  noise_multiplier=0.530371, 
  effective_noise_multiplier=0.132593, 
  epochs=1, 
  max_grad_norm=0.1, 
  sample_rate=0.001725625539257981, 
  batch_size=4, 
  accounting_mode=rdp, 
  clipping_mode=ghost
)

In [19]:
delta = 1.0/42061 # We instead use the accountant from Gopi et al. (2021) as described in the paper.

## Examples
Code in the examples folder roughly reproduces our results for the table-to-text and classification tasks. There may be some minor discrepancies, since hyperparameters there aren't exactly what's used in the paper. Nevertheless, it should be sufficient to get things started. Detailed instructions are in the readme file of each subfolder.

In [None]:
output_dir = "./savemodel/"
# Only show the progress bar once on each machine.
progress_bar = tqdm(
    range(max_train_steps), disable=not accelerator.is_local_main_process
)
completed_steps = 0
best_val_perplexity = float("inf")

for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        loss = loss.reshape(-1)
        # accelerator.backward(loss)
        if (
            step % gradient_accumulation_steps == 0
            or step == len(train_dataloader) - 1
        ):
            # Perform one optimization step with the PrivacyEngine
            optimizer.step(loss=loss)
            lr_scheduler.step()
            # optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= max_train_steps:
            break

    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(
            accelerator.gather(loss.repeat(per_device_eval_batch_size))
        )

    losses = torch.cat(losses)
    losses = losses[: len(small_eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    # logger.info(f"epoch {epoch}: perplexity: {perplexity}")
    print(f"epoch {epoch}: perplexity: {perplexity}")

    # Printing epsilon from opacus privacy engine at the end of each epoch
    eps, alpha = optimizer.privacy_engine.get_privacy_spent(delta)
    print("End of epoch {}, we have epsilon {} for alpha {}".format(epoch, eps, alpha))

    # if perplexity < best_val_perplexity and output_dir is not None:
    #     best_val_perplexity = perplexity
    #     accelerator.wait_for_everyone()
    #     unwrapped_model = accelerator.unwrap_model(model)
    #     unwrapped_model.save_pretrained(
    #         output_dir, save_function=accelerator.save
    #     )
        # logger.info(
        #     f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}"
        # )
        # print(f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}")
        # tokenizer.save_pretrained(output_dir)
        # if accelerator.is_main_process:
        #     # tokenizer.save_pretrained(output_dir)
        #     if push_to_hub:
        #         repo.push_to_hub(
        #             commit_message="Best val perplexity", auto_lfs_prune=True
        #         )

    # if push_to_hub and epoch < num_train_epochs - 1:
    #     accelerator.wait_for_everyone()
    #     unwrapped_model = accelerator.unwrap_model(model)
    #     unwrapped_model.save_pretrained(
    #         output_dir, save_function=accelerator.save
    #     )
    #     if accelerator.is_main_process:
    #         tokenizer.save_pretrained(output_dir)
    #         repo.push_to_hub(
    #             commit_message=f"Training in progress epoch {epoch}",
    #             blocking=False,
    #             auto_lfs_prune=True,
    #         )

    # if epoch == (num_train_epochs - 1):
    #     save_fir = output_dir + f"_epoch_{num_train_epochs - 1}"
    #     accelerator.wait_for_everyone()
    #     unwrapped_model = accelerator.unwrap_model(model)
    #     unwrapped_model.save_pretrained(save_fir, save_function=accelerator.save)
    #     tokenizer.save_pretrained(save_fir)

  9%|▉         | 514/5800 [03:44<38:17,  2.30it/s]