In [1]:
#comment this if you are not using AIT proxy...
import os
os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

In [2]:
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_scheduler,
    set_seed,
)
from tqdm.auto import tqdm

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from io import open
import torch
import json
from glob import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [4]:
import argparse
import logging
import math
import os
import random
from itertools import chain

## Preprocessing the datasets.

In [5]:
from accelerate import Accelerator

accelerator = Accelerator()

In [6]:
model_checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# PAD_TOKEN = '<pad>'
# tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
# tokenizer

In [7]:
class Wikitext_Dataset:
    def __init__(self, path):
        self.train = os.path.join(path, 'train/train.txt')
        self.valid = os.path.join(path, 'valid/valid.txt')
        self.test  = os.path.join(path, 'test/test.txt')

    def build_corpus(self, path):
        files = open(path,'r')
        lines = []
        for line in files:
            line = line.strip().lower()
            if len(line) == 0:
                continue
            lines.append(line)
        return lines
path_files = './data/wikitext-2-add10b'
corpus = Wikitext_Dataset(path_files)
train_dataset = corpus.build_corpus(corpus.train)
valid_dataset = corpus.build_corpus(corpus.valid)
test_dataset  = corpus.build_corpus(corpus.test)

In [8]:
from datasets import Dataset
from datasets import DatasetDict
import pandas as pd

raw_datasets_train = Dataset.from_pandas(pd.DataFrame(data = {'text': train_dataset}))
raw_datasets_valid = Dataset.from_pandas(pd.DataFrame(data = {'text': valid_dataset}))
raw_datasets_test  = Dataset.from_pandas(pd.DataFrame(data = {'text': test_dataset}))
#remove .shuffle if you want to train the whole dataset....

raw_datasets = DatasetDict(
    {
        'train':raw_datasets_train,
        'validation':raw_datasets_valid,
        'test':raw_datasets_test
    }
)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23777
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2891
    })
})

In [9]:
# First we tokenize all the texts.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]

def tokenize_function(examples):
    return tokenizer(examples[text_column_name])

# def tokenize_function(example):
#     outputs =  tokenizer(example[text_column_name], truncation=True, padding='max_length')
#     input_batch = []
#     for input_ids in outputs["input_ids"]:
#         input_batch.append(input_ids)
#     return {"input_ids": input_batch}


preprocessing_num_workers = None
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=preprocessing_num_workers,
        remove_columns=column_names,
        desc="Running tokenizer on dataset",
    )

tokenized_datasets

                                                                                             

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 23777
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2891
    })
})

In [10]:
block_size = 1024
if block_size is None:
    block_size = tokenizer.model_max_length
    if block_size > 1024:
        # logger.warning(
        #     f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
        #     "Picking 1024 instead. You can change that default value by passing --block_size xxx."
        # )
        block_size = 1024
else:
    if block_size > tokenizer.model_max_length:
        # logger.warning(
        #     f"The block_size passed ({args.block_size}) is larger than the maximum length for the model"
        #     f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
        # )
        block_size = min(block_size, tokenizer.model_max_length)
    
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [11]:
# # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
# # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
# # to preprocess.
# #
# # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
preprocessing_num_workers = 1
with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=preprocessing_num_workers,
        desc=f"Grouping texts in chunks of {block_size}",
    )
lm_datasets.set_format("torch")
lm_datasets

                                                                                                 

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2405
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 255
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 290
    })
})

In [31]:
small_train_dataset = lm_datasets["train"].shuffle(seed=55) #.select(range(10))
small_eval_dataset = lm_datasets["validation"].shuffle(seed=55)
small_test_dataset = lm_datasets["test"].shuffle(seed=55)

In [32]:
from torch.utils.data import DataLoader
per_device_train_batch_size = 16
per_device_eval_batch_size = 16

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=per_device_train_batch_size, pin_memory=True)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=per_device_eval_batch_size, pin_memory=True)
test_dataloader = DataLoader(small_test_dataset, batch_size=per_device_eval_batch_size)

In [34]:
#checking chucking
for i in train_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break
for i in eval_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break
for i in test_dataloader:
    print(i['input_ids'].shape, i['labels'].shape)
    break

torch.Size([16, 1024]) torch.Size([16, 1024])
torch.Size([16, 1024]) torch.Size([16, 1024])
torch.Size([16, 1024]) torch.Size([16, 1024])


In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = AutoConfig.from_pretrained(model_checkpoint, tie_word_embeddings=False)
model = AutoModelForCausalLM.from_config(config)
# model.resize_token_embeddings(len(tokenizer))

In [16]:
# model.config

In [17]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [18]:
# Optimizer
# Split weights in two groups, one with weight decay and the other not.
no_decay = ["bias", "LayerNorm.weight"]
weight_decay = 0
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
# params=model.parameters()
optimizer = torch.optim.Adam(optimizer_grouped_parameters, lr=1e-4)

## Accelator

In [19]:
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

from transformers import get_scheduler
import math
gradient_accumulation_steps = 1
num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / gradient_accumulation_steps
    )
num_train_epochs = 10
max_train_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=max_train_steps,
)

total_batch_size = (
        per_device_train_batch_size
        * accelerator.num_processes
        * gradient_accumulation_steps
    )

## Ghost clipping: memory saving differentially private learning
Turning on ghost clipping requires changing only 1 line. You should notice a drastic reduction in peak GPU memory usage once this is turned on, at a potential cost of slower training speed. One might find this especially useful when constrained to only use older GPUs with small VRAMs or fitting super large models.



In [20]:
import transformers, torch
from private_transformers import PrivacyEngine
dp = True
if dp == True:
    privacy_engine = PrivacyEngine(
        model,
        batch_size=per_device_train_batch_size,
        sample_size=len(lm_datasets['train']),
        epochs=1,
        max_grad_norm=0.1,
        target_epsilon=3,
        clipping_mode="ghost",  # The only change you need to make!
    )
    privacy_engine.attach(optimizer)
else :
    privacy_engine = None

In [21]:
privacy_engine

PrivacyEngine(
  target_epsilon=3.000000, 
  target_delta=0.000191, 
  noise_multiplier=0.607495, 
  effective_noise_multiplier=0.037968, 
  epochs=1, 
  max_grad_norm=0.1, 
  sample_rate=0.006652806652806653, 
  batch_size=16, 
  accounting_mode=rdp, 
  clipping_mode=ghost
)

In [22]:
delta = 1.0/42061 # We instead use the accountant from Gopi et al. (2021) as described in the paper.

In [23]:
def train(model, train_dataloader, optimizer):
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        loss = loss.reshape(-1)
        # accelerator.backward(loss)
        if (
            step % gradient_accumulation_steps == 0
            or step == len(train_dataloader) - 1
        ):
            # Perform one optimization step with the PrivacyEngine
            optimizer.step(loss=loss)
            lr_scheduler.step()
            # optimizer.zero_grad()
            # progress_bar.update(1)
            # completed_steps += 1

        # if completed_steps >= max_train_steps:
        #     break

In [24]:
def evaluate(model, eval_dataloader):
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(
            accelerator.gather(loss.repeat(per_device_eval_batch_size))
        )

    losses = torch.cat(losses)
    losses = losses[: len(small_eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")
    return perplexity

In [27]:
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, restore_file=None):
    save_path = f'models/{model.__class__.__name__}_add10b.pt'
    # Only show the progress bar once on each machine.
    # progress_bar = tqdm(
    #     range(max_train_steps), disable=not accelerator.is_local_main_process
    # )
    # completed_steps = 0
    best_val_perplexity = float("inf")
    
    for epoch in range(num_train_epochs):
        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader, optimizer)
        
        # Evaluate for one epoch on validation set
        perplexity = evaluate(model, eval_dataloader)

        # logger.info(f"epoch {epoch}: perplexity: {perplexity}")
        print(f"epoch {epoch}: perplexity: {perplexity}")
    
        # Printing epsilon from opacus privacy engine at the end of each epoch
        eps, alpha = optimizer.privacy_engine.get_privacy_spent(delta)
        print("End of epoch {}, we have epsilon {} for alpha {}".format(epoch, eps, alpha))
    
        if perplexity < best_val_perplexity and save_path is not None:
            best_val_perplexity = perplexity
            
            print(f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}")
            torch.save(model.state_dict(), save_path)

In [28]:
train_and_evaluate(model, train_dataloader, eval_dataloader, optimizer, restore_file=None)

100%|██████████| 151/151 [04:00<00:00,  1.59s/it]


epoch 0: perplexity: 914.3543896956497
End of epoch 0, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 0: perplexity: 914.3543896956497


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 1: perplexity: 779.2529355045938
End of epoch 1, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 1: perplexity: 779.2529355045938


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 2: perplexity: 703.8104114304201
End of epoch 2, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 2: perplexity: 703.8104114304201


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 3: perplexity: 654.5420514229319
End of epoch 3, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 3: perplexity: 654.5420514229319


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 4: perplexity: 622.3449791643961
End of epoch 4, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 4: perplexity: 622.3449791643961


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 5: perplexity: 600.2658993470692
End of epoch 5, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 5: perplexity: 600.2658993470692


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 6: perplexity: 585.3665072514357
End of epoch 6, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 6: perplexity: 585.3665072514357


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 7: perplexity: 576.9659874628726
End of epoch 7, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 7: perplexity: 576.9659874628726


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 8: perplexity: 574.1988961876625
End of epoch 8, we have epsilon eps_rdp for alpha alpha_rdp
saved model! epoch 8: perplexity: 574.1988961876625


100%|██████████| 151/151 [04:01<00:00,  1.60s/it]


epoch 9: perplexity: 574.1988961876625
End of epoch 9, we have epsilon eps_rdp for alpha alpha_rdp


In [None]:
# output_dir = "./savemodel/"
# save_path = f'models/{model.__class__.__name__}_add10b.pt'

# # Only show the progress bar once on each machine.
# progress_bar = tqdm(
#     range(max_train_steps), disable=not accelerator.is_local_main_process
# )
# completed_steps = 0
# best_val_perplexity = float("inf")

# for epoch in range(num_train_epochs):
#     model.train()
#     for step, batch in enumerate(train_dataloader):
#         optimizer.zero_grad()
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss = loss / gradient_accumulation_steps
#         loss = loss.reshape(-1)
#         # accelerator.backward(loss)
#         if (
#             step % gradient_accumulation_steps == 0
#             or step == len(train_dataloader) - 1
#         ):
#             # Perform one optimization step with the PrivacyEngine
#             optimizer.step(loss=loss)
#             lr_scheduler.step()
#             # optimizer.zero_grad()
#             progress_bar.update(1)
#             completed_steps += 1

#         if completed_steps >= max_train_steps:
#             break

#     model.eval()
#     losses = []
#     for step, batch in enumerate(eval_dataloader):
#         with torch.no_grad():
#             outputs = model(**batch)

#         loss = outputs.loss
#         losses.append(
#             accelerator.gather(loss.repeat(per_device_eval_batch_size))
#         )

#     losses = torch.cat(losses)
#     losses = losses[: len(small_eval_dataset)]
#     try:
#         perplexity = math.exp(torch.mean(losses))
#     except OverflowError:
#         perplexity = float("inf")

#     # logger.info(f"epoch {epoch}: perplexity: {perplexity}")
#     print(f"epoch {epoch}: perplexity: {perplexity}")

#     # Printing epsilon from opacus privacy engine at the end of each epoch
#     eps, alpha = optimizer.privacy_engine.get_privacy_spent(delta)
#     print("End of epoch {}, we have epsilon {} for alpha {}".format(epoch, eps, alpha))

#     if perplexity < best_val_perplexity and output_dir is not None:
#         best_val_perplexity = perplexity
#     #     accelerator.wait_for_everyone()
#     #     unwrapped_model = accelerator.unwrap_model(model)
#     #     unwrapped_model.save_pretrained(
#     #         output_dir, save_function=accelerator.save
#     #     )
#         # logger.info(
#         #     f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}"
#         # )
#         print(f"saved model! epoch {epoch}: perplexity: {best_val_perplexity}")
#         torch.save(model.state_dict(), save_path)
#         # tokenizer.save_pretrained(output_dir)
#         # if accelerator.is_main_process:
#         #     # tokenizer.save_pretrained(output_dir)
#         #     if push_to_hub:
#         #         repo.push_to_hub(
#         #             commit_message="Best val perplexity", auto_lfs_prune=True
#         #         )

#     # if push_to_hub and epoch < num_train_epochs - 1:
#     #     accelerator.wait_for_everyone()
#     #     unwrapped_model = accelerator.unwrap_model(model)
#     #     unwrapped_model.save_pretrained(
#     #         output_dir, save_function=accelerator.save
#     #     )
#     #     if accelerator.is_main_process:
#     #         tokenizer.save_pretrained(output_dir)
#     #         repo.push_to_hub(
#     #             commit_message=f"Training in progress epoch {epoch}",
#     #             blocking=False,
#     #             auto_lfs_prune=True,
#     #         )

#     # if epoch == (num_train_epochs - 1):
#     #     save_fir = output_dir + f"_epoch_{num_train_epochs - 1}"
#     #     accelerator.wait_for_everyone()
#     #     unwrapped_model = accelerator.unwrap_model(model)
#     #     unwrapped_model.save_pretrained(save_fir, save_function=accelerator.save)
#     #     tokenizer.save_pretrained(save_fir)

## Test

In [35]:
# save_path = f'models/{model.__class__.__name__}_add10b.pt'
# model.load_state_dict(torch.load(save_path,  map_location=device))
# perplexity = evaluate(model, test_dataloader)
# print(f'Test Perplexity: {perplexity}')

## Inference

In [2]:
import torch
from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
    AdamW,
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator
)
from itertools import chain

# Load the trained model
# model_path = 'dp-gpt2-clm-model.pth'
model_checkpoint = "distilgpt2"
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_config(config)

save_path = f'models/{model.__class__.__name__}_add10b.pt'
model.load_state_dict(torch.load(save_path))
model = model.eval()

In [3]:
# Set the device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set up the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [4]:
# input_ids = tokenizer.encode('My ID is ', return_tensors='pt').to(device)
# input_ids[0]

In [5]:
# log_interval = 10
# max_seq_len = 200
# temperature = 1

def generate(prompt, max_seq_len, temperature, model, tokenizer, device, seed=None):
    tokens = ""
    if seed is not None:
        torch.manual_seed(seed)
    input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
    
    with open('dp-distillgpt2-generated.txt', 'w') as output_files:
        model.eval()
        with torch.no_grad():  # no tracking history
            for i in range(max_seq_len):
                
                output = model(input_ids)
                word_weights = output[0].squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                input = torch.cat([input_ids, word_tensor], 1)
    
                word = tokenizer.decode(word_idx)
                tokens = tokens + word + ('\n' if i % 20 == 19 else '')
                output_files.write(word + ('\n' if i % 20 == 19 else ''))
    
                # if i % log_interval == 0:
                #     print('| Generated {}/{} words'.format(i, max_seq_len))
            # print(tokens)
    return tokens

In [6]:
prompt = 'my id is'
max_seq_len = 5
seed = 0
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, device, seed)
    print(f'{str(temperature)}\n{generation}\n')

0.5
 by.,..

0.7
 by. j..

0.75
 by. j..

0.8
 by. jihilation.

1.0
 Inv. jihilation was

