In [2]:
%pip install --disable-pip-version-check torch==1.13.1 torchdata==0.5.1

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
!nvidia-smi

Sun Apr 30 06:00:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:10:1C.0 Off |                    0 |
| N/A   28C    P0    49W / 400W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-SXM...  Off  | 00000000:10:1D.0 Off |                    0 |
| N/A   28C    P0    49W / 400W |      0MiB / 40536MiB |      0%      Default |
|       

## _==> Please ignore all WARNINGs and ERRORs from the `pip install`'s above. <==_

# Hugging Face Transformers
## _==> Please ignore all WARNINGs and ERRORs from the `pip install`'s below. <==_

In [4]:
%pip install --disable-pip-version-check -Uq \
    transformers==4.27.2 \
    datasets==2.9.0 \
    accelerate \
    promptsource==0.2.3 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 

[0mNote: you may need to restart the kernel to use updated packages.


In [5]:
%pip install git+https://github.com/huggingface/peft.git

Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-pcx1vzq4
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-pcx1vzq4
  Resolved https://github.com/huggingface/peft.git to commit 632997d1fb776c3cf05d8c2537ac9a98a7ce9435
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install git+https://github.com/lvwerra/trl.git

Collecting git+https://github.com/lvwerra/trl.git
  Cloning https://github.com/lvwerra/trl.git to /tmp/pip-req-build-rerxcom2
  Running command git clone --filter=blob:none --quiet https://github.com/lvwerra/trl.git /tmp/pip-req-build-rerxcom2
  Resolved https://github.com/lvwerra/trl.git to commit 08f550674c553c36c51d1027613c29f14f3676a5
  Preparing metadata (setup.py) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import argparse
import os

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model #, prepare_model_for_int8_training
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed

In [8]:
def get_prompt_with_response(example):
    text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}"
    return text

def get_prompt_from_question(question):
    text = f"Question: {question}\n\nAnswer: "
    return text

def get_prompt_without_response(example):
    text = f"Question: {example['question']}\n\nAnswer: "
    return text

def get_response_without_prompt(example):
    return example['response_j']


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = get_prompt_with_response(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

    
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id else eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(get_prompt_with_response(next(iterator)))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

# Fine-Tune flan-t5 on stack exchange paired dataset

In [9]:
model_path="google/flan-t5-xxl"
dataset_name="lvwerra/stack-exchange-paired"
subset="data/finetune"
split="train"
size_valid_set=4000
streaming=False
shuffle_buffer=5000
seq_length=1024
max_steps=500 # was 5000
batch_size=2 # was 16
gradient_accumulation_steps=1
eos_token_id=49152
learning_rate=1e-5
lr_scheduler_type="cosine"
num_warmup_steps=50 # was 100
weight_decay=0.05
local_rank=0
no_fp16=True
bf16=True
no_gradient_checkpointing=True
seed=42
num_workers=None
log_freq=100
eval_freq=100 # was 1000
save_freq=100 # was 1000

In [10]:
set_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(model_path)

print('Dataset name: {}'.format(dataset_name))
dataset = load_dataset(
    dataset_name,
    data_dir=subset,
    split=split,
    num_proc=num_workers if not streaming else None,
    streaming=streaming,
)

print(dataset)
if streaming:
    print("Loading the dataset in streaming mode")
    valid_data = dataset.take(size_valid_set)
    train_data = dataset.skip(size_valid_set)
    train_data = train_data.shuffle(buffer_size=shuffle_buffer, seed=seed)
else:
    dataset = dataset.train_test_split(test_size=0.005, seed=seed)
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

chars_per_token = chars_token_ratio(train_data, tokenizer)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

train_dataset = ConstantLengthDataset(
    tokenizer,
    train_data,
    infinite=True,
    seq_length=seq_length,
    chars_per_token=chars_per_token,
)
validation_dataset = ConstantLengthDataset(
    tokenizer,
    valid_data,
    infinite=False,
    seq_length=seq_length,
    chars_per_token=chars_per_token,
)

print(train_dataset)
print(validation_dataset)

Dataset name: lvwerra/stack-exchange-paired


Using custom data configuration lvwerra--stack-exchange-paired-4e8354da9bd1bd3d
Found cached dataset parquet (/root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached split indices for dataset at /root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-54456b7e473dd794.arrow and /root/.cache/huggingface/datasets/lvwerra___parquet/lvwerra--stack-exchange-paired-4e8354da9bd1bd3d/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8304e884db3f8548.arrow


Dataset({
    features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
    num_rows: 7440923
})
Size of the train set: 7403718. Size of the validation set: 37205


  0%|          | 0/400 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:00<00:00, 641.40it/s]

The character to token ratio of the dataset is: 3.08
<__main__.ConstantLengthDataset object at 0x7fd628c1e590>
<__main__.ConstantLengthDataset object at 0x7fd3e8a399d0>





# Fine-tune Model with PEFT

In [11]:
# disable caching mechanism when using gradient checkpointing
# Flan-T5
base_model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-xxl',
                                                   use_cache=not no_gradient_checkpointing,
                                                   #torch_dtype=torch.float16
                                                  )
lora_config = LoraConfig(
    r=4, # rank
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM", # CAUSAL_LM
)

peft_model = get_peft_model(base_model, lora_config)
print_trainable_parameters(peft_model)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

trainable params: 4718592 || all params: 11140050944 || trainable%: 0.042357005580314876


In [12]:
peft_fine_tuned_with_public_qanda_checkpoint='./peft_fine_tuned_with_public_qanda'

In [13]:
peft_training_args = TrainingArguments(
    output_dir=peft_fine_tuned_with_public_qanda_checkpoint,
    dataloader_drop_last=True,
#    evaluation_strategy="steps",    
    max_steps=max_steps,
#    eval_steps=eval_freq,
#    save_steps=save_freq,
    logging_steps=log_freq,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=num_warmup_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=not no_gradient_checkpointing,
    fp16=not no_fp16,
    bf16=bf16,
    weight_decay=weight_decay
)

peft_trainer = Trainer(model=peft_model, 
                  args=peft_training_args, 
                  train_dataset=train_dataset, 
                  #eval_dataset=validation_dataset
                )

print("Training...")
peft_trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 160.00 MiB (GPU 0; 39.59 GiB total capacity; 38.60 GiB already allocated; 56.19 MiB free; 38.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
print("Saving last checkpoint of the peft model")
#merged_model=peft_trainer.model.merge_and_unload()
peft_trainer.model.save_pretrained(peft_fine_tuned_with_public_qanda_checkpoint)
tokenizer.save_pretrained(peft_fine_tuned_with_public_qanda_checkpoint)

In [None]:
%store peft_fine_tuned_with_public_qanda_checkpoint

In [None]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(base_model, peft_fine_tuned_with_public_qanda_checkpoint)
peft_model_tokenizer = AutoTokenizer.from_pretrained(peft_fine_tuned_with_public_qanda_checkpoint)

# Qualitative results

In [None]:
question = 'Who won the 2022 World Cup?'

prompt = get_prompt_from_question(question)

inputs = peft_model_tokenizer(prompt, return_tensors='pt')
response = peft_model_tokenizer.decode(
    peft_model.generate(
       input_ids= inputs["input_ids"], 
       max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)
print(f'INPUT PROMPT:\n{prompt}{response}\n')

In [None]:
print('Dataset name: {}'.format(dataset_name))
dataset = load_dataset(
    dataset_name,
    data_dir='data/evaluation',
    split='train',
)

print(dataset)

In [None]:
prompts_and_responses = dataset \
    .select(range(10)).map(lambda example : {
        'prompt': get_prompt_without_response(example), 
        'response': get_response_without_prompt(example)
    })
prompts_and_responses[0]['question']
prompts_and_responses[0]['response_j']

In [None]:
for prompt_response in prompts_and_responses:
    prompt = prompt_response['prompt']
    inputs = peft_model_tokenizer(prompt, return_tensors='pt')

    response = peft_model_tokenizer.decode(
                peft_model.generate(
                    input_ids=inputs["input_ids"], 
                    max_new_tokens=200,
                    do_sample=True, 
                    top_k=50, 
                    top_p=0.9
                    )[0],
                    skip_special_tokens=True)

    print('PROMPT: {}'.format(prompt))
    print('RESPONSE: {}'.format(response))
    print('EXPECTED RESPONSE: {}'.format(prompt_response['response']))
    print('----')

# Quantitative Results with ROGUE Metric

The [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) helps quantify the validity of summarizations produced by models. It compares summarizations to a "baseline" summary which is usually created by a human. While not perfect, it does give an indication to the overall increase in summarization effectiveness that we have accomplished by fine-tuning.# ROUGE evaluation of summaries

In [None]:
import evaluate

rouge = evaluate.load('rouge')

In [None]:
prompts = prompts_and_responses['prompt']
human_baseline_summaries = prompts_and_responses['response']
human_baseline_summaries

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

pretrained_model_checkpoint='t5-xxl'
pretrained_model_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_checkpoint, use_fast=True)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_checkpoint)

instruct_model_checkpoint='google/flan-t5-xxl'
instruct_model_tokenizer = AutoTokenizer.from_pretrained(instruct_model_checkpoint, use_fast=True)
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(instruct_model_checkpoint)

In [None]:
from transformers import GenerationConfig

pretrained_model_summaries = []
instruct_model_summaries = []
#supervised_fine_tuned_model_summaries = []
peft_model_summaries = []

for idx, prompt in enumerate(prompts):
    input_ids = pretrained_model_tokenizer(prompt, return_tensors="pt").input_ids

    pretrained_model_outputs = pretrained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    pretrained_model_text_output = pretrained_model_tokenizer.decode(pretrained_model_outputs[0], skip_special_tokens=True)
    pretrained_model_summaries.append(pretrained_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = instruct_model_tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)
    
    # supervised_fine_tuned_model_outputs = supervised_fine_tuned_model_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    # supervised_fine_tuned_model_text_output = supervised_fine_tuned_model_model_tokenizer.decode(supervised_fine_tuned_model_outputs[0], skip_special_tokens=True)
    # supervised_fine_tuned_model_summaries.append(supervised_fine_tuned_model_model_text_output)
    
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = peft_model_tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)

In [None]:
pretrained_model_results = rouge.compute(
    predictions=pretrained_model_summaries,
    references=human_baseline_summaries[0:len(pretrained_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
pretrained_model_results

In [None]:
instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
instruct_model_results

In [None]:
# supervised_fine_tuned_model_results = rouge.compute(
#     predictions=supervised_fine_tuned_model_summaries,
#     references=human_baseline_summaries[0:len(supervised_fine_tuned_model_summaries)],
#     use_aggregator=True,
#     use_stemmer=True,
# )
# supervised_fine_tuned_model_results

In [None]:
peft_model_results = rouge.compute(
    predictions=peft_loaded_model_summaries,
    references=human_baseline_summaries[0:len(peft_loaded_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
peft_model_results