In [None]:
!pip install transformers[torch]
# After uninstalling accelerate, relaunch the runtime and then run this command.



In [None]:
!pip install evaluate



In [None]:
!pip install py7zr
from datasets import load_dataset

# Identifier of the dialogue-summarization dataset on the Hugging Face Hub.
dataset_id = "samsum"

# Load the dataset from the hub; the result is indexed by split name below.
dataset = load_dataset(dataset_id)

# Report how many examples each split holds
# (14732 train / 819 test in the recorded run).
for split_label, split_key in (("Train", "train"), ("Test", "test")):
    print(f"{split_label} dataset size: {len(dataset[split_key])}")


Train dataset size: 14732
Test dataset size: 819


In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint shared by the tokenizer and the seq2seq model.
model_name = "google/flan-t5-base"

# The tokenizer and the model are loaded independently from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are requested in bfloat16 via `torch_dtype`.
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

In [None]:
from random import randrange

# Pick one training example at random (unseeded, so it differs per run)
# to get a feel for the raw dialogue/summary pairs.
train_split = dataset["train"]
sample = train_split[randrange(len(train_split))]

# Print both text fields, each followed by a dashed separator line.
for field in ("dialogue", "summary"):
    print(f"{field}: \n{sample[field]}\n---------------")


dialogue: 
Donny: Hey
Donny: buddy!!
Marc: Oh bro
Marc: Where have you been!
Donny: Got engaged!
Marc: Whoa congrats!
Marc: How is Martha? 
Donny: Great, she's working. I am working at home
Marc: oh wow
Marc: What a power couple!
Donny: :3 
Donny: We're planning to come visit you this Friday 
Donny: You free? 
Marc: hell yea! 
Marc: Friday night with my boi
Donny: Cya then
---------------
summary: 
Donny got engaged to Martha. They will visit Marc this Friday.
---------------


In [None]:
from datasets import concatenate_datasets

# Measure token lengths over train + test together so the limits derived here
# cover both splits that are tokenized later.
combined_splits = concatenate_datasets([dataset["train"], dataset["test"]])
text_columns = ["dialogue", "summary"]

# Source side: longest tokenized dialogue. Longer inputs will be truncated to
# this length and shorter ones padded up to it during preprocessing.
# Tokenization here already uses truncation=True, so the reported value is the
# post-truncation maximum (512 in the recorded run).
tokenized_inputs = combined_splits.map(
    lambda batch: tokenizer(batch["dialogue"], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_source_length = max(len(token_ids) for token_ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Target side: longest tokenized summary, used the same way for the labels.
tokenized_targets = combined_splits.map(
    lambda batch: tokenizer(batch["summary"], truncation=True),
    batched=True,
    remove_columns=text_columns,
)
max_target_length = max(len(token_ids) for token_ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")


Max source length: 512


Map:   0%|          | 0/15551 [00:00<?, ? examples/s]

Max target length: 95


In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of dialogue/summary pairs for seq2seq training.

    Args:
        sample: Batch mapping with a "dialogue" list (model inputs) and a
            "summary" list (targets).
        padding: Padding strategy forwarded to the tokenizer. With the default
            "max_length", padded label positions are replaced by -100.

    Returns:
        The tokenizer output for the prefixed dialogues, extended with a
        "labels" entry holding the token ids of the summaries.

    Relies on the notebook-level `tokenizer`, `max_source_length` and
    `max_target_length`.
    """
    # T5-style task prefix in front of every dialogue.
    prefixed_dialogues = ["summarize: " + text for text in sample["dialogue"]]

    # Encode the inputs, truncated/padded according to max_source_length.
    model_inputs = tokenizer(
        prefixed_dialogues,
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )

    # Encode the targets through the `text_target` keyword argument.
    target_encoding = tokenizer(
        text_target=sample["summary"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )
    label_ids = target_encoding["input_ids"]

    # When padding to a fixed length, mask pad tokens with -100 so they are
    # ignored by the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in row]
            for row in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize every split in batches and drop the raw text/id columns so only
# the model-ready fields remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["dialogue", "summary", "id"],
)
tokenized_columns = list(tokenized_dataset['train'].features)
print(f"Keys of tokenized dataset: {tokenized_columns}")


Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


In [None]:
# Qualitative check: summarize one test dialogue with the loaded checkpoint
# and compare the result with the human-written reference summary.
index = 200
test_example = dataset['test'][index]
dialogue = test_example['dialogue']
summary = test_example['summary']

prompt = f"""
Summarize the following conversation

{dialogue}

Summary:
"""

# Keep the encoded prompt on the same device as the model so the cell also
# works when the model has been moved to an accelerator (e.g. when re-run
# after training).
inputs = tokenizer(prompt, return_tensors='pt').to(original_model.device)

# Inference only, so no gradients are needed. Unpacking the full encoding
# forwards the attention mask together with the input ids.
with torch.no_grad():
    generated_ids = original_model.generate(**inputs, max_new_tokens=200)

output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Show the result; previously `summary` and `output` were computed but never
# displayed.
dash_line = '-' * 100
print(dash_line)
print(f"BASELINE HUMAN SUMMARY:\n{summary}")
print(dash_line)
print(f"MODEL GENERATION:\n{output}")

In [None]:
!pip install peft



In [None]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings used to adapt the FLAN-T5 model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,  # FLAN-T5 is a sequence-to-sequence LM
    r=32,                             # LoRA rank
    lora_alpha=32,                    # LoRA scaling factor
    lora_dropout=0.05,
    target_modules=["q", "v"],        # presumably T5's attention query/value projections - confirm
    bias="none",
)

In [None]:
# Wrap the base model with the LoRA adapters described by `lora_config`.
# NOTE(review): per the PEFT documentation, get_peft_model modifies the given
# model in place, so `original_model` and `peft_model` share modules after this
# call - confirm before treating `original_model` as an untouched baseline.
peft_model = get_peft_model(model=original_model, peft_config=lora_config)

In [None]:
import time

from transformers import Trainer, TrainingArguments

# Timestamped run directory so repeated runs do not overwrite each other.
# NOTE(review): the /content/drive path presumably requires Google Drive to be
# mounted in Colab - confirm before running.
output_dir = f"/content/drive/MyDrive/LLM/models/peft-dialogue-summary-training-{int(time.time())}"

# `max_steps=1` takes precedence over `num_train_epochs`, so this run performs
# a single optimisation step (the recorded log shows only step 1).
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    auto_find_batch_size=True,
    num_train_epochs=1,
    max_steps=1,
    logging_steps=1,
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

peft_trainer.train()

# Save the trained adapter weights and the tokenizer side by side.
peft_model_path = "/content/drive/MyDrive/LLM/models/peft-dialogue-summary-checkpoint-local"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
1,1.8516


('/content/drive/MyDrive/LLM/models/peft-dialogue-summary-checkpoint-local/tokenizer_config.json',
 '/content/drive/MyDrive/LLM/models/peft-dialogue-summary-checkpoint-local/special_tokens_map.json',
 '/content/drive/MyDrive/LLM/models/peft-dialogue-summary-checkpoint-local/tokenizer.json')

In [None]:
!pip install rouge_score



In [None]:
!pip install pandas



In [None]:
import evaluate
import pandas as pd
import torch
from transformers import GenerationConfig

rouge = evaluate.load('rouge')

# Compare the models on the first ten test dialogues against the human summaries.
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

# Fix: the keyword was misspelled as `new_max_totokens`, so the intended
# 200-token generation budget was never applied.
generation_config = GenerationConfig(max_new_tokens=200)

# Run on whatever device the model currently lives on instead of hard-coding CUDA.
model_device = next(peft_model.parameters()).device

# Training leaves the model in train mode; switch to eval so dropout does not
# make the generated summaries non-deterministic.
peft_model.eval()

original_model_summaries = []
peft_model_summaries = []

for dialogue in dialogues:
    # Same prompt text as before (including its two-space indentation).
    prompt = f"\n  Summarize the following conversation.\n\n  {dialogue}\n\n  Summary:  "
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model_device)

    with torch.no_grad():
        # get_peft_model injects the LoRA layers into `original_model` in place,
        # so calling `original_model.generate` would still run the adapters.
        # Disabling the adapter yields the true base-model baseline.
        with peft_model.disable_adapter():
            original_model_outputs = peft_model.generate(
                input_ids=input_ids, generation_config=generation_config
            )
        peft_model_outputs = peft_model.generate(
            input_ids=input_ids, generation_config=generation_config
        )

    original_model_summaries.append(
        tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    )
    peft_model_summaries.append(
        tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    )

# One row per dialogue: reference summary, base-model summary, PEFT-model summary.
zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])

In [None]:
# Display the human, original-model and PEFT-model summaries side by side.
df

Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Hannah needs Betty's number but Amanda doesn't...,Amanda will ask Larry for Betty's number. Hann...,Hannah can't find Betty's number. Amanda can't...
1,Eric and Rob are going to watch a stand-up on ...,Eric and Rob are watching a funny video on you...,Eric and Rob are watching a train. Eric and Ro...
2,Lenny can't decide which trousers to buy. Bob ...,Lenny wants to buy purple trousers. Bob advise...,Lenny wants to buy two black trousers. Bob rec...
3,Emma will be home soon and she will let Will k...,Will and Emma are going to have dinner tonight.,Emma will be home soon. Will will pick Emma up.
4,Jane is in Warsaw. Ollie and Jane has a party....,Jane is in Warsaw. Ollie and Jane have lunch o...,Jane forgot about the party. Ollie and Jane ha...
5,Hilary has the keys to the apartment. Benjamin...,Benjamin and Hilary will meet at La Cantina at...,"Hilary, Elliot and Elliot are going to meet at..."
6,Payton provides Max with websites selling clot...,Payton is a fan of shopping. Payton usually bu...,Max will check out Payton's websites. Max reco...
7,Rita and Tina are bored at work and have still...,Rita is tired. Rita isn't.,Rita is tired and is not able to work. Tina is...
8,"Beatrice wants to buy Leo a scarf, but he does...",Leo is in town shopping. He doesn't want a scarf.,Leo wants a scarf. Leo doesn't like them. Beat...
9,Eric doesn't know if his parents let him go to...,Eric is coming to the wedding. Eric has a lot ...,Eric is coming to Eric's brother's wedding. Er...


In [None]:
# Options shared by both ROUGE evaluations.
rouge_options = dict(use_aggregator=True, use_stemmer=True)

# Score each model's summaries against the matching number of human references
# (original model first, then the PEFT model).
original_model_results, peft_model_results = (
    rouge.compute(
        predictions=candidate_summaries,
        references=human_baseline_summaries[0:len(candidate_summaries)],
        **rouge_options,
    )
    for candidate_summaries in (original_model_summaries, peft_model_summaries)
)

# Print each score dictionary under its heading.
for heading, scores in (
    ('ORIGINAL MODEL:', original_model_results),
    ('PEFT MODEL:', peft_model_results),
):
    print(heading)
    print(scores)

ORIGINAL MODEL:
{'rouge1': 0.3407852775539434, 'rouge2': 0.1439583372151521, 'rougeL': 0.29589977623025354, 'rougeLsum': 0.291911766324251}
PEFT MODEL:
{'rouge1': 0.4241898228259924, 'rouge2': 0.19569801527696268, 'rougeL': 0.36557678835137064, 'rougeLsum': 0.3617395711774395}
