In [1]:
pip install datasets evaluate rouge_score py7zr -q accelerate peft bitsandbytes transformers[torch] trl

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tensorboardX

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install datasets scipy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install transformers[torch] accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/trl.git@7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Cloning https://github.com/huggingface/trl.git (to revision 7630f877f91c556d9e5a3baa4b6e2894d90ff84c) to /tmp/pip-req-build-e1tiumpi
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /tmp/pip-req-build-e1tiumpi
  Running command git rev-parse -q --verify 'sha^7630f877f91c556d9e5a3baa4b6e2894d90ff84c'
  Running command git fetch -q https://github.com/huggingface/trl.git 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Running command git checkout -q 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Resolved https://github.com/huggingface/trl.git to commit 7630f877f91c556d9e5a3baa4b6e2894d90ff84c
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to resta

In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
import re
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset, load_metric
from transformers import DataCollatorForSeq2Seq, pipeline
from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer,BitsAndBytesConfig,
                          HfArgumentParser,TrainingArguments,Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer)
import nltk
nltk.download('punkt')
from peft import LoraConfig, PeftModel, TaskType, get_peft_model
from trl import SFTTrainer

[nltk_data] Downloading package punkt to /home/exouser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
checkpoint = 'facebook/bart-large-cnn'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [36]:
summarizer = pipeline('summarization', model = model, tokenizer = tokenizer, device = device)

bart large cnn has 406M parameters

In [10]:
dataset = load_dataset("samsum",trust_remote_code=True)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [12]:
def preprocess_function(examples):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in examples['dialogue']]
    examples['input_ids'] = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt', max_length = 1024).input_ids             # 'pt' for pytorch tensor
    examples['labels'] = tokenizer(examples['summary'], padding=True, truncation=True, return_tensors='pt', max_length = 1024).input_ids

    return examples

In [13]:
tokenizer.pad_token = tokenizer.eos_token
tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'dialogue', 'summary'])


In [14]:
#tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 10 == 0, with_indices=True)


In [15]:
metric = load_metric('rouge') # Loading ROUGE Score

  metric = load_metric('rouge') # Loading ROUGE Score


In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred# Obtaining predictions and true labels

    # Decoding predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Obtaining the true labels tokens, while eliminating any possible masked token (i.e., label = -100)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]


    # Computing rouge score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()} # Extracting some results

    # Add mean-generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [17]:
lora_config = LoraConfig(
    r=8,                       # 8, 16, 32    # the rank of the update matrices
    lora_alpha=32,                             # LoRA scaling factor
    lora_dropout=0.05,
    bias='none',                               # specifies if the bias parameters should be trained
    task_type=TaskType.SEQ_2_SEQ_LM,           # telling lora that this is a sq2seq modeling task
)

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [19]:
peft_model = get_peft_model(model, peft_config=lora_config)

In [20]:
peft_training_args = Seq2SeqTrainingArguments(
    output_dir="./mode_tuned_peft",           # local directory
    learning_rate=1e-5,
    num_train_epochs=5,      ## for 5 epochs took around 10 minutes
    weight_decay=0.01,
    auto_find_batch_size=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=True,
    predict_with_generate=True,
    logging_steps=10
)

peft_trainer = Seq2SeqTrainer(
    model=peft_model,                    # model to be fine-tuned
    args=peft_training_args,                       # training arguments
    train_dataset=tokenized_datasets['train'],          # train data to use
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [21]:
peft_model.print_trainable_parameters()

trainable params: 1,179,648 || all params: 407,470,080 || trainable%: 0.28950542822677927


In [22]:
peft_trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.5144,0.549352,39.171,19.3682,29.7493,36.2965,58.6773
2,0.5694,0.535719,40.1584,20.0293,30.5791,37.1361,58.0648
3,0.3497,0.527741,41.0391,20.7891,31.5244,38.1502,58.0086
4,0.3444,0.525459,40.6698,20.543,31.2399,37.8126,58.3716
5,0.3495,0.521951,40.6099,20.4138,31.1095,37.6804,58.11


NaN or Inf found in input tensor.


TrainOutput(global_step=73660, training_loss=0.5554840813277372, metrics={'train_runtime': 10829.063, 'train_samples_per_second': 6.802, 'train_steps_per_second': 6.802, 'total_flos': 1.2685061248548864e+17, 'train_loss': 0.5554840813277372, 'epoch': 5.0})

In [23]:
test_score = peft_trainer.evaluate(eval_dataset = tokenized_datasets['test'])
print(validation)

{'eval_loss': 0.5219513773918152, 'eval_rouge1': 40.6099, 'eval_rouge2': 20.4138, 'eval_rougeL': 31.1095, 'eval_rougeLsum': 37.6804, 'eval_gen_len': 58.11, 'eval_runtime': 727.0199, 'eval_samples_per_second': 1.125, 'eval_steps_per_second': 1.125, 'epoch': 5.0}


In [26]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
peft_trainer.push_to_hub(commit_message = 'bart-large-finetuned-Samsum-DH')

events.out.tfevents.1720702823.informally-glorious-horse:   0%|          | 0.00/36.8k [00:00<?, ?B/s]

events.out.tfevents.1720703919.informally-glorious-horse:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

events.out.tfevents.1720703973.informally-glorious-horse:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

events.out.tfevents.1720703547.informally-glorious-horse:   0%|          | 0.00/37.8k [00:00<?, ?B/s]

events.out.tfevents.1720715530.informally-glorious-horse:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/divyahegde07/mode_tuned_peft/commit/df1f4655fd0e09a0c2d5a6ca6176a31e5f088155', commit_message='bart-large-finetuned-Samsum-DH', commit_description='', oid='df1f4655fd0e09a0c2d5a6ca6176a31e5f088155', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
finetuned_model = 'divyahegde07/mode_tuned_peft'

In [29]:
summarizer_finetuned = pipeline('summarization', model = finetuned_model)

adapter_config.json:   0%|          | 0.00/448 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [53]:
def generate_summary(input, llm):
    """Prepare prompt  -->  tokenize -->  generate output using LLM  -->  detokenize output"""

    input_prompt = f"""
                    Summarize the following conversation.

                    {input}

                    Summary:
                    """

    input_ids = tokenizer(input_prompt, return_tensors='pt')
    tokenized_output = llm.generate(input_ids=input_ids['input_ids'], min_length=30, max_length=200, )
    output = tokenizer.decode(tokenized_output[0], skip_special_tokens=True)

    return output

In [55]:
pip install --upgrade peft

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 KB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: peft
  Attempting uninstall: peft
    Found existing installation: peft 0.4.0
    Uninstalling peft-0.4.0:
      Successfully uninstalled peft-0.4.0
Successfully installed peft-0.11.1
Note: you may need to restart the kernel to use updated packages.


In [73]:
model.save_pretrained('./fine_tuned_model')
tokenizer.save_pretrained('./fine_tuned_model')

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [79]:
import peft
from peft import get_peft_model, PeftConfig,PeftModelForSeq2SeqLM, AutoPeftModel,AutoPeftModelForSeq2SeqLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



# Load tokenizer
tokenizer_ft= AutoTokenizer.from_pretrained('./mode_tuned_peft')

# Load Base model
#peft_model_base = AutoModelForSeq2SeqLM.from_pretrained(finetuned_model)

# Load PEFT model
#model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

loaded_peft_model = AutoPeftModelForSeq2SeqLM.from_pretrained('./mode_tuned_peft')

In [80]:
sample = dataset['test'][0]['dialogue']
label = dataset['test'][0]['summary']

output = generate_summary(sample, llm=loaded_peft_model)

print("Sample")
print(sample)
print("-------------------")
print("Summary:")
print(output)
print("Ground Truth Summary:")
print(label)
     

Sample
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Summary:
Amanda can't find Betty's number. She asks Hannah to text Larry, who called Betty the last time they were at the park together.
Ground Truth Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.
