# **Low Rank Adaptation and Parameter Efficient Finetuning of HuggingFace Flan-T5 LLMs on Text Summarisation**

### Import Relevant Dependencies

In [1]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainer, 
    Seq2SeqTrainingArguments, 
    pipeline
)
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    TaskType,
    PeftModel,
    PeftConfig
)
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable

warnings.filterwarnings("ignore")

os.environ["LLM_REPOSITORY"] = "google/flan-t5-small"#"philschmid/flan-t5-xxl-sharded-fp16"
os.environ["TOKENIZER_REPOSITORY"] = "google/flan-t5-small"#"google/flan-t5-xxl"
os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "4096"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["TOKENS_DATA_PATH"] = F"data/doc_summary_{os.environ['TOKENIZER_REPOSITORY'].split('/')[-1]}_tokens"
os.environ["SUMMARY_DATA_PATH"] = "data/doc_summary_pair.json"
os.makedirs(os.environ["DATASET_PATH"], exist_ok=True)
os.makedirs(os.environ["TOKENS_DATA_PATH"], exist_ok=True)

## **DATA PREPARATION**

### Split Dataset into Training, Validation and Testing Sets

In [2]:
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
TEST_SIZE = 0.1

with open(os.environ["SUMMARY_DATA_PATH"]) as f:
    doc_summary_data = json.load(f)
f.close()

train_size = int(len(doc_summary_data) * TRAIN_SIZE)
val_size = int(len(doc_summary_data) * VALIDATION_SIZE)
test_size = int(len(doc_summary_data) * TEST_SIZE)

train_data = doc_summary_data[:train_size]
val_data = doc_summary_data[train_size:train_size+val_size]
test_data = doc_summary_data[train_size+val_size:]

data_list = [
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
]

for data_tuple in data_list:
    with open(os.path.join(os.environ["DATASET_PATH"], f"{data_tuple[0]}.json"), "w") as f:
        json.dump(data_tuple[1], f, indent=4)
    f.close()

del doc_summary_data, train_data, val_data, test_data, data_list, data_tuple, train_size, val_size, test_size
gc.collect()

119

### Load Dataset into DictDataset Format to be modelled by the HuggingFace LLM

In [3]:
dataset = load_dataset(path=os.environ["DATASET_PATH"])
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")
dataset["train"][random.randint(0, len(dataset["train"]))]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train dataset size: 4000
Validation dataset size: 500
Test dataset size: 500


{'document': "Crosse, Wisconsin, 54601, United States|Mayo Clinic Health System-Franciscan Healthcare, La Crosse, Wisconsin, 54601, United States|Dean Hematology and Oncology Clinic, Madison, Wisconsin, 53717, United States|Holy Family Memorial Hospital, Manitowoc, Wisconsin, 54221, United States|Aurora Bay Area Medical Group-Marinette, Marinette, Wisconsin, 54143, United States|Bay Area Medical Center, Marinette, Wisconsin, 54143, United States|Vince Lombardi Cancer Clinic-Marinette, Marinette, Wisconsin, 54143, United States|Marshfield Clinic, Marshfield, Wisconsin, 54449, United States|Aurora Cancer Care-Milwaukee, Milwaukee, Wisconsin, 53209, United States|Aurora Saint Luke's Medical Center, Milwaukee, Wisconsin, 53215, United States|Aurora Sinai Medical Center, Milwaukee, Wisconsin, 53233, United States|Marshfield Clinic-Minocqua Center, Minocqua, Wisconsin, 54548, United States|Green Bay Oncology - Oconto Falls, Oconto Falls, Wisconsin, 54154, United States|Vince Lombardi Cancer 

### Load Corresponding LLM Tokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained(
    os.environ["TOKENIZER_REPOSITORY"],
    model_max_length=int(os.environ["MAX_TOKENS"])
)

### Use Tokenizer Object to retreive the Maximum Source (Text) and Target (Summary) Tokens in the Data

In [5]:
concatenated_dataset = concatenate_datasets(
    [dataset["train"], dataset["validation"], dataset["test"]]
)
tokenized_inputs = concatenated_dataset.map(
    lambda x: tokenizer(x["document"], truncation=True), batched=True, remove_columns=["document", "summary"])

tokenized_targets = concatenated_dataset.map(
    lambda x: tokenizer(x["summary"], truncation=True), batched=True, remove_columns=["document", "summary"])

max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Max source length: 428
Max target length: 220


### Tokenize Dataset and Persist Tokens to Disk Memory

In [6]:
def preprocess_function(sample, max_source_length: int, max_target_length: int, padding: str="max_length"):
    inputs = [f"summarize this document: {item}"  for item in sample["document"]]

    model_inputs = tokenizer(
        inputs, 
        max_length=max_source_length, 
        padding=padding, 
        truncation=True,
    )
    labels = tokenizer(
        text_target=sample["summary"], 
        max_length=max_target_length,
        padding=padding, 
        truncation=True,
    )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else tokenizer.pad_token_id) for l in label] for label in labels["input_ids"]
        ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

preprocess_lambda = lambda dataset : preprocess_function(dataset, max_source_length, max_target_length)
tokenized_dataset = dataset.map(preprocess_lambda, batched=True, remove_columns=["document", "summary", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# save datasets to disk for later easy loading
tokenized_dataset["train"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "train"))
tokenized_dataset["validation"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "validation"))
tokenized_dataset["test"].save_to_disk(os.path.join(os.environ["TOKENS_DATA_PATH"], "test"))

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/4000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/500 [00:00<?, ? examples/s]

## **MODEL PREPARATION**

### Load 8bits quantized HuggingFace LLM to Memory

In [7]:
# define model
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=os.environ["LLM_REPOSITORY"],
    load_in_8bit=True,
    device_map="auto"
)

### Define Low Rank Adaptation Configurations Object and apply to Loaded LLM for Parameter Efficient Finetuning

In [8]:
# Define LoRA Config 
lora_config = LoraConfig(
 r=16, 
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862001038515747


### Define Data Collator Object

In [9]:
# we want to ignore tokenizer pad token in the loss
label_pad_token_id = tokenizer.pad_token_id

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

## **MODEL FINETUNING / TRAINING**

### Define Seq2SeqTrainer Object and Commence LoRA Finetuning

In [10]:
OUTPUT_DIR = F"lora-{os.environ['LLM_REPOSITORY'].split('/')[-1]}"
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
	auto_find_batch_size=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="no",
    report_to="tensorboard",
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)
model.config.use_cache = False  # to be set to True for inference

# finetune model
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.4081
1000,1.8583
1500,1.7828
2000,1.7459
2500,1.7225
3000,1.7116
3500,1.6937
4000,1.6822
4500,1.6727


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



### Persist LoRA Model Weights to Disk Memory

In [11]:
# Save our LoRA model & tokenizer results
PEFT_MODEL_ID=f"{os.environ['LLM_REPOSITORY'].split('/')[-1]}_finetuned_results"
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

('flan-t5-small_finetuned_results/tokenizer_config.json',
 'flan-t5-small_finetuned_results/special_tokens_map.json',
 'flan-t5-small_finetuned_results/spiece.model',
 'flan-t5-small_finetuned_results/added_tokens.json',
 'flan-t5-small_finetuned_results/tokenizer.json')

## **MODEL EVALUATION**

### Load LoRA Weights from Disk to Perform Inference on Test Dataset

In [12]:
# Load peft config for pre-trained checkpoint etc. 
config = PeftConfig.from_pretrained(PEFT_MODEL_ID)

# load base LLM model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path,  load_in_8bit=True,  device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, PEFT_MODEL_ID, device_map="auto")

### Use Loaded Model and Tokenizer to Instantiate a Langchain HuggingFacePipeline object

In [13]:
# switch model to eval mode
model.eval()

# define model pipeline
hgf_pipeline = pipeline(
    task="text2text-generation", 
    model=model, 
    tokenizer=tokenizer,
    temperature=0.1, 
    max_length=int(os.environ["MAX_TOKENS"]),
    top_p=0.15,
    top_k=0,
    repetition_penalty=1.1,
)

llm = HuggingFacePipeline(pipeline=hgf_pipeline)

The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


### Generating Summaries from Documents in Test Data

In [14]:
summary_chain = load_summarize_chain(llm, chain_type="map_reduce")

# summarise first 5 documents in the testing data
predicted_summaries = []
n_docs = 5
for i, document in enumerate(dataset["test"]["document"][:n_docs]):
    document = Document(page_content=document)
    summary = summary_chain.run([document])
    print(f"Document: {document}\n")
    print(f"SUMARY: {summary}\n\n")
    predicted_summaries.append(summary)

Token indices sequence length is longer than the specified maximum sequence length for this model (1938 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1940 > 512). Running this sequence through the model will result in indexing errors


Document: page_content="States|Helen F Graham Cancer Center, Newark, Delaware, 19713, United States|University of Florida Health Science Center - Gainesville, Gainesville, Florida, 32610, United States|Memorial Regional Hospital/Joe DiMaggio Children's Hospital, Hollywood, Florida, 33021, United States|Mayo Clinic in Florida, Jacksonville, Florida, 32224-9980, United States|Miami Cancer Institute, Miami, Florida, 33176, United States|Orlando Health Cancer Institute, Orlando, Florida, 32806, United States|Memorial Hospital West, Pembroke Pines, Florida, 33028, United States|Emory University Hospital Midtown, Atlanta, Georgia, 30308, United States|Piedmont Hospital, Atlanta, Georgia, 30309, United States|Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Emory Saint Joseph's Hospital, Atlanta, Georgia, 30342, United States|John B Amos Cancer Center, Columbus, Georgia, 31904, United States|CTCA at Southeastern Regional Medical Center, Newnan, Georgi

## **PERFORMANCE MEASUREMENT**

### Compare Generated Summaries to Target Summaries with the Rouge Score and the Cosine Similarity Metric

In [15]:
rouge = Rouge()
embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"])
embeddings_model.to(os.environ["DEVICE"])
target_summaries = dataset["test"]["summary"][:n_docs]

for i, (predicted_summary, target_summary) in enumerate(zip(predicted_summaries, target_summaries)):
    pred_embeddings, target_embeddings = (
        embeddings_model.encode(predicted_summary).reshape(1, -1),
        embeddings_model.encode(target_summary).reshape(1, -1)
    )
    cos_similarity = cosine_similarity(target_embeddings, pred_embeddings)
    rouge_scores = rouge.get_scores(predicted_summary, target_summary)
    print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
    print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n\n")


Cosine similarity for summary 1: 0.8069508 

Rouge scores for summary 1: {'rouge-1': {'r': 0.52, 'p': 0.7222222222222222, 'f': 0.6046511579232018}, 'rouge-2': {'r': 0.35714285714285715, 'p': 0.5263157894736842, 'f': 0.42553191007695795}, 'rouge-l': {'r': 0.48, 'p': 0.6666666666666666, 'f': 0.5581395300162251}} 


Cosine similarity for summary 2: 0.6042143 

Rouge scores for summary 2: {'rouge-1': {'r': 0.2857142857142857, 'p': 0.35294117647058826, 'f': 0.3157894687396123}, 'rouge-2': {'r': 0.18181818181818182, 'p': 0.2222222222222222, 'f': 0.1999999950500001}, 'rouge-l': {'r': 0.23809523809523808, 'p': 0.29411764705882354, 'f': 0.26315788979224386}} 


Cosine similarity for summary 3: 0.69637465 

Rouge scores for summary 3: {'rouge-1': {'r': 0.45, 'p': 0.6, 'f': 0.5142857093877552}, 'rouge-2': {'r': 0.2631578947368421, 'p': 0.3125, 'f': 0.28571428075102046}, 'rouge-l': {'r': 0.45, 'p': 0.6, 'f': 0.5142857093877552}} 


Cosine similarity for summary 4: 0.799549 

Rouge scores for summa