# **Low Rank Adaptation and Parameter Efficient Finetuning of HuggingFace Alpaca LLMs on Text Summarisation**

### Import Relevant Dependencies

In [None]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    LlamaTokenizer,
    LlamaForCausalLM,
    DataCollatorForSeq2Seq, 
    Trainer, 
    TrainingArguments, 
)
from datasets import load_dataset, concatenate_datasets
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    TaskType,
    PeftModel,
    PeftConfig
)
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable

# Suppress library warnings to keep the notebook output readable.
warnings.filterwarnings("ignore")

# Notebook-wide settings live in environment variables so that every cell
# reads the same repository names, limits and paths.
os.environ.update(
    LLM_REPOSITORY="chavinlo/alpaca-native",
    TOKENIZER_REPOSITORY="chavinlo/alpaca-native",
    EMBEDDINGS_MODEL="all-MiniLM-L12-v2",
    MAX_TOKENS="4096",
    DEVICE="cuda:0" if torch.cuda.is_available() else "cpu",
    DATASET_PATH="data/doc_summary_data",
    SUMMARY_DATA_PATH="data/doc_summary_pair.json",
)
# The token directory is named after the last path segment of the tokenizer repository.
os.environ["TOKENS_DATA_PATH"] = F"data/doc_summary_{os.environ['TOKENIZER_REPOSITORY'].split('/')[-1]}_tokens"

# Create both data directories up front; existing directories are left untouched.
os.makedirs(os.environ["DATASET_PATH"], exist_ok=True)
os.makedirs(os.environ["TOKENS_DATA_PATH"], exist_ok=True)

## **DATA PREPARATION**

### Split Dataset into Training, Validation and Testing Sets

In [None]:
# Fractions of the corpus assigned to each split. The test split receives every
# record left after the train and validation slices, so it also absorbs any
# rounding remainder; TEST_SIZE is kept to document the intended proportion.
TRAIN_SIZE = 0.8
VALIDATION_SIZE = 0.1
TEST_SIZE = 0.1

# The context manager closes the file on exit, so no explicit close() is needed.
# JSON is read as UTF-8 explicitly instead of relying on the platform default.
with open(os.environ["SUMMARY_DATA_PATH"], encoding="utf-8") as f:
    doc_summary_data = json.load(f)

train_size = int(len(doc_summary_data) * TRAIN_SIZE)
val_size = int(len(doc_summary_data) * VALIDATION_SIZE)

# Records are split in file order (no shuffling is performed here).
train_data = doc_summary_data[:train_size]
val_data = doc_summary_data[train_size:train_size+val_size]
test_data = doc_summary_data[train_size+val_size:]

data_list = [
    ("train", train_data),
    ("validation", val_data),
    ("test", test_data),
]

for split_name, split_records in data_list:
    _file_path = os.path.join(os.environ["DATASET_PATH"], f"{split_name}.json")
    # Existing split files are never overwritten, so a re-run keeps the
    # previously written partition.
    if not os.path.exists(_file_path):
        with open(_file_path, "w", encoding="utf-8") as f:
            json.dump(split_records, f, indent=4)

# Drop the in-memory copies; later cells reload the splits from disk.
del doc_summary_data, train_data, val_data, test_data, data_list, split_name, split_records, train_size, val_size
gc.collect()

### Load Dataset into `DatasetDict` Format to be Consumed by the HuggingFace LLM

In [None]:
# Load the dataset found under DATASET_PATH; the cells below access its
# "train", "validation" and "test" splits.
dataset = load_dataset(path=os.environ["DATASET_PATH"])
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")
print(f"Test dataset size: {len(dataset['test'])}")

# Preview one random training record. randrange() excludes the upper bound,
# whereas randint(0, len) could return len itself and raise an IndexError.
dataset["train"][random.randrange(len(dataset["train"]))]

### Load Corresponding LLM Tokenizer

In [None]:
# Load the tokenizer from TOKENIZER_REPOSITORY, with MAX_TOKENS as its maximum length.
tokenizer_repo = os.environ["TOKENIZER_REPOSITORY"]
max_tokens = int(os.environ["MAX_TOKENS"])
tokenizer = LlamaTokenizer.from_pretrained(tokenizer_repo, model_max_length=max_tokens)

### Use Tokenizer Object to Retrieve the Maximum Source (Text) and Target (Summary) Token Counts in the Data

In [None]:
# Measure token counts over all three splits so the length budget covers every record.
concatenated_dataset = concatenate_datasets(
    [dataset[split] for split in ("train", "validation", "test")]
)

# Tokenize documents and summaries separately; the raw text columns are
# dropped so only the tokenizer output remains.
tokenized_inputs = concatenated_dataset.map(
    lambda batch: tokenizer(batch["document"], truncation=True),
    batched=True,
    remove_columns=["document", "summary"],
)
tokenized_targets = concatenated_dataset.map(
    lambda batch: tokenizer(batch["summary"], truncation=True),
    batched=True,
    remove_columns=["document", "summary"],
)

# Longest sequence in each column plus a 64-token margin.
max_source_length = max(map(len, tokenized_inputs["input_ids"])) + 64
max_target_length = max(map(len, tokenized_targets["input_ids"])) + 64

print(f"Max source length: {max_source_length}")
print(f"Max target length: {max_target_length}")

### Tokenize Dataset and Persist Tokens to Disk

In [None]:
# Value written into label positions that are masked out (the prompt tokens);
# also passed to the data collator as its label padding id.
LABEL_PAD_TOKEN_ID = -100
# Forwarded to preprocess_function as `train_on_input`.
TRAIN_ON_INPUT = False


def generate_prompt(document):
    """Return the instruction prompt asking for a concise summary of `document`.

    The prompt ends with the "SUMMARY:" cue, after which the summary text is
    expected to follow.
    """
    return (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n"
        f"Generate a concise summary of this document:\n\n DOCUMENT: \n{document} \n\n SUMMARY:"
    )
def preprocess_function(
    sample, 
    max_seq_length: int, 
    padding: str="max_length", 
    train_on_input: bool=False):
    """Tokenize one document/summary record into causal-LM training features.

    The prompt built by `generate_prompt` and the reference summary are
    tokenized as a single sequence. `input_ids` always holds that full
    sequence, so the model is teacher-forced on the real summary tokens, and
    `labels` has the same length. When `train_on_input` is False, the label
    positions covering the prompt are set to `LABEL_PAD_TOKEN_ID` so that only
    the summary (and the closing EOS token) act as training targets.

    Args:
        sample: A single record providing the "document" and "summary" keys.
        max_seq_length: Maximum number of tokens; longer sequences are truncated.
        padding: Accepted for interface compatibility and currently unused;
            sequences are returned unpadded and padded later by the data collator.
        train_on_input: When True, the prompt tokens are kept as training
            targets as well.

    Returns:
        The tokenizer output with `input_ids`, `labels` and `attention_mask`
        replaced by equal-length lists.
    """
    prompt = generate_prompt(sample['document'])
    tokenization_result = tokenizer(
        f"{prompt} {sample['summary']}",
        max_length=max_seq_length,
        padding=False,
        truncation=True,
    )
    input_tokens = list(tokenization_result["input_ids"])

    # Close the sequence with EOS (when there is room and the tokenizer did not
    # already add it) so the end of a summary is itself a training target.
    eos_token_id = tokenizer.eos_token_id
    if (
        eos_token_id is not None
        and len(input_tokens) < max_seq_length
        and (not input_tokens or input_tokens[-1] != eos_token_id)
    ):
        input_tokens.append(eos_token_id)

    # Labels start as an exact copy so both lists always have the same length.
    label_tokens = list(input_tokens)

    if not train_on_input:
        prompt_tokens = tokenizer(
            prompt,
            max_length=max_seq_length,
            padding=False,
            truncation=True,
        )["input_ids"]
        # If the tokenizer appends EOS on its own, that token is not part of
        # the prompt prefix inside the full sequence.
        if prompt_tokens and eos_token_id is not None and prompt_tokens[-1] == eos_token_id:
            prompt_tokens = prompt_tokens[:-1]
        # Never mask beyond the end of a (possibly truncated) sequence.
        prompt_tokens_len = min(len(prompt_tokens), len(label_tokens))
        # Mask only the labels; the inputs keep the real tokens.
        label_tokens = ([LABEL_PAD_TOKEN_ID]*prompt_tokens_len) + label_tokens[prompt_tokens_len:]

    tokenization_result["input_ids"] = input_tokens
    tokenization_result["labels"] = label_tokens
    # No padding is added here, so every position is a real token.
    tokenization_result["attention_mask"] = [1]*len(input_tokens)
    return tokenization_result

def preprocess_lambda(dataset):
    """Run `preprocess_function` on one record with the notebook-wide settings."""
    return preprocess_function(
        dataset,
        max_source_length + max_target_length,
        train_on_input=TRAIN_ON_INPUT,
    )


# preprocess_function handles one record at a time, so the map must stay unbatched.
tokenized_dataset = dataset.map(
    preprocess_lambda,
    batched=False,
    remove_columns=["document", "summary", "id"],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Persist every tokenized split under TOKENS_DATA_PATH for easy reloading later.
for split_name in ("train", "validation", "test"):
    tokenized_dataset[split_name].save_to_disk(
        os.path.join(os.environ["TOKENS_DATA_PATH"], split_name)
    )

## **MODEL PREPARATION**

### Load 8-bit Quantized HuggingFace LLM into Memory

In [None]:
# Load the base LLM from LLM_REPOSITORY with 8-bit loading enabled and
# automatic device placement.
model = LlamaForCausalLM.from_pretrained(
    os.environ["LLM_REPOSITORY"],
    device_map="auto",
    load_in_8bit=True,
)

### Define Low Rank Adaptation Configurations Object and apply to Loaded LLM for Parameter Efficient Finetuning

In [None]:
# Prepare the int8 model for training before any adapter is attached.
model = prepare_model_for_int8_training(model)

# LoRA settings: adapters on the q_proj and v_proj modules, causal-LM task.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the LoRA adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

### Define Data Collator Object

In [None]:
# Collator that pads each batch: labels are padded with LABEL_PAD_TOKEN_ID and
# lengths are rounded up to a multiple of 8.
collator_options = dict(
    model=model,
    padding=True,
    pad_to_multiple_of=8,
    label_pad_token_id=LABEL_PAD_TOKEN_ID,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)

## **MODEL FINETUNING / TRAINING**

### Define Trainer Object and Commence LoRA Finetuning

In [None]:
# Run outputs go to a directory named after the last path segment of the LLM repository.
OUTPUT_DIR = F"lora-{os.environ['LLM_REPOSITORY'].split('/')[-1]}"
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3

# Training configuration: no checkpoints are saved during training and logs
# are written for TensorBoard every 500 steps.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    auto_find_batch_size=True,
    save_strategy="no",
    logging_strategy="steps",
    logging_steps=500,
    logging_dir=os.path.join(OUTPUT_DIR, "logs"),
    report_to="tensorboard",
)

# Only a training set is supplied to the trainer.
trainer = Trainer(
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
    args=training_args,
    model=model,
)

# Disable the generation cache while training; set it back to True for inference.
model.config.use_cache = False

# Start finetuning.
trainer.train()

### Persist LoRA Model Weights to Disk

In [None]:
# Save our LoRA model & tokenizer results
PEFT_MODEL_ID=f"{os.environ['LLM_REPOSITORY'].split('/')[-1]}_finetuned_results"
trainer.model.save_pretrained(PEFT_MODEL_ID)
tokenizer.save_pretrained(PEFT_MODEL_ID)

# Release the training objects before the model is loaded again for evaluation.
# The trainer and the data collator were both constructed with the model (and
# the collator with the tokenizer), so deleting only `model` and `tokenizer`
# would leave those references alive and nothing would actually be freed.
del trainer, data_collator, model, tokenizer
gc.collect()
torch.cuda.empty_cache()

## **MODEL EVALUATION**

### Load LoRA Weights from Disk to Perform Inference on Test Dataset

In [None]:
# Read the saved adapter configuration; it records which base model the adapter belongs to.
config = PeftConfig.from_pretrained(PEFT_MODEL_ID)
base_model_id = config.base_model_name_or_path

# Reload the base LLM and its tokenizer from that base model id.
model = LlamaForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    load_in_8bit=True,
)
tokenizer = LlamaTokenizer.from_pretrained(base_model_id)

# Attach the saved LoRA weights to the base model.
model = PeftModel.from_pretrained(
    model,
    PEFT_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float32,
)

### Generating Summaries from Documents in Test Data

In [None]:
# Generate a summary for the first `n_docs` test documents. Each entry of
# `predicted_summaries` holds the full decoded sequence (prompt + generated text).
predicted_summaries = []
n_docs = 5
input_documents = dataset["test"]["document"][:n_docs]
target_summaries = dataset["test"]["summary"][:n_docs]

model.eval()
for document in input_documents:
    prompt = generate_prompt(document)
    # Place the inputs on the model's device; the tokenizer returns CPU
    # tensors while the model was loaded with device_map="auto".
    prompt_tokens = tokenizer(
        prompt,
        return_tensors="pt"
    ).to(model.device)
    # NOTE(review): per the transformers generation docs, temperature/top_p/top_k
    # only take effect when do_sample=True; confirm whether sampling was intended.
    summary_tokens = model.generate(
        **prompt_tokens,
        return_dict_in_generate=True,
        temperature=0.1,
        top_p=0.15,
        top_k=0,
        repetition_penalty=1.1,
        max_new_tokens=256,
    )
    summary = tokenizer.decode(summary_tokens.sequences[0], skip_special_tokens=True)
    print(f"Document: {document}\n")
    # The decoded text starts with the prompt, so strip it for display.
    print(f"SUMARY: {summary.replace(prompt, '').replace('</s>', '')}\n\n")
    predicted_summaries.append(summary)

## **PERFORMANCE MEASUREMENT**

### Compare Generated Summaries to Target Summaries with the Rouge Score and the Cosine Similarity Metric

In [None]:
# Score each generated summary against its reference with cosine similarity of
# sentence embeddings and with ROUGE.
rouge = Rouge()
embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"])
embeddings_model.to(os.environ["DEVICE"])
_zipped_data = zip(input_documents, predicted_summaries, target_summaries)

for i, (document, predicted_summary, target_summary) in enumerate(_zipped_data):
    prompt = generate_prompt(document)
    # Strip the prompt and any literal end-of-sequence marker once, and use the
    # SAME cleaned text for both metrics. Scoring ROUGE on the raw prediction
    # would also count the prompt and the source document it contains.
    predicted_text = predicted_summary.replace(prompt, '').replace('</s>', '')
    target_text = target_summary.replace(prompt, '').replace('</s>', '')

    pred_embeddings = embeddings_model.encode(predicted_text).reshape(1, -1)
    target_embeddings = embeddings_model.encode(target_text).reshape(1, -1)
    cos_similarity = cosine_similarity(target_embeddings, pred_embeddings)
    print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")

    # NOTE(review): the rouge package is expected to reject empty hypotheses or
    # references, so blank texts are reported instead of being scored.
    if not predicted_text.strip() or not target_text.strip():
        print(f"Rouge scores for summary {i+1}:", "skipped (empty prediction or target)", "\n\n")
        continue
    rouge_scores = rouge.get_scores(predicted_text, target_text)
    print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n\n")
