# Low-Rank Adaptation (LoRA) - a type of parameter efficient fine tuning (PEFT)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [None]:
"""
bitsandbytes: for representing models using smaller datatypes, saving on memory
datasets: for downloading datasets
accelerate: required dependency for machine learning interoperability for some of the modules
loralib: LoRA implementation
peft: a general "parameter efficient fine tuning" module, our interface for LoRA
transformers: for downloading and using pre-trained transformers from huggingface
"""

!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/peft.git git+https://github.com/huggingface/transformers.git

!pip install transformers -U

In [None]:
# importing dependencies and downloading pre-trained bloom model

import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

# Single place to choose the checkpoint. Other BLOOM sizes that can be swapped in:
#   "bigscience/bloom-3b"
#   "bigscience/bloom-1b1"
MODEL_NAME = "bigscience/bloom-560m"

# loading model (weights are requested in float16 to reduce memory use)
model= AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
)

# loading tokenizer for the model (which turns text into an input for the model).
# It is loaded from the same checkpoint as the weights, so the two always match
# when MODEL_NAME is changed.
tokenizer= AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
"""
LoRA hyperparameters used in this notebook:

r: rank of the low-rank A and B matrices.
lora_alpha: a scaling factor. Opinions on how to set it differ; here it is set equal to r.
target_modules: which parts of the model receive LoRA adapters. BLOOM has parameters named
  query_key_value, which are the ones adapted here.
lora_dropout: probability with which inputs are hidden during training, a regularization
  technique that discourages overfitting.
bias: neural networks typically have two parameters per connection, a "weight" and a "bias".
  Only weights are trained in this example.
task_type: used by the PeftConfig superclass. Set to CAUSAL_LM because the language model
  used here is "causal".
"""

from peft import LoraConfig, get_peft_model

# the settings are collected in one mapping so they can be read at a glance
lora_settings = dict(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=8,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)
config = LoraConfig(**lora_settings)

# get_peft_model modifies `model` in memory, so `peft_model` is a second name kept
# for legibility rather than an independent copy
peft_model= get_peft_model(model, config)

In [6]:
# comparing parameters before and after LoRA

# one (element count, requires_grad) pair per parameter tensor
param_stats = [
    (tensor.numel(), tensor.requires_grad)
    for _, tensor in peft_model.named_parameters()
]

# total size of the model, and the part of it that will receive gradients
all_param = sum(count for count, _ in param_stats)
trainable_params = sum(count for count, needs_grad in param_stats if needs_grad)

# printing results
print(f'All params: {all_param}')
print(f'Trainable params: {trainable_params}')
print(f'Trainable params rate: {100 * trainable_params/all_param:.2f}%')

All params: 560001024
Trainable params: 786432
Trainable params rate: 0.14%


In [7]:
# loading Fine Tuning Dataset - SQUAD

from datasets import load_dataset

# 'squad_v2' is the dataset identifier handed to load_dataset. The result is later
# mapped through the prompt template, and its "train" split is what the trainer uses.
qa_dataset= load_dataset('squad_v2')

In [8]:
"""
The model will expect text in this general form:

**CONTEXT:**
{context}

**QUESTION:**
{question}

**ANSWER:**
{answer}</s>
"""

# reformatting SQUAD to respect our defined structure
# defining a function for reformatting
def create_prompt(context, question, answer):
    """Render one question/answer example as a single training prompt.

    The section headers use the same `**CONTEXT:**` / `**QUESTION:**` /
    `**ANSWER:**` markers as the layout documented at the top of this cell,
    so the text the model is fine-tuned on has the documented shape.

    Args:
        context: passage the question refers to.
        question: question text.
        answer: mapping with a 'text' entry holding a list of answer strings.
            The list may be empty.

    Returns:
        The formatted prompt, ending with the `</s>` marker. The first answer
        string is used; when the list is empty the answer is
        'Cannot find answer'.
    """
    answer_texts = answer['text']
    if len(answer_texts) < 1:
        answer_text = 'Cannot find answer'
    else:
        answer_text = answer_texts[0]

    return (
        f'**CONTEXT:**\n{context}\n\n'
        f'**QUESTION:**\n{question}\n\n'
        f'**ANSWER:**\n{answer_text}</s>'
    )


# applying the reformatting function to the entire dataset
def _tokenize_example(sample):
    """Build the prompt for one dataset example and return its tokenization."""
    prompt = create_prompt(sample['context'], sample['question'], sample['answers'])
    return tokenizer(prompt)


mapped_qa_dataset= qa_dataset.map(_tokenize_example)

In [None]:
"""
Fine Tuning on SQUAD using LoRA
This code is largely co-opted. In the absence of a rigid validation procedure, the best practice is
to just copy a successful tutorial or, better yet, directly from the documentation.
"""

import transformers

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=mapped_qa_dataset["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # NOTE(review): warmup_steps equals max_steps, so the warm-up spans the
        # whole run - confirm this is intended.
        warmup_steps=100,
        max_steps=100,
        learning_rate=1e-3,
        fp16=True,
        logging_steps=1,
        output_dir='outputs',
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# The cache is switched off during training to silence the warnings. The previous
# setting is restored afterwards (even if training is interrupted) so that the
# inference cells further down run with it re-enabled.
cache_setting_before_training = peft_model.config.use_cache
peft_model.config.use_cache = False
try:
    train_result = trainer.train()
finally:
    peft_model.config.use_cache = cache_setting_before_training

# last expression of the cell, so the training summary is still displayed
train_result

In [None]:
# saving the LoRA fine tuning locally

# name of the local output location; it is passed to save_pretrained and listed below
model_id= 'BLOOM-560m-LoRA'
peft_model.save_pretrained(model_id)

# checking how large the file is in our file system
# (IPython substitutes {model_id} with the Python value before running the shell command)
!ls -lh {model_id}

In [None]:
"""
The BLOOM 560m model, in its float 16 datatype, is over 1 gigabyte in total size. With LoRA, and
us only needing to save the decomposed matrices, our checkpoint size is a mere 3 megabytes.
"""

# Testing - Helper Function for Comparing Results

from IPython.display import display, Markdown

def make_inference(context, question):
    """Generate an answer with and without the LoRA adapters and display both.

    The prompt is built with the `**CONTEXT:**` / `**QUESTION:**` / `**ANSWER:**`
    headers and ends right after the answer header, so the model's continuation
    is the answer.

    Args:
        context: passage to place under the context header.
        question: question to place under the question header.

    Returns:
        None. The two generations are rendered as Markdown under the
        headings "Raw Model" and "QA Model".
    """
    # turn the input into tokens
    prompt = f'**CONTEXT:**\n{context}\n\n**QUESTION:**\n{question}\n\n**ANSWER:**\n'
    batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False)

    # move the tokens to wherever the model lives instead of assuming a GPU
    device = model.device
    batch = batch.to(device)

    # make an inference with both the fine tuned model and the raw model;
    # mixed precision is only switched on when the model is actually on CUDA
    with torch.autocast(device_type='cuda', enabled=(device.type == 'cuda')):
        # The adapters are deliberately not merged into the base weights: keeping them
        # separate allows comparing before and after fine tuning with one model.

        # raw model: adapters off; the finally clause guarantees they are switched
        # back on even if generation raises
        peft_model.disable_adapter_layers()
        try:
            output_tokens_raw = model.generate(**batch, max_new_tokens=200)
        finally:
            peft_model.enable_adapter_layers()

        # LoRA model
        output_tokens_qa = peft_model.generate(**batch, max_new_tokens=200)

    # display results
    display(Markdown('# Raw Model\n'))
    display(Markdown(tokenizer.decode(output_tokens_raw[0], skip_special_tokens=True)))
    display(Markdown('\n# QA Model\n'))
    display(Markdown(tokenizer.decode(output_tokens_qa[0], skip_special_tokens=True)))

In [None]:
# Example 1: the context describes what the "monster" eats and the question asks about food.
context = 'You are a monster, and you eat yellow legos.'
question= 'What is the best food?'

# shows the raw-model output followed by the fine-tuned output
make_inference(context, question)

In [None]:
# Example 2: a simple arithmetic question; the context only sets a persona.
context = 'you are a math wizard'
question= 'what is 1+1 equal to?'

# shows the raw-model output followed by the fine-tuned output
make_inference(context, question)

In [None]:
# Example 3: a riddle; the context does not contain the answer.
context = 'Answer the riddle'
question= 'What gets bigger the more you take away?'

# shows the raw-model output followed by the fine-tuned output
make_inference(context, question)

In [None]:
# The fine-tuned model still failed to answer the question, but it failed significantly more elegantly than the raw model.

In [None]:
# https://towardsdatascience.com/lora-intuitively-and-exhaustively-explained-e944a6bff46b