# Finetuning for Question Answering Using LoRA

In [1]:
# install required packages
!pip install -q bitsandbytes
!pip install -q --upgrade transformers 
!pip install -q --upgrade accelerate
!pip install -q sentencepiece
!pip install -q datasets
!pip install -q --upgrade tensorboard
!pip install -q peft


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgr

In [2]:
# import required libraries
import os
import torch

import pandas as pd
import numpy as np

from datasets import load_dataset, Dataset

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

from peft import LoraConfig, get_peft_model, TaskType
from peft import prepare_model_for_int8_training
from peft import PeftModel, PeftConfig

from huggingface_hub import notebook_login

In [3]:
# check for GPU availibility 
display(torch.cuda.is_available())

display(torch.cuda.device_count())

True

1

## Load Model and Tokenizer

In [4]:
# load model and tokenizer 
model_name = "google/flan-t5-base"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# code from [1]
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 87145728 || all params: 247577856 || trainable%: 35.199322511299236


## Load Data

In [6]:
# load data 
nexodus_qa_df = pd.read_csv('../data/results/nexodus_qa_df.csv', index_col=0)

display(nexodus_qa_df.head())
display(nexodus_qa_df.shape)

Unnamed: 0,questions,answers,generated_answers,extracted_answers,abstracted_answers
0,What is the purpose of the `ifconfig` command ...,The `ifconfig` command is used to configure th...,The ifconfig command in Linux is used to confi...,ipprotocol ipv4 proto frompor...,all rules are applied only to the driver inter...
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil...",this guide will walk you through getting your ...,sudo ip link del wg0 osxwindows
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...,WireGuard is a software application that prote...,relay node the relay needs to have v6 forwardi...,tunneling mode
3,How would you install WireGuard on a Linux mac...,"Typically, you would use a package manager suc...","To install WireGuard on a Linux machine, you w...",relay node the relay needs to have v6 forwardi...,shell
4,How do you check the current IP address of you...,You can use the `ip addr show` command to disp...,"In Linux, you can check the current IP address...",ipprotocol ipv4 proto frompor...,ip_protocol


(100, 5)

In [7]:
# train test split
train_df = nexodus_qa_df.sample(frac=0.8,random_state=42)
test_df = nexodus_qa_df.drop(train_df.index)

display(len(train_df))
display(len(test_df))

80

20

## Generate Answers

In [8]:
# code from [2]
def get_answers(question, context=""):
    question_context = f"Question: ## {question} ##\n Context: ## {context} ##"
    input_ids = tokenizer(question_context, return_tensors="pt", truncation=True).input_ids.cuda()
    outputs = model.generate(input_ids=input_ids, max_new_tokens=1000, do_sample=True, top_p=1)
    answer = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    return answer

In [9]:
# generate answers to test questions 
pretrained_answers = []
for i, row in test_df.iterrows():
    pretrained_answers.append(get_answers(row.questions))

# add to test_df
test_df['pretrained_answers'] = pretrained_answers
pretrained_answers[:5]

["In Linux's linux framework, setting up an IP address will create a small temporary port for local data.",
 'wireguards',
 'To store a copy of the resolution of the kernel configuration files',
 'the process to synchronize a remote directory',
 "To do this, you'd first need to add an ethernet connection to the IP address and the host computer."]

## Preprocess Data

In [10]:
# convert training data to dataset object
train_data = Dataset.from_pandas(train_df)
display(train_data)

Dataset({
    features: ['questions', 'answers', 'generated_answers', 'extracted_answers', 'abstracted_answers', '__index_level_0__'],
    num_rows: 80
})

In [14]:
# code adapted from [1], [2]

# tokenize questions and answers
tokenized_questions = train_data.map(lambda x: tokenizer(x["questions"],
                                     truncation=True),
                                     batched=True,
                                     remove_columns=['questions', 'answers', 'generated_answers', 'extracted_answers', 'abstracted_answers'])

tokenized_answers = train_data.map(lambda x: tokenizer(x["answers"],
                                   truncation=True),
                                   batched=True,
                                   remove_columns=['questions', 'answers'])

# inputs/outputs longer than the max with be concatenated, inputs/outputs shorter will be padded
input_lengths = [len(x) for x in tokenized_questions["input_ids"]]
max_source_length = int(np.percentile(input_lengths, 100))
print(f'Max source length: {max_source_length}')
      
output_lengths = [len(x) for x in tokenized_answers["input_ids"]]
max_target_length = int(np.percentile(output_lengths, 100))
print(f'Max target length: {max_target_length}')

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Max source length: 29
Max target length: 80


In [15]:
# code adapted from [1]
def preprocess_function(example):
    # add prefix to the input for t5
    inputs = [f"Question: ## {q} ##\n Context: ## {c} ##" for q, c in zip(example["questions"], example["extracted_answers"])] # use extracted answers for context
    # tokenize inputs
    model_inputs = tokenizer(inputs,
                             max_length=max_source_length,
                             padding="max_length",
                             truncation=True)
    
    outputs = ["Answer: " + item for item in example["answers"]]
    
    labels = tokenizer(text_target=outputs,
                       max_length=max_target_length,
                       padding="max_length",
                       truncation=True)
  
    
    labels["input_ids"] = [
    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
                          ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
preprocessed_train_data = train_data.map(preprocess_function,
                                     batched=True,
                                     remove_columns=["questions", "answers", 'generated_answers', 'extracted_answers', 'abstracted_answers' ])
preprocessed_train_data

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

## Finetune with LoRA and 8-bit Quantization

In [17]:
# define data collator which pads inputs and labels
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100, # ignore tokenizer pad token in the loss
    pad_to_multiple_of=8
)

In [18]:
output_dir="models/lora-flan-t5-base"

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
    )

# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)


# define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=10,
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="tensorboard"
)

# create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=preprocessed_train_data
)

# set to false during training
model.config.use_cache = False



In [19]:
# train model
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss




TrainOutput(global_step=100, training_loss=1.9549897766113282, metrics={'train_runtime': 43.3856, 'train_samples_per_second': 18.439, 'train_steps_per_second': 2.305, 'total_flos': 34509658521600.0, 'train_loss': 1.9549897766113282, 'epoch': 10.0})

In [10]:
# push to HuggingFace
!notebook_login()
nexodus_flan_T5.push_to_hub('exyou/nexodus-flan-t5')
# save LoRA model to local directory
nexodus_flan_T5.save_pretrained('../models/nexodus-flan-t5')

adapter_model.bin:   0%|          | 0.00/7.13M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/exyou/nexodus-flan-t5/commit/4b159c14ebdab7c66aee0a85a75477d3e4ad9c59', commit_message='Upload model', commit_description='', oid='4b159c14ebdab7c66aee0a85a75477d3e4ad9c59', pr_url=None, pr_revision=None, pr_num=None)

## Load Model for Inference

In [21]:
model = PeftModel.from_pretrained(model, '../models/nexodus-flan-t5')

In [22]:
finetuned_answers = []
for i, row in test_df.iterrows():
    finetuned_answers.append(get_answers(row.questions))
    
test_df['finetuned_answers'] = finetuned_answers
finetuned_answers[:5]



['Answer: You would add a static IP to a Linux machine using the net-add path command, e.g. net-add link add [net_add 0].',
 'Answer: WireGuard is the name of several programs. Most programs are hosted on the same servers, allowing IP addresses to be displayed.',
 'Answer: The /etc/resolv.conf file is used to set up the default resolving configuration files, such as /etc/resolv.conf using the /etc/resolv.conf plug-in.',
 'Answer: The ip command is used to configure packet forwarding, like a tunnel or an IP address, using its iproute command. This command is used to specify incoming traffic of IP addresses.',
 'Answer: You would add a static IP to a Linux machine using the ip addr addr show command and then add the IP with the "ip addr addr add" command.']

In [23]:
# sanity check to ensure results are saved to test_df
display(test_df.head())

Unnamed: 0,questions,answers,generated_answers,extracted_answers,abstracted_answers,pretrained_answers,finetuned_answers
1,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil...",this guide will walk you through getting your ...,sudo ip link del wg0 osxwindows,"In Linux's linux framework, setting up an IP a...",Answer: You would add a static IP to a Linux m...
2,What is WireGuard?,WireGuard is an open-source VPN solution that ...,WireGuard is a software application that prote...,relay node the relay needs to have v6 forwardi...,tunneling mode,wireguards,Answer: WireGuard is the name of several progr...
14,What is the purpose of the `/etc/resolv.conf` ...,The `/etc/resolv.conf` file is used to configu...,The /etc/resolv.conf file in Linux is used to ...,in conclusion in the short term since we are n...,cross organization device sharing,To store a copy of the resolution of the kerne...,Answer: The /etc/resolv.conf file is used to s...
20,What is the purpose of the `ip` command in Linux?,The `ip` command is a powerful tool for manipu...,The ip command in Linux is used to connect to ...,ipprotocol ipv4 proto frompor...,starting port range,the process to synchronize a remote directory,Answer: The ip command is used to configure pa...
21,How would you add a static IP to a Linux machine?,You can add a static IP to a Linux machine by ...,"To add a static IP to a Linux machine, you wil...",this guide will walk you through getting your ...,sudo ip link del wg0 osxwindows,"To do this, you'd first need to add an etherne...",Answer: You would add a static IP to a Linux m...


In [24]:
# save test_df to csv file
test_df.to_csv("../data/results/test_df.csv")

## Citations

1. https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/peft-flan-t5-int8-summarization.ipynb

2. https://github.com/redhat-et/foundation-models-for-documentation/blob/master/notebooks/finetune/Flan-T5-3B/RosaQA.ipynb