# FLAN Fine Tuning Prototype
By: Dahlia Ma

This notebook is used to test the feasibility of fine-tuning the FLAN-T5 at different model sizes, with and without QLORA.

Here is a reference to run LLMs using GPU on Mac: https://sebastianraschka.com/blog/2022/pytorch-m1-gpu.html

### Install & Import requirements

In [1]:
# install requirements (if needed)
!pip install -r 'requirements.txt'



In [2]:
import time

import torch
import torch.nn.functional as F

from peft import (
    prepare_model_for_kbit_training
    ,LoraConfig
    ,get_peft_model
    ,PeftModel
)

from peft.tuners.lora import LoraLayer

from transformers import (
    AutoModelForSeq2SeqLM
    ,AutoTokenizer
    ,BitsAndBytesConfig
    ,StoppingCriteria
    ,StoppingCriteriaList
    ,TrainingArguments
)

from trl import SFTTrainer
from accelerate import Accelerator

# CUDA is not available on Mac, but 'mps' is analagous to CUDA
torch.device("mps")
print(f'Is CUDA available on torch? {torch.cuda.is_available()}')
print(f'Is MPS available on torch? {torch.backends.mps.is_available()}')

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Is CUDA available on torch? False
Is MPS available on torch? True




## Set general variables

In [3]:
device_type = "mps"  # set device to run model

## Load model & tokenizer

In [4]:
# load pre-trained model
model_name = "google/flan-t5-small"

''' 4-bit quantization requires more work to initiate; so won't use for prototype
# config for 4-bit quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True
    ,bnb_4bit_use_double_quant=False
    ,bnb_4bit_compute_dtype=torch.float16
)
'''

# config for 8-bit quantization for QLoRA
bnb_config = BitsAndBytesConfig(
    # load_in_8bit = True  # CUDA not supported for Mac OS so cannot use this parameter
    llm_int8_threshold = 6.0  # if outlier is above this threshold, then will run with lower fp16 precision
    ,llm_int8_enable_fp32_cpu_offload = True
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name
    # ,device_map = device_type
    ,quantization_config=bnb_config
    ,trust_remote_code=True
)

# set up and load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name 
    ,trust_remote_code = True
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # set padding to the right to avoid issues with fp16 (when using 4-bit quantization)

# check model parameters & structure
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

## Set PEFT parameters

In [5]:
peft_config = LoraConfig(
    lora_alpha = 16               # controls the scaling factor of loralib.linear weights
    ,lora_dropout = 0.1           # dropout rate of loralib.linear
    ,r = 64                       # dimension of low-rank matrix in lora adaptor
    ,bias = "none"
    ,task_type = "SEQ_2_SEQ_LM"
)

In [6]:
# add low rank adaptor to model
model.add_adapter(peft_config, adapter_name="adapter_1")

# Model Fine Tuning

In [7]:
from huggingface_hub import notebook_login
notebook_login() # log into HuggingFace Hub

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import wandb
wandb.login() # log into wandb
%env WANDB_PROJECT=flan-t5-fine-tuning

[34m[1mwandb[0m: Currently logged in as: [33mma-dahlia25[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_PROJECT=flan-t5-fine-tuning


## Load and split data

In [9]:
from datasets import load_dataset

In [50]:
data = load_dataset("json", data_files = 'trembling_qa_data.json')
data

Downloading and preparing dataset json/default to /Users/dahliama/.cache/huggingface/datasets/json/default-22288e17529495c0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/dahliama/.cache/huggingface/datasets/json/default-22288e17529495c0/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 168
    })
})

In [51]:
data['train'][0]

{'question': 'What are the potential causes of a dog shivering or trembling?',
 'answer': 'Causes may include joy, toxic food ingestion, pain, old age, nausea, poisoning, and more.'}

## Train the model

In [52]:
# define function to format data to prompt instruction format
def prompt_instruction_format(sample):
    return f"""### Instruction:
    You are a friendly and patient professional who cares about dogs. Use the given Input below to write the Response.
    If you have not seen a similar input to Input, politely respond that it is not within your knowledge as a Response.

    ### Input:
    {sample['question']}

    ### Response:
    {sample['answer']}
    """

In [53]:
# define training arguments to fine-tune model
trainingArgs = TrainingArguments(
    output_dir=model_name
    ,num_train_epochs=10
    ,per_device_train_batch_size=4  # batch size per GPU for training
    ,gradient_accumulation_steps=2
    ,gradient_checkpointing=True
    ,optim="paged_adamw_32bit"
    ,logging_steps=3                # log onto console ever 'x' steps
    ,save_strategy="epoch"          # save after every epoch
    ,learning_rate=2e-4
    ,weight_decay=0.001
    ,max_grad_norm=0.3
    ,warmup_ratio=0.03
    ,group_by_length=False
    ,lr_scheduler_type="cosine"
    ,disable_tqdm=True
    ,report_to="wandb"
    ,seed=55
)

In [54]:
# Create the trainer
trainer = SFTTrainer(
    model=model
    ,train_dataset=data['train']
    ,peft_config=peft_config
    ,max_seq_length=2048
    ,tokenizer=tokenizer
    ,packing=True
    ,formatting_func=prompt_instruction_format
    ,args=trainingArgs
)

In [55]:
# Train/fine tune model
trainer.train()

NameError: name 'str2optimizer32bit' is not defined

In [60]:
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced
