# Supervised Fine-Tuning of Llama2 on Financial Phrasebank

## Imports

In [1]:
import os
import sys
from pathlib import Path
from tqdm.notebook import tqdm

# Make the parent of the current working directory importable. Presumably this is
# where the project-local `llama` package imported further down lives (confirm).
SRC_DIRECTORY = Path.cwd().resolve().parent

if not any(entry == str(SRC_DIRECTORY) for entry in sys.path):
    sys.path.insert(0, str(SRC_DIRECTORY))

In [2]:
import logging

# Named logger for this notebook; DEBUG so that every record emitted below is shown.
logger = logging.getLogger('llama2_finetune')
logger.setLevel(logging.DEBUG)

# logging.getLogger() returns the same object on every call, so re-running this cell
# used to attach one more StreamHandler each time and every record was printed
# repeatedly. Only attach the console handler when the logger has none yet.
if not logger.handlers:
    consoleHandler = logging.StreamHandler()
    consoleHandler.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    consoleHandler.setFormatter(formatter)
    logger.addHandler(consoleHandler)

In [3]:
import pprint
# Pretty-printer instance with 4-space indentation; it is not referenced by any of
# the other cells visible in this notebook.
pp = pprint.PrettyPrinter(indent=4)

In [4]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaModel, LlamaConfig, TextGenerationPipeline
from transformers import TrainingArguments

In [5]:
from transformers import BitsAndBytesConfig

In [6]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

In [7]:
import bitsandbytes as bnb

In [8]:
import huggingface_hub

In [9]:
from trl import SFTTrainer

In [10]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, AutoPeftModelForCausalLM

In [11]:
from peft import AutoPeftModelForCausalLM

In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [13]:
from llama.instructions import TASK_MAP, llama2_prompt_generator

In [14]:
import wandb
import torch.nn as nn

In [15]:
from peft import TaskType

In [16]:
from collections import namedtuple

## Configuration

In [17]:
# Lightweight stand-in for parsed arguments: `task_name` is the key later used to
# look up the instruction and data entries in TASK_MAP.
# TODO: move more configs into my args?
Args = namedtuple("Args", "task_name")
args = Args("sentiment_analysis")

In [18]:
import uuid
from datetime import datetime

# Today's date as YYMMDD
date_str = datetime.now().strftime("%y%m%d")

# Run identifier: first 8 hex characters of a random UUID4, an underscore, then the date
uid = f"{uuid.uuid4().hex[:8]}_{date_str}"

print(uid)

7a029f70_231008


### HuggingFace

In [19]:
# Never commit an access token to the notebook. Create one at
# https://huggingface.co/settings/tokens (write permission is needed because datasets
# and models are pushed to the Hub below) and export it as HF_TOKEN in the shell that
# launches Jupyter.
# NOTE(review): the token that used to be hard-coded in this cell has been exposed and
# should be revoked.
hf_auth = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")

# With token=None, login() falls back to prompting for the token interactively.
huggingface_hub.login(token=hf_auth)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/AD/gmatlin3/.cache/huggingface/token
Login successful


In [20]:
# Hugging Face Hub namespace used as the prefix of the repository ids built below.
ORGANIZATION = "gtfintechlab"
# Base dataset name; it appears in the Hub repository ids and in the local output paths.
DATASET = "financial_phrasebank"

### Weights and Biases

In [42]:
# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"] = "llama2-fpb-sft"

# do NOT upload trained model checkpoints to wandb
os.environ["WANDB_LOG_MODEL"] = "false"

# turn off watch to log faster
os.environ["WANDB_WATCH"] = "false"

# The API key is deliberately not set here: secrets must not live in the notebook.
# Export WANDB_API_KEY in the shell that launches Jupyter; otherwise wandb.login()
# asks for the key interactively.
# NOTE(review): the key that used to be hard-coded in this cell has been exposed and
# should be revoked.
os.environ["WANDB_NOTEBOOK_NAME"] = "llama2_finetune_fpb"

In [43]:
# Authenticate this session with Weights & Biases.
wandb.login()

True

### CUDA

In [44]:
# Choose the compute dtype from the GPU's compute capability: a major version of 8
# or higher is treated as bfloat16-capable, anything older falls back to float16.
major, _ = torch.cuda.get_device_capability()
compute_dtype = torch.bfloat16 if major >= 8 else torch.float16

print("=" * 80)
print(
    "Your GPU supports bfloat16: accelerate training with bf16=True"
    if compute_dtype is torch.bfloat16
    else "Your GPU does not support bfloat16: using fp=16"
)
print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [45]:
CUDA_N_GPUS = torch.cuda.device_count()

# Build the per-device memory budget that is later passed as `max_memory`.
# Each GPU is queried individually: mem_get_info() without an argument only reports
# the current device, so copying that figure to every GPU overstates the budget of a
# card that is already partly in use. Two units are held back per device as headroom,
# and the result is clamped at 0 so the budget can never become negative.
CUDA_MAX_MEMORY = {
    i: f"{max(int(torch.cuda.mem_get_info(i)[0] / 1024 ** 3) - 2, 0)}GB"
    for i in range(CUDA_N_GPUS)
}
logger.info(
    f"Using k={CUDA_N_GPUS} CUDA GPUs with max memory {CUDA_MAX_MEMORY}"
)

2023-10-08 00:44:14,268 - llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}


In [46]:
# Placement setting passed as `device_map=` whenever a model is loaded below.
# Alternative that was tried: device_map = {"":0}
device_map = "auto"

## Financial PhraseBank

In [47]:
# Candidate random seeds; the next cell selects the first one for this run.
SEEDS = (5768, 78516, 944601)

In [48]:
# Seed for this run: passed to the train/test split and to the dataset shuffle.
SEED = SEEDS[0]

### Create splits on HuggingFace Hub

In [49]:
def decode(label_number):
    """Translate an integer class id of the Hub ``financial_phrasebank`` dataset into its label text.

    The dataset declares its ``label`` feature with the class order
    ``negative`` (0), ``neutral`` (1), ``positive`` (2). The previous mapping
    (0 -> POSITIVE, 1 -> NEGATIVE, 2 -> NEUTRAL) did not match that order and therefore
    attached the wrong target string to every sample.

    :param label_number: integer class id (0, 1 or 2)
    :return: ``"NEGATIVE"``, ``"NEUTRAL"`` or ``"POSITIVE"``
    :raises ValueError: if ``label_number`` is not one of the three known ids
    """
    if label_number == 0:
        return "NEGATIVE"
    if label_number == 1:
        return "NEUTRAL"
    if label_number == 2:
        return "POSITIVE"
    raise ValueError("Invalid label number")

In [50]:
def make_fpb_hub_datasets(seed=None):
    """Create seeded 80/20 train/test splits of Financial PhraseBank and push them to the Hub.

    Every configuration in ``configs`` is loaded, split with ``train_test_split``,
    converted to ``context``/``response`` columns (labels go through ``decode``) and
    pushed to ``{ORGANIZATION}/{DATASET}-{config_short}-{seed}`` as two private
    configs named ``train`` and ``test``.

    Previously the ``return`` sat inside the loop, so only the first configuration
    was ever processed. All configurations are now handled. The return value keeps
    its old shape so existing callers (``splits[seed]["train"]``) are unaffected.

    :param seed: ``random_state`` for the split; also used as the key of the result
        and as the suffix of the Hub repository id
    :return: ``{seed: {"train": Dataset, "test": Dataset}}`` holding the splits of the
        first configuration that was processed successfully, or ``None`` when every
        configuration failed
    """
    configs = [
        "sentences_allagree",
        "sentences_75agree",
        "sentences_66agree",
        "sentences_50agree",
    ]

    splits = {}

    for config in tqdm(configs, desc="Configs"):
        try:
            fpb_dataset = load_dataset("financial_phrasebank", config)
            config_short = config.replace("sentences_", "")

            texts = fpb_dataset["train"]["sentence"]
            labels = fpb_dataset["train"]["label"]

            # Splitting the data
            train_texts, test_texts, train_labels, test_labels = train_test_split(
                texts, labels, test_size=0.2, random_state=seed
            )

            config_splits = {
                "train": Dataset.from_dict({"context": train_texts, "response": list(map(decode, train_labels))}),
                "test": Dataset.from_dict({"context": test_texts, "response": list(map(decode, test_labels))}),
            }

            # Push to HF Hub: each split is stored as its own config of the same repository
            repo_id = f"{ORGANIZATION}/{DATASET}-{config_short}-{seed}"
            for split_name, split_dataset in config_splits.items():
                split_dataset.push_to_hub(repo_id, config_name=split_name, private=True)
        except Exception as e:
            # Best effort per configuration: report the failure and move on to the next one.
            print(f"Error processing config {config}: {str(e)}")
        else:
            # Only the first successful configuration is exposed through the return value.
            splits.setdefault(seed, config_splits)

    return splits if splits else None

In [51]:
# Build the seeded train/test splits and push them to the Hub; the returned dict is
# indexed by the seed and then by split name ("train" / "test").
splits = make_fpb_hub_datasets(SEED)

Configs:   0%|          | 0/4 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/803 [00:00<?, ?B/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

### Load split dataset from HF Hub

In [52]:
# Short name of the Financial PhraseBank configuration used in this run (the
# "sentences_" prefix is dropped, as in the Hub repository ids).
CONFIG = "allagree"

In [53]:
# TODO: figure out why datasets are either not being overwritten on hub or the fields are misnamed.
# Until that is resolved, the Hub round trip stays disabled (kept below for reference)
# and the in-memory splits returned by make_fpb_hub_datasets are used directly.

# fpb_train_dataset = load_dataset(f"{ORGANIZATION}/{DATASET}-{CONFIG}-{SEED}", "train")['train']
# fpb_test_dataset = load_dataset(f"{ORGANIZATION}/{DATASET}-{CONFIG}-{SEED}", "test")['train']

fpb_train_dataset = splits[SEED]['train']
fpb_test_dataset = splits[SEED]['test']

## Prompts

In [54]:
# Fetch the "instruction" and "data" entries registered in TASK_MAP for the selected task.
TASK_INSTRUCTION, TASK_DATA = (
    TASK_MAP[args.task_name][key] for key in ("instruction", "data")
)

## Model

In [72]:
def find_all_linear_names(model):
    """Return the distinct leaf names of all ``bnb.nn.Linear4bit`` sub-modules of ``model``.

    Only the last component of each dotted module path is kept, and ``lm_head`` is
    excluded from the result. The order of the returned list is not defined because
    the names are gathered in a set.

    Adapted from https://github.com/artidoro/qlora/blob/main/qlora.py; only the 4-bit
    layer class is considered here.
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

In [60]:
def print_trainable_parameters(model):
    """
    Print how many parameter elements of ``model`` require gradients, the total
    number of parameter elements, and the trainable share as a percentage.
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [92]:
def create_peft_config(modules):
    """Build the ``LoraConfig`` used to attach adapters to the given modules.

    :param modules: module names forwarded as ``target_modules``
    :return: a ``LoraConfig`` for causal language modelling
    """
    lora_settings = dict(
        # Modules that receive LoRA adapters
        target_modules=modules,
        # Dimension of the LoRA matrices updated in the adapters
        r=8,
        # Alpha parameter for LoRA scaling
        lora_alpha=32,
        # Dropout probability for LoRA layers
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    return LoraConfig(**lora_settings)

In [62]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length declared on ``model.config``.

    The attributes ``n_positions``, ``max_position_embeddings`` and ``seq_length`` are
    probed in that order and the first truthy value wins. When none is set, 1024 is
    used. The chosen value is also printed.
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            return max_length

    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Run ``tokenizer`` over the ``"text"`` entry of ``batch`` with truncation enabled
    and ``max_length`` as the limit, returning whatever the tokenizer returns.
    """
    tokenize_kwargs = {"max_length": max_length, "truncation": True}
    return tokenizer(batch["text"], **tokenize_kwargs)


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: "Dataset"):
    """Format and tokenize a dataset so it is ready for training.

    Steps, in order: add a ``text`` column with ``create_prompt_formats``, tokenize
    that column in batches with ``preprocess_batch``, drop every sample whose
    ``input_ids`` has ``max_length`` tokens or more, then shuffle.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed: seed passed to ``dataset.shuffle``
    :param dataset: dataset object exposing ``map``/``filter``/``shuffle`` (the old
        ``str`` annotation was wrong: the value is never used as a string)
    :return: the formatted, tokenized, filtered and shuffled dataset
    """
    # Imported locally so the function does not depend on the later notebook cell
    # that imports `partial` having been executed.
    from functools import partial

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize batch-wise; the original columns are kept alongside the tokenizer output
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
    )

    # Tokenization already truncates to max_length, so nothing can exceed it here.
    # This filter removes the samples that reached the limit.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [64]:
def create_prompt_formats(sample, context_field='context', response_field='response'):
    """Add a ``text`` entry holding the full training example (prompt plus target answer).

    The prompt is assembled as
    ``B_INST + B_SYS + <system prompt> + E_SYS + TASK_INSTRUCTION + <context> + E_INST``
    and the expected response is appended after it, separated by one space.

    Fixes compared with the previous version:

    * the response is now part of ``text``. Before, the model was trained on prompts
      that never contained the label, so there was nothing supervised to learn;
    * the system prompt no longer starts with a stray ``"`` (left over from ``f\"\"\"\"``)
      and its continuation lines no longer carry the source-code indentation;
    * validation checks the fields that are actually read (``context_field`` and
      ``response_field``) instead of the hard-coded default names.

    NOTE(review): prompts built for inference must use the same template (without the
    response). No end-of-sequence marker is appended here; confirm whether one should be.

    :param sample: mapping with string values under ``context_field`` and ``response_field``
    :param context_field: key of the input sentence
    :param response_field: key of the target answer
    :return: the same ``sample`` object with ``sample["text"]`` set
    :raises ValueError: if the task instruction is empty or not a string, if ``sample``
        is empty, or if one of the two fields is not a string
    """
    SYS_PROMPT = (
        "Discard all the previous instructions.\n"
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request."
    )
    INST_PROMPT = TASK_INSTRUCTION
    fields = [context_field, response_field]
    if not INST_PROMPT or not isinstance(INST_PROMPT, str):
        raise ValueError("Instruction must be a non-empty string.")
    if not sample or not all(isinstance(sample[field], str) for field in fields):
        raise ValueError("Fields must be a non-empty strings.")

    prompt = B_INST + B_SYS + SYS_PROMPT + E_SYS + INST_PROMPT + sample[context_field] + E_INST
    # Prompt followed by the answer the model is expected to produce.
    sample["text"] = f"{prompt} {sample[response_field]}"
    return sample

In [79]:
from transformers import DataCollatorForLanguageModeling

In [80]:
from llama.instructions import B_INST, E_INST, B_SYS, E_SYS
from functools import partial

In [124]:
# Size tag that selects which Llama-2 chat checkpoint is used.
n_parameters = '7b'
model_id = "meta-llama/Llama-2-" + n_parameters + "-chat-hf"
# Repository name without the organization prefix; used in local output paths.
model_name = model_id.rpartition('/')[2]

In [125]:
def train(model_id, dataset, seed, output_dir, gradient_checkpointing_enabled=True, epochs=5):
    """Fine-tune a 4-bit quantized causal LM with LoRA adapters and save the adapter weights.

    The tokenizer and model are loaded from ``model_id``, ``dataset`` is formatted and
    tokenized by ``preprocess_dataset``, LoRA adapters are attached to the layers found
    by ``find_all_linear_names`` and the result is trained with ``Trainer``.

    Reads the notebook-level globals ``compute_dtype``, ``device_map`` and
    ``CUDA_MAX_MEMORY``.

    Fixes compared with the previous version:

    * mixed precision follows ``compute_dtype`` instead of always requesting bf16;
    * the run length is given as ``num_train_epochs=epochs``. The old
      ``max_steps=epochs * len(dataset)`` counted optimizer steps, each of which
      consumes several samples, so training ran several times longer than ``epochs``;
    * save/log intervals are at least 1 step;
    * ``gradient_checkpointing_enabled=False`` is honoured by
      ``prepare_model_for_kbit_training`` as well.

    :param model_id: identifier passed to ``from_pretrained`` for tokenizer and model
    :param dataset: dataset handed to ``preprocess_dataset``
    :param seed: seed used to shuffle the preprocessed dataset
    :param output_dir: directory for Trainer outputs and the final saved adapter
    :param gradient_checkpointing_enabled: reduce memory use at the cost of compute time
    :param epochs: number of passes over the preprocessed dataset
    :return: None. The local model and trainer are deleted and the CUDA cache is
        emptied before returning.
    """
    bnb_config = BitsAndBytesConfig(
        # Activate 4-bit precision base model loading
        load_in_4bit=True,
        # Activate nested quantization for 4-bit base models (double quantization)
        bnb_4bit_use_double_quant=True,
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type="nf4",
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype=compute_dtype,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=device_map,
        max_memory=CUDA_MAX_MEMORY,
        # use_flash_attention_2=True, #not working i think its due to 11.4 cuda
        torch_dtype=compute_dtype,
        quantization_config=bnb_config,
    )
    max_length = get_max_length(model)
    preprocessed_dataset = preprocess_dataset(tokenizer=tokenizer,
                                              max_length=max_length,
                                              seed=seed,
                                              dataset=dataset)

    # Enabling gradient checkpointing to reduce memory usage during fine-tuning
    # gradient_checkpointing_enabled will slow down the compute time, but reduce memory usage
    if gradient_checkpointing_enabled:
        # NOTE: this replaces torch.utils.checkpoint.checkpoint for the whole process,
        # forcing use_reentrant=False on every later call.
        notfailing_checkpoint = partial(torch.utils.checkpoint.checkpoint, use_reentrant=False)
        torch.utils.checkpoint.checkpoint = notfailing_checkpoint
        model.gradient_checkpointing_enable()

    # Using the prepare_model_for_kbit_training method from PEFT. Its
    # use_gradient_checkpointing argument defaults to True, so the flag is forwarded
    # explicitly; otherwise checkpointing would be switched on even when disabled above.
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing_enabled
    )
    # Get lora module names
    layers_for_adapters = find_all_linear_names(model)
    print(f"Layers for PEFT Adaptation: {layers_for_adapters}")
    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(layers_for_adapters)
    model = get_peft_model(model, peft_config)
    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    # Batch layout. One optimizer update consumes batch size x accumulation steps samples.
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 4
    samples_per_update = per_device_train_batch_size * gradient_accumulation_steps
    # Approximate number of optimizer updates (single data-parallel worker assumed);
    # it is only used to place about 20 checkpoints / log lines over the whole run.
    updates_per_epoch = max(1, -(-len(preprocessed_dataset) // samples_per_update))
    report_interval = max(1, (epochs * updates_per_epoch) // 20)

    training_args = TrainingArguments(
        output_dir=str(output_dir),
        # Mixed precision must match the dtype the GPU supports
        bf16=(compute_dtype == torch.bfloat16),
        fp16=(compute_dtype == torch.float16),
        report_to="wandb",

        # Batch size per GPU for training
        per_device_train_batch_size=per_device_train_batch_size,
        # Batch size per GPU for evaluation
        per_device_eval_batch_size=4,

        # Number of update steps to accumulate the gradients for
        gradient_accumulation_steps=gradient_accumulation_steps,

        # Maximum gradient normal (gradient clipping)
        # max_grad_norm = 0.3,

        # Initial learning rate (AdamW optimizer)
        learning_rate=2e-4,

        # Weight decay to apply to all layers except bias/LayerNorm weights
        # weight_decay = 0.001,

        # Optimizer to use
        optim="paged_adamw_8bit",

        # Learning rate schedule (constant a bit better than cosine)
        lr_scheduler_type="constant",

        # Number of passes over the training data
        num_train_epochs=epochs,

        # Ratio of steps for a linear warmup (from 0 to learning rate)
        # NOTE(review): the plain "constant" schedule presumably ignores warmup;
        # "constant_with_warmup" would be needed for this to take effect (confirm).
        warmup_ratio=0.05,

        # Save checkpoint every X updates steps
        save_steps=report_interval,

        # Log every X updates steps
        logging_steps=report_interval,

        # Group sequences into batches with same length to save memory and speed up training
        # group_by_length = True,
    )

    trainer = Trainer(
        model=model,
        train_dataset=preprocessed_dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False  # re-enable for inference to speed up predictions for similar inputs

    # Report how many parameter elements are stored in each dtype, and their share
    dtypes = {}
    for _, p in model.named_parameters():
        dtypes[p.dtype] = dtypes.get(p.dtype, 0) + p.numel()
    total = sum(dtypes.values())
    for k, v in dtypes.items():
        print(k, v, v/total)

    # Launch training
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    del model
    del trainer
    torch.cuda.empty_cache()

In [126]:
# Directory under the user's home that receives the Trainer outputs and the final adapter.
output_dir = Path.home().joinpath("results", f"{model_name}-{DATASET}", "final_checkpoint")

In [127]:
from transformers import Trainer

In [None]:
# Launch fine-tuning on the training split for 5 epochs; gradient checkpointing is
# requested off through the flag for this run.
train(model_id=model_id, dataset=fpb_train_dataset,
      seed=SEED, output_dir=output_dir, gradient_checkpointing_enabled=False, epochs=5)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Found max lenth: 4096
Preprocessing dataset...


Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1811 [00:00<?, ? examples/s]

Layers for PEFT Adaptation: ['v_proj', 'o_proj', 'down_proj', 'k_proj', 'up_proj', 'q_proj', 'gate_proj']
trainable params: 19988480 || all params: 3520401408 || trainable%: 0.5677897967708119
torch.float32 282398720 0.08021776134910578
torch.uint8 3238002688 0.9197822386508943
Training...


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


In [107]:
# Directory that receives the merged model and its tokenizer.
output_merged_dir = Path.home().joinpath("results", f"{model_name}-{DATASET}", "final_merged_checkpoint")

PosixPath('/home/AD/gmatlin3/results/Llama-2-7b-chat-hf-financial_phrasebank')

In [None]:
# Reload the saved LoRA adapter from output_dir and fold the adapter weights into the model.
# compute_dtype (chosen in the CUDA section) replaces the hard-coded bfloat16 so the
# cell also works on GPUs without bf16 support.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=compute_dtype)
model = model.merge_and_unload()

model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer for easy inference
# model_id is the full Hub id ("meta-llama/..."); model_name lacks the organization
# prefix and cannot be resolved by from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_merged_dir)

In [None]:
# The constants are spelled ORGANIZATION / DATASET; the lowercase names were never defined.
repo_name = f"{ORGANIZATION}/{model_name}_{DATASET}_{CONFIG}_{SEED}"
print(repo_name)
# `trainer` only exists inside train(), so it cannot be used here. Push the merged
# model produced by the previous cell instead.
# NOTE(review): confirm that the merged model (rather than the bare adapter) is the
# artifact that should be published.
model.push_to_hub(repo_name, private=True, use_temp_dir=True)

## Evaluation

In [None]:
# output_dir already ends in "final_checkpoint" (it is where train() saved the adapter),
# so no further path component must be appended.
sft_model = AutoPeftModelForCausalLM.from_pretrained(output_dir,
                                             device_map=device_map, max_memory=CUDA_MAX_MEMORY,
                                             torch_dtype=compute_dtype)

In [None]:
# NOTE(review): convert_dataset is neither defined nor imported in any visible cell;
# confirm where it comes from. The evaluation loop below expects the result to have a
# 'prompt' column.
df_test = convert_dataset(fpb_test_dataset)

In [None]:
from transformers import GenerationConfig

In [None]:
# Decoding settings; generate() below reads them as globals when it builds its GenerationConfig.
temperature=0.0  # [0.0, 1.0]
do_sample=False  # NOTE(review): with sampling off, temperature/top_k/top_p presumably have no effect (confirm)
max_new_tokens=256
top_k=10
top_p=0.92
repetition_penalty=1.0  # 1.0 means no penalty
num_return_sequences=1  # Only generate one response
num_beams=1

In [None]:
def generate(model=None, tokenizer=None, input=None):
    """Generate a completion for one prompt and return the text after the last ``[/INST]``.

    The decoding settings (``temperature``, ``do_sample``, ``max_new_tokens``, ``top_k``,
    ``top_p``, ``repetition_penalty``, ``num_return_sequences``, ``num_beams``) are read
    from the notebook-level globals of the same names.

    Compared with the previous version the inputs are moved to ``model.device`` instead
    of the default CUDA device, and the tokenizer's attention mask (when it provides
    one) is forwarded to ``model.generate``.

    :param model: model exposing ``generate`` and ``device``
    :param tokenizer: tokenizer used for both encoding the prompt and decoding the output
    :param input: prompt string
    :return: decoded text following the last ``[/INST]`` marker, stripped of surrounding
        whitespace (special tokens are not removed by the decoder)
    """
    encoded = tokenizer(input, return_tensors="pt")
    target_device = model.device

    attention_mask = encoded.get("attention_mask")
    if attention_mask is not None:
        attention_mask = attention_mask.to(target_device)

    generation_output = model.generate(
        input_ids=encoded["input_ids"].to(target_device),
        attention_mask=attention_mask,
        generation_config=GenerationConfig(temperature=temperature,
                                           do_sample=do_sample,
                                           max_new_tokens=max_new_tokens,
                                           top_k=top_k,
                                           top_p=top_p,
                                           repetition_penalty=repetition_penalty,
                                           num_return_sequences=num_return_sequences,
                                           num_beams=num_beams,
                                           return_dict_in_generate=True,
                                           output_scores=False,
                                          ))
    # Only the first returned sequence is decoded
    output = tokenizer.decode(generation_output.sequences[0])
    return output.split('[/INST]')[-1].strip()

In [None]:
%%time
input_list = df_test['prompt'].to_list()
output_list = []
for i in range(len(input_list)):
    output_list.append(
        generate(model=sft_model, tokenizer=tokenizer, input=inputs_list[i]).replace('</s>','')
    )

In [None]:
# Display the generated answers.
output_list