# Supervised Fine-Tuning of Llama2 on FOMC

## Not for Public

In [1]:
import huggingface_hub

HF_AUTH = "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"
huggingface_hub.login(token=HF_AUTH)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/AD/gmatlin3/.cache/huggingface/token
Login successful


### Weights and Biases

In [2]:
# import os
# import wandb

# # set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"] = f"llama2_sft_fomc"
# # save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"] = "false"
# # turn off watch to log faster
# os.environ["WANDB_WATCH"] = "false"
# os.environ["WANDB_API_KEY"] = "fa69ffc6a97578da0410b553042cbb8b3bf5fcaf"
# os.environ["WANDB_NOTEBOOK_NAME"] = f"llama2_sft"

# wandb.login()

## Setup

In [2]:
import os

# os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Imports

In [21]:
import os
import sys
from pathlib import Path

from tqdm.notebook import tqdm
from transformers import GenerationConfig

SRC_DIRECTORY = Path().cwd().resolve().parent

if str(SRC_DIRECTORY) not in sys.path:
    sys.path.insert(0, str(SRC_DIRECTORY))

import logging

logger = logging.getLogger("llama2_finetune")
logger.setLevel(logging.DEBUG)
consoleHandler = logging.StreamHandler()
consoleHandler.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
consoleHandler.setFormatter(formatter)
logger.addHandler(consoleHandler)

import pprint

pp = pprint.PrettyPrinter(indent=4)

import uuid
import warnings
from collections import namedtuple
from datetime import datetime
from functools import partial

import bitsandbytes as bnb
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict, load_dataset
from peft import (
    AutoPeftModelForCausalLM,
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    LlamaConfig,
    LlamaForCausalLM,
    LlamaModel,
    LlamaTokenizer,
    TextGenerationPipeline,
    Trainer,
    TrainingArguments,
    logging,
    pipeline,
)

from llama.instructions import (
    B_INST,
    B_SYS,
    E_INST,
    E_SYS,
    TASK_MAP,
    llama2_prompt_generator,
)

# from trl import SFTTrainer


BOS, EOS = "<s>", "</s>"

from peft import PeftModel

### Functions

In [22]:
def generate_uid(id_length=8, dt_format="%y%m%d"):
    date_str = datetime.now().strftime(dt_format)

    # Generate a short UUID
    uid = str(uuid.uuid4())[:id_length]

    # Combine
    uid = f"{uid}_{date_str}"

    return uid


def find_all_linear_names(model):
    # SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    cls = (
        bnb.nn.Linear4bit
    )  # if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def print_dtypes(model):
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes:
            dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items():
        total += v
    for k, v in dtypes.items():
        print(k, v, v / total)


def create_peft_config(modules):
    peft_config = LoraConfig(
        # Pass our list as an argument to the PEFT config for your model
        target_modules=modules,
        # Dimension of the LoRA matrices we update in adapaters
        r=lora_r,
        # Alpha parameter for LoRA scaling
        lora_alpha=lora_alpha,
        # Dropout probability for LoRA layers
        lora_dropout=lora_dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    return peft_config


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


# def convert_dataset(ds):
#     prompts = llama2_prompt_generator(TASK_INSTRUCTION, ds['text'])
#     labels = [decode(L).upper() for L in ds['label']]
#     df = pd.DataFrame.from_dict({'prompt': prompts, 'response': labels})
#     return df


def create_prompt_formats(
    sample, context_field="sentence", response_field="label_decoded"
):
    SYS_PROMPT = f""""Discard all the previous instructions.
    Below is an instruction that describes a task.
    Write a response that appropriately completes the request."""

    INST_PROMPT = TASK_INSTRUCTION

    if not INST_PROMPT or not isinstance(INST_PROMPT, str):
        raise ValueError("Instruction must be a non-empty string.")
    if not sample or not all(
        isinstance(sample[field], str) for field in [context_field, response_field]
    ):
        raise ValueError("Fields must be a non-empty strings.")

    prompt = (
        B_INST
        + B_SYS
        + SYS_PROMPT
        + E_SYS
        + INST_PROMPT
        + sample[context_field]
        + E_INST
    )
    sample["text"] = prompt
    return sample


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


def preprocess_dataset(
    tokenizer: AutoTokenizer, max_length: int, seed: int, dataset: Dataset
):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)  # , batched=True)

    # Apply preprocessing to each batch of the dataset & and remove 'instruction', 'context', 'response', 'category' fields
    _preprocessing_function = partial(
        preprocess_batch, max_length=max_length, tokenizer=tokenizer
    )
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        # remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

### Configuration

In [27]:
################################################################################
# CUDA Parameters
################################################################################
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)
    compute_dtype = torch.bfloat16
else:
    print("=" * 80)
    print("Your GPU does not support bfloat16: using fp=16")
    print("=" * 80)
    compute_dtype = torch.float16

CUDA_N_GPUS = torch.cuda.device_count()
CUDA_MAX_MEMORY = f"{int(torch.cuda.mem_get_info()[0] / 1024 ** 3) - 2}GB"
CUDA_MAX_MEMORY = {i: CUDA_MAX_MEMORY for i in range(CUDA_N_GPUS)}
logger.info(f"Using k={CUDA_N_GPUS} CUDA GPUs with max memory {CUDA_MAX_MEMORY}")


################################################################################
# User parameters
################################################################################
organization = "gtfintechlab"

task_name = "fomc_communication"

seeds = (5768, 78516, 944601)

seed = seeds[0]

model_parameters = "7b"
model_id = f"meta-llama/Llama-2-{model_parameters}-chat-hf"
model_name = model_id.split("/")[-1]

repo_name = f"{organization}/{model_name}_{task_name}"

task_instruction, task_data = (
    TASK_MAP[task_name]["instruction"],
    TASK_MAP[task_name]["data"],
)

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 8

# Dropout probability for LoRA layers
lora_dropout = 0.05

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
# device_map = {"": 0}

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = compute_dtype

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = True

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = Path(f"/fintech_storage/results/{MODEL_NAME}_{TASK}")

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit" #"paged_adamw_8bit"

# Learning rate schedule
lr_scheduler_type = "cosine" #"constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

2023-10-17 19:33:45,767 - llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}
2023-10-17 19:33:45,767 - llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}
2023-10-17 19:33:45,767 - llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}
2023-10-17 19:33:45,767 - llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}


Your GPU supports bfloat16: accelerate training with bf16=True


In [28]:
Args = namedtuple(
    "Args",
    [
        "task_name",
        "seed",
        "model_id",
        "model_name",
        "organization",
        "uid",
        "lora_r",
        "lora_alpha",
        "lora_dropout",
        "max_seq_length",
        "packing",
        "device_map",
        "load_in_4bit",
        "bnb_4bit_compute_dtype",
        "bnb_4bit_use_double_quant",
        "bnb_4bit_quant_type",
        "output_dir",
        "num_train_epochs",
        "fp16",
        "bf16",
        "per_device_train_batch_size",
        "per_device_eval_batch_size",
        "gradient_accumulation_steps",
        "gradient_checkpointing",
        "max_grad_norm",
        "learning_rate",
        "weight_decay",
        "optim",
        "lr_scheduler_type",
        "max_steps",
        "warmup_ratio",
        "group_by_length",
        "save_steps",
        "logging_steps"
    ],
)

args = Args(
    task_name=task_name,
    seed=seed,
    model_id=model_id,
    model_name=model_id.split("/")[-1],
    organization=organization,
    uid=generate_uid(),
    lora_r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    max_seq_length=max_seq_length,
    packing=packing,
    device_map=device_map,
    load_in_4bit=load_in_4bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    fp16=fp16,
    bf16=bf16,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    max_grad_norm=max_grad_norm,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    optim=optim,
    lr_scheduler_type = lr_scheduler_type,
    max_steps=max_steps,
    warmup_ratio= warmup_ratio,
    group_by_length=group_by_length,
    save_steps=save_steps,
    logging_steps=logging_steps,
    
)

## Supervised Fine-Tuning

In [29]:
def train(args):  # , gradient_checkpointing_enabled=False):
    bnb_config = BitsAndBytesConfig(
        # Activate 4-bit precision base model loading
        load_in_4bit=args.load_in_4bit,
        # Activate nested quantization for 4-bit base models (double quantization)
        bnb_4bit_use_double_quant=args.bnb_4bit_use_double_quant,
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype=args.bnb_4bit_compute_dtype,
    )

    # Tokenizer configured to fix overflow issues with fp16 training
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, pad_token=EOS)

    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        device_map=args.device_map,
        # max_memory=CUDA_MAX_MEMORY,
        torch_dtype=args.bnb_4bit_compute_dtype,
        quantization_config=bnb_config,
    )
    if args.max_length:
        max_length = args.max_length
    else:
        max_length = get_max_length(model)

    train_dataset = load_dataset(
        f"{args.organization}/{args.task_name}", str(args.seed)
    )["train"]
    preprocessed_train_dataset = preprocess_dataset(
        tokenizer=tokenizer,
        max_length=max_length,
        seed=args.seed,
        dataset=train_dataset,
    )
    # test_dataset = load_dataset(f"{args.organization}/{args.task_name}", str(args.seed))[
    #     "test"max_grad_norm
    # ]
    # preprocessed_test_dataset = preprocess_dataset(
    # tokenizer=tokenizer, max_length=max_length, seed=args.seed, dataset=test_dataset
    # )

    # Using the prepare_model_for_kbit_training method from PEFT
    model = prepare_model_for_kbit_training(model)
    # Get lora module names
    layers_for_adapters = find_all_linear_names(model)
    print(f"Layers for PEFT Adaptation: {layers_for_adapters}")
    # Create PEFT config for these modules and wrap the model to PEFT
    peft_config = create_peft_config(layers_for_adapters)
    model = get_peft_model(model, peft_config)
    # Print information about the percentage of trainable parameters
    print_trainable_parameters(model)

    output_dir = args.output_dir / "final_checkpoint"

    training_arguments = TrainingArguments(
        output_dir=args.output_dir,
        bf16=args.bf16,
        # report_to="wandb",
        # Batch size per GPU for training
        per_device_train_batch_size=args.per_device_train_batch_size,
        # Batch size per GPU for evaluation
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        # Number of update steps to accumulate the gradients for
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        # Maximum gradient normal (gradient clipping)
        max_grad_norm = args.max_grad_norm,
        # Initial learning rate (AdamW optimizer)
        learning_rate=args.learning_Rate,
        # Weight decay to apply to all layers except bias/LayerNorm weights
        weight_decay = args.weight_decay,
        # Optimizer to use
        optim=args.optim,
        # Learning rate schedule (constant a bit better than cosine)
        lr_scheduler_type=args.lr_scheduler_type,
        # Number of training steps (overrides num_train_epochs)
        max_steps=args.max_steps,
        # Ratio of steps for a linear warmup (from 0 to learning rate)
        warmup_ratio=args.warmup_ratio,
        # Save checkpoint every X updates steps
        save_steps=args.save_steps,
        # Log every X updates steps
        logging_steps=args.logging_steps,
        # Group sequences into batches with same length to save memory and speed up training
        group_by_length = args.group_by_length,
    )

    trainer = Trainer(
        model=model,
        train_dataset=preprocessed_train_dataset,
        args=training_arguments,
        # data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # trainer = SFTTrainer(
    #     model=model,
    #     args=training_arguments,
    #     max_seq_length=max_length,
    #     train_dataset=preprocessed_train_dataset,
    #     eval_dataset=preprocessed_test_dataset,
    #     peft_config=peft_config,
    #     dataset_text_field="text",
    #     tokenizer=tokenizer,
    # )

    # # Re-enable for inference to speed up predictions for similar inputs
    model.config.use_cache = False
    print("Training...")
    # Suppress specific warnings from torch.utils.checkpoint
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=UserWarning, module="torch.utils.checkpoint"
        )
        train_result = trainer.train()
        metrics = trainer.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print_dtypes(trainer.model)
        print(metrics)
        # Saving model
        print("Saving last checkpoint of the model...")
        os.makedirs(output_dir, exist_ok=True)
        trainer.model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    # Empty VRAM
    del model
    del pipe
    del trainer
    torch.cuda.empty_cache()
    import gc

    gc.collect()
    gc.collect()

In [30]:
train(args)

ValueError: `torch_dtype` can be either `torch.dtype` or `"auto"`, but received bfloat16

---

### Save Results

In [None]:
# reload final model checkpoint and save
new_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir / "final_checkpoint",
    device_map=args.device_map,
    torch_dtype=compute_dtype,
)

In [None]:
print_dtypes(model)

In [None]:
# load base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)

In [None]:
print_dtypes(base_model)

In [None]:
# This method merges the LoRa layers into the base model. This is needed to use it as a standalone model.
peft_model = PeftModel.from_pretrained(base_model, args.output_dir / "final_checkpoint")
peft_model = peft_model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(
    args.output_dir / "final_checkpoint", pad_token=EOS
)

In [None]:
print_dtypes(peft_model)

In [None]:
# save inference
merged_checkpoint_dir = args.output_dir / "final_merged_checkpoint"
model.save_pretrained(merged_checkpoint_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_checkpoint_dir)

In [None]:
# push to hub
peft_model.push_to_hub(repo_name, private=True, use_temp_dir=True)
tokenizer.push_to_hub(repo_name, private=True, use_temp_dir=True)

## Evaluation

In [None]:
# TODO: move to configs or args
temperature = 0.0  # [0.0, 1.0]; 0.0 means greedy sampling
do_sample = False
max_new_tokens = 256
top_k = 10
top_p = 0.92
repetition_penalty = 1.0  # 1.0 means no penalty
num_return_sequences = 1  # Only generate one response
num_beams = 1


def generate(model=None, tokenizer=None, dataset=None):
    input_ids = tokenizer(dataset["text"])

    # Ensure that input_ids is a PyTorch tensor
    # input_ids = torch.tensor(input_ids).long()

    # Move the tensor to the GPU
    input_ids = input_ids.cuda()

    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=GenerationConfig(
            temperature=temperature,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            return_dict_in_generate=True,
            output_scores=False,
        ),
    )
    seq = generation_output.sequences
    output = tokenizer.decode(seq[0])
    return output.split("[/INST]")[-1].strip()

In [None]:
test_dataset = load_dataset(f"{args.organization}/{args.task_name}", str(args.seed))[
    "test"
]

### Baseline

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = EOS
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

max_length = get_max_length(model)

preprocessed_test_dataset = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=args.seed, dataset=test_dataset
)

In [None]:
# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation", model=model, tokenizer=tokenizer, max_length=max_length
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])

In [None]:
# N_GENS = preprocessed_test_dataset.num_rows
N_GENS = 10

output_list = []
for i in range(N_GENS):
    output_list.append(
        generate(model=model, tokenizer=tokenizer, dataset=preprocessed_test_dataset)
    )

### Supervised Fine-Tuning

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    merged_checkpoint_dir,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(merged_checkpoint_dir)

max_length = get_max_length(model)

preprocessed_dataset = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=args.seed, dataset=dataset
)