# Supervised Fine-Tuning of Llama2 on Financial Phrasebank

!python -m ipykernel install --user --name=conference

## Imports

In [1]:
import os
import sys
from pathlib import Path
from tqdm.notebook import tqdm

# Parent of the notebook's working directory; it is put at the front of
# sys.path (once) so that modules living there can be imported.
SRC_DIRECTORY = Path.cwd().resolve().parent

_src_directory_str = str(SRC_DIRECTORY)
if _src_directory_str not in sys.path:
    sys.path.insert(0, _src_directory_str)

In [2]:
import logging

# Notebook-wide logger; everything from DEBUG upwards is emitted.
logger = logging.getLogger('llama2_finetune')
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger returns the same logger object on every call, so re-running
# this cell used to attach one more StreamHandler each time and every message
# was printed repeatedly. Reuse an already attached stream handler if there is one.
consoleHandler = next(
    (h for h in logger.handlers if isinstance(h, logging.StreamHandler)),
    None,
)
if consoleHandler is None:
    consoleHandler = logging.StreamHandler()
    logger.addHandler(consoleHandler)
consoleHandler.setLevel(logging.DEBUG)
consoleHandler.setFormatter(formatter)

In [3]:
import pprint
# Pretty-printer instance configured with a 4-space indent.
pp = pprint.PrettyPrinter(indent=4)

In [4]:
import torch
from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaModel, LlamaConfig, TextGenerationPipeline
from transformers import TrainingArguments

In [5]:
from transformers import BitsAndBytesConfig

In [7]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

In [8]:
import huggingface_hub

In [9]:
from trl import SFTTrainer

False

The following directories listed in your path were found to be non-existent: {PosixPath('pf=\\E[4i'), PosixPath('k3=\\EOR'), PosixPath('AB=\\E[4%dm'), PosixPath('sr=\\EM'), PosixPath('sc=\\E7'), PosixPath('k5=\\E[15~'), PosixPath('nd=\\E[C'), PosixPath('Co#8'), PosixPath('ks=\\E[?1h\\E='), PosixPath('AF=\\E[3%dm'), PosixPath('ct=\\E[3g'), PosixPath('xn'), PosixPath('kH=\\E[4~'), PosixPath('cs=\\E[%i%d;%dr'), PosixPath('DL=\\E[%dM'), PosixPath('cr=^M'), PosixPath('k;=\\E[21~'), PosixPath('me=\\E[m'), PosixPath('cd=\\E[J'), PosixPath('kd=\\EOB'), PosixPath('ue=\\E[24m'), PosixPath('te=\\E[?1049l'), PosixPath('ac=\\140\\140aaffggjjkkllmmnnooppqqrrssttuuvvwwxxyyzz{{||}}~~..--++,,hhII00'), PosixPath('ce=\\E[K'), PosixPath('ku=\\EOA'), PosixPath('kN=\\E[6~'), PosixPath('kI=\\E[2~'), PosixPath('k7=\\E[18~'), PosixPath('k6=\\E[17~'), PosixPath('F2=\\E[24~'), PosixPath('kh=\\E[1~'), PosixPath('DC=\\E[%dP'), PosixPath('dc=\\E[P'), PosixPath('k2=\\EOQ'), PosixPath('dl=\\E[M'), PosixPath('s


python -m bitsandbytes


  warn(msg)
  warn(msg)


RuntimeError: 
        CUDA Setup failed despite GPU being available. Please run the following command to get more information:

        python -m bitsandbytes

        Inspect the output of the command and see if you can locate CUDA libraries. You might need to add them
        to your LD_LIBRARY_PATH. If you suspect a bug, please take the information from python -m bitsandbytes
        and open an issue at: https://github.com/TimDettmers/bitsandbytes/issues

In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, AutoPeftModelForCausalLM

In [None]:
from peft import AutoPeftModelForCausalLM

## Configuration

In [None]:
from collections import namedtuple
# Lightweight immutable container for run-level settings; it currently carries
# only the task name.
Args = namedtuple("Args", ["task_name"])
# `args.task_name` is the key used to look up the task entry in TASK_MAP.
args = Args(task_name="sentiment_analysis")

# TODO: move more configs into my args?

In [None]:
import uuid
from datetime import datetime

# Run identifier: the first 8 characters of a random UUID, an underscore,
# then today's date formatted as YYMMDD.
date_str = datetime.now().strftime("%y%m%d")
uid = f"{str(uuid.uuid4())[:8]}_{date_str}"

print(uid)

### HuggingFace

In [None]:
# SECURITY: never store an access token in the notebook. The token that used to
# be hard-coded here must be treated as leaked and revoked on the Hub.
#
# How to get a token: sign in at https://huggingface.co, open
# Settings -> Access Tokens and create one (it needs write access for the
# push_to_hub cell further down). Then either export it as HF_TOKEN before
# starting Jupyter, or paste it at the hidden prompt below.
import getpass

hf_auth = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

huggingface_hub.login(token=hf_auth)

In [None]:
# Hub organization: prefix of the dataset repo ids and of the pushed model repo.
organization = "gtfintechlab"
# Dataset name; reused in the Hub repo ids, the output folder and the model repo name.
dataset = "financial_phrasebank"

### Folders

In [None]:
# Training output folder (~/<dataset>_results); the final checkpoint is saved beneath it.
output_dir = Path.home() / f"{dataset}_results"

### Weights and Biases

In [None]:
import wandb

# Weights & Biases settings, supplied through environment variables:
#   WANDB_PROJECT   - project the run is logged under
#   WANDB_LOG_MODEL - "false": model checkpoints are not saved to wandb
#   WANDB_WATCH     - "false": turn off watch to log faster
os.environ.update(
    {
        "WANDB_PROJECT": "llama2-fpb-sft",
        "WANDB_LOG_MODEL": "false",
        "WANDB_WATCH": "false",
    }
)

### CUDA

In [None]:
# Check GPU compatibility with bfloat16
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)
    compute_dtype = torch.bfloat16
else:
    print("=" * 80)
    print("Your GPU does not support bfloat16: using fp=16")
    print("=" * 80)
    compute_dtype = torch.float16

In [None]:
# Number of CUDA devices visible to this process.
CUDA_N_GPUS = torch.cuda.device_count()

# Per-device memory budget handed to the model loaders as `max_memory`.
# Each device is queried separately: the previous version measured only the
# current device and copied that figure to every GPU, which is wrong as soon as
# the GPUs differ in free memory. 2 GiB of head-room is kept per device and the
# value is clamped at 0 so a nearly full GPU never yields a negative budget.
CUDA_MAX_MEMORY = {
    i: f"{max(int(torch.cuda.mem_get_info(i)[0] / 1024 ** 3) - 2, 0)}GB"
    for i in range(CUDA_N_GPUS)
}
logger.info(
    f"Using k={CUDA_N_GPUS} CUDA GPUs with max memory {CUDA_MAX_MEMORY}"
)

In [None]:
# Passed as `device_map` to the model loaders below. The commented alternative
# {"": 0} would presumably pin the whole model to GPU 0 — TODO confirm before using.
device_map = "auto"  # alternative: {"": 0}

## Financial PhraseBank

In [None]:
# Seeds that appear in the Hub repo ids of the dataset splits; the visible cells
# only ever use SEEDS[0].
SEEDS = (5768, 78516, 944601)

### Create splits on HuggingFace Hub

### Load split dataset from HF Hub

In [None]:
# Dataset configuration name embedded in the Hub repo ids (presumably the
# Financial PhraseBank subset where all annotators agree — TODO confirm).
CONFIG = "allagree"

In [None]:
# Both parts live in the same Hub repo: <organization>/<dataset>-<CONFIG>-<seed>.
fpb_repo_id = f"{organization}/{dataset}-{CONFIG}-{SEEDS[0]}"

# NOTE(review): "train"/"test" are passed as the second positional argument of
# load_dataset and the 'train' split is then taken from each result — verify
# against how the Hub repo was laid out.
fpb_train_dataset = load_dataset(fpb_repo_id, "train")['train']
fpb_test_dataset = load_dataset(fpb_repo_id, "test")['train']

## Prompts

In [None]:
from llama.instructions import TASK_MAP, llama2_prompt_generator

In [None]:
def decode(label_number):
    """Translate a numeric class id into its sentiment name.

    Mapping: 0 -> "positive", 1 -> "negative", 2 -> "neutral".

    Raises:
        ValueError: if ``label_number`` equals none of those ids.
    """
    # Compared with == in the same order as the original if/elif chain.
    for class_id, sentiment in ((0, "positive"), (1, "negative"), (2, "neutral")):
        if label_number == class_id:
            return sentiment
    raise ValueError("Invalid label number")

# Look up the entry for the configured task and expose its two fields.
_task_entry = TASK_MAP[args.task_name]
TASK_INSTRUCTION, TASK_DATA = _task_entry["instruction"], _task_entry["data"]

def convert_dataset(ds):
    """Build a prompt/response table from a labelled dataset.

    Args:
        ds: object indexable by ``'text'`` and ``'label'``; the texts are passed to
            ``llama2_prompt_generator`` together with ``TASK_INSTRUCTION`` and each
            label is passed to ``decode``.

    Returns:
        pandas.DataFrame with a ``'prompt'`` column (the generated prompts) and a
        ``'response'`` column (the upper-cased sentiment names).

    Raises:
        ValueError: propagated from ``decode`` for an unknown label id.
    """
    # `pd` was referenced here without pandas being imported in any visible cell,
    # which raised NameError; import it locally so the function is self-sufficient.
    import pandas as pd

    prompts = llama2_prompt_generator(TASK_INSTRUCTION, ds['text'])
    responses = [decode(label).upper() for label in ds['label']]
    return pd.DataFrame.from_dict({'prompt': prompts, 'response': responses})

## Model

In [None]:
# Hub id of the base checkpoint that gets fine-tuned.
model_id = "meta-llama/Llama-2-7b-chat-hf"
# Repo name without the owner prefix, i.e. the text after the last '/'.
model_name = model_id.rpartition('/')[2]

In [None]:
# Quantization settings used when loading the base model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load the base model in 4-bit precision
    bnb_4bit_quant_type="nf4",              # quantization type (fp4 or nf4)
    bnb_4bit_use_double_quant=True,         # nested (double) quantization
    bnb_4bit_compute_dtype=compute_dtype,   # compute dtype chosen in the CUDA section
)

In [None]:
# Tokenizer that belongs to the base checkpoint.
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token for padding (presumably because the
# checkpoint defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Start from the configuration published with `model_id` and override only what
# this notebook needs.
#
# The earlier version also forced hidden_size=8192, intermediate_size=28672,
# num_attention_heads=64, num_hidden_layers=80 and num_key_value_heads=8. Those
# are the dimensions of the 70B model, not of the 7B checkpoint named by
# `model_id`, so the resulting config disagreed with the checkpoint's weight
# shapes. The remaining overrides merely restated the checkpoint's own values.
model_config = LlamaConfig.from_pretrained(
    model_id,
    # The KV cache is disabled for training (gradient checkpointing is enabled
    # in the training arguments). TODO: double check use cache
    use_cache=False,
)

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Load the quantized base model.
# 4-bit loading is requested through `bnb_config` (load_in_4bit=True). Passing
# the `load_in_4bit` keyword as well as `quantization_config` is rejected by
# transformers, so only the config object is supplied.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)

In [None]:
import gc

# A model loaded by an earlier cell (or an earlier run of this one) stays alive
# until `base_model` is rebound, i.e. while the new copy is being loaded. Drop
# that reference first so its GPU memory can be reclaimed before loading again.
base_model = None
gc.collect()
torch.cuda.empty_cache()

# Load the quantized base model with the explicit `model_config`.
base_model = LlamaForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    # Use the shared setting rather than a second literal "auto".
    device_map=device_map,
    max_memory=CUDA_MAX_MEMORY,
)

In [None]:
# The loaded model is bound to `base_model`; the name `model` used here before
# is never defined in the notebook and raised NameError. Operating on
# `base_model` also means the prepared model is the one the trainer receives.
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, totals the element count of every
    parameter and of those with ``requires_grad`` set, and prints both totals
    together with the trainable share in percent.
    """
    counts = [
        (param.numel(), param.requires_grad)
        for _, param in model.named_parameters()
    ]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, trainable in counts if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
# LoRA adapter settings handed to the trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,               # LoRA attention dimension
    lora_alpha=16,      # alpha parameter for LoRA scaling
    lora_dropout=0.1,   # dropout probability for LoRA layers
    bias="none",
)

In [None]:
training_args = TrainingArguments(
    # --- bookkeeping --------------------------------------------------------
    output_dir=output_dir,
    report_to="wandb",
    logging_steps=25,                  # log every 25 update steps
    save_steps=25,                     # save a checkpoint every 25 update steps
    # --- batching and memory ------------------------------------------------
    per_device_train_batch_size=4,     # per-GPU batch size for training
    per_device_eval_batch_size=4,      # per-GPU batch size for evaluation
    gradient_accumulation_steps=4,     # update steps to accumulate gradients for
    gradient_checkpointing=True,
    group_by_length=True,              # batch sequences of similar length together
    # --- optimisation -------------------------------------------------------
    optim="paged_adamw_32bit",
    learning_rate=2e-4,                # initial learning rate
    weight_decay=0.001,                # applied to all layers except bias/LayerNorm weights
    max_grad_norm=0.3,                 # maximum gradient norm (gradient clipping)
    lr_scheduler_type="constant",
    # NOTE(review): a warmup ratio combined with a "constant" schedule — confirm
    # the warmup actually takes effect with this scheduler type.
    warmup_ratio=0.03,
    max_steps=-1,                      # number of training steps (overrides num_train_epochs)
)

In [None]:
def _to_sft_text(batch):
    """Rewrite a batch so that 'text' holds '<prompt> <LABEL>' for every example.

    The prompt comes from ``llama2_prompt_generator`` (called with
    ``TASK_INSTRUCTION`` and the batch's texts) and the label is the upper-cased
    name returned by ``decode`` — the same pieces the evaluation table uses.
    """
    prompts = llama2_prompt_generator(TASK_INSTRUCTION, batch["text"])
    return {
        "text": [
            f"{prompt} {decode(label).upper()}"
            for prompt, label in zip(prompts, batch["label"])
        ]
    }


# The dataset's own 'text' column is just the sentence to classify. Training on
# it directly (as before) never shows the model the instruction prompt or the
# expected label, so nothing supervised is learned. Build the supervised text first.
# NOTE(review): assumes llama2_prompt_generator returns one prompt per input text,
# in order — confirm against llama.instructions.
sft_train_dataset = fpb_train_dataset.map(_to_sft_text, batched=True)

trainer = SFTTrainer(
    model=base_model,
    train_dataset=sft_train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    # Maximum sequence length to use
    max_seq_length=None,
    # Pack multiple short examples in the same input sequence to increase efficiency
    packing=False,
    tokenizer=tokenizer,
    args=training_args,
)

In [None]:
# Run the fine-tuning loop with the trainer configured above.
trainer.train()

In [None]:
# Write the trained model and then its config into <output_dir>/final_checkpoint.
final_checkpoint_dir = output_dir / "final_checkpoint"
for artefact in (trainer.model, trainer.model.config):
    artefact.save_pretrained(final_checkpoint_dir)

In [None]:
# Hub repo id of the fine-tuned model:
# <organization>/<base model name>_<dataset>_<config>_<seed>.
repo_name = f"{organization}/{model_name}_{dataset}_{CONFIG}_{SEEDS[0]}"
print(repo_name)
# Push the trained model to the Hub as a private repository.
trainer.model.push_to_hub(repo_name, private=True, use_temp_dir=True)

## Evaluation

In [None]:
# Reload the fine-tuned model from the checkpoint saved after training.
sft_checkpoint_dir = output_dir / "final_checkpoint"
sft_model = AutoPeftModelForCausalLM.from_pretrained(
    sft_checkpoint_dir,
    device_map=device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)

In [None]:
# Prompt/response table for the test split ('prompt' and 'response' columns).
df_test = convert_dataset(fpb_test_dataset)

In [None]:
from transformers import GenerationConfig

In [None]:
# Generation settings read as globals by generate() below.
# NOTE(review): with do_sample=False the sampling knobs (temperature, top_k,
# top_p) presumably have no effect — confirm for the installed transformers version.
temperature=0.0  # [0.0, 1.0]; 0.0 means greedy sampling
do_sample=False
max_new_tokens=256
top_k=10
top_p=0.92
repetition_penalty=1.0  # 1.0 means no penalty
num_return_sequences=1  # Only generate one response
num_beams=1

In [None]:
def generate(model=None, tokenizer=None, input=None):
    """Generate a completion for one prompt and return the text after '[/INST]'.

    Args:
        model: model exposing ``generate`` and ``device``.
        tokenizer: tokenizer used both to encode the prompt and to decode the output.
        input: the prompt text. (The name shadows the builtin but is kept because
            callers pass it by keyword.)

    Returns:
        str: the decoded first sequence, cut at the last '[/INST]' marker and
        stripped of surrounding whitespace. Special tokens such as '</s>' are not
        removed here.

    The generation settings (temperature, do_sample, max_new_tokens, top_k, top_p,
    repetition_penalty, num_return_sequences, num_beams) are read from the
    module-level variables of the same names.
    """
    encoded = tokenizer(input, return_tensors="pt")
    # Follow the model's own device instead of assuming the default CUDA device,
    # and pass the attention mask explicitly rather than letting generate() infer
    # it (pad and EOS are the same token in this notebook).
    input_ids = encoded["input_ids"].to(model.device)
    attention_mask = encoded["attention_mask"].to(model.device)
    generation_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=GenerationConfig(
            temperature=temperature,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            return_dict_in_generate=True,
            output_scores=False,
        ),
    )
    # The decoded sequence still contains the prompt; keep only what follows it.
    decoded = tokenizer.decode(generation_output.sequences[0])
    return decoded.split('[/INST]')[-1].strip()

In [None]:
%%time
input_list = df_test['prompt'].to_list()
output_list = []
for i in range(len(input_list)):
    output_list.append(
        generate(model=sft_model, tokenizer=tokenizer, input=inputs_list[i]).replace('</s>','')
    )

In [None]:
# Display the generated responses for manual inspection.
output_list