# Supervised Fine-Tuning of Llama2 on FOMC

## Not for Public

In [1]:
import os
from getpass import getpass

import huggingface_hub

# SECURITY: an access token used to be hard-coded in this cell. Anything that was
# ever committed must be treated as leaked: revoke it in the Hugging Face account
# settings and issue a new one.
# The token is now read from the HF_TOKEN environment variable; when it is not
# set, the user is prompted interactively so the secret never lands in the file.
HF_AUTH = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
huggingface_hub.login(token=HF_AUTH)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/AD/gmatlin3/.cache/huggingface/token
Login successful


### Weights and Biases

In [2]:
import os
# Set before any tokenizer is created; commonly done to silence the Hugging Face
# tokenizers fork/parallelism warning in notebooks.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import wandb

# Optional Weights & Biases configuration, currently disabled.
# NOTE(review): train() still passes report_to="wandb", so W&B logging is attempted
# even though wandb.login() below is commented out.
# # set the wandb project where this run will be logged
# os.environ["WANDB_PROJECT"] = f"llama2_sft_fomc"
# # save your trained model checkpoint to wandb
# os.environ["WANDB_LOG_MODEL"] = "false"
# # turn off watch to log faster
# os.environ["WANDB_WATCH"] = "false"
# SECURITY: an API key used to be written here in plain text. Revoke that key and
# export WANDB_API_KEY in the shell instead of storing it in the notebook.
# os.environ["WANDB_API_KEY"] = "<redacted>"
# os.environ["WANDB_NOTEBOOK_NAME"] = f"llama2_sft"

# wandb.login()

## Setup

### Imports

In [3]:
import os
import sys
from pathlib import Path

from tqdm.notebook import tqdm
from transformers import GenerationConfig

# Parent of the current working directory; it is put at the front of sys.path so
# that project-local packages (e.g. `llama.instructions`) can be imported.
SRC_DIRECTORY = Path.cwd().resolve().parent

if not any(entry == str(SRC_DIRECTORY) for entry in sys.path):
    sys.path[:0] = [str(SRC_DIRECTORY)]

In [4]:
import logging
from transformers import logging as hf_logging

hf_logging.set_verbosity(hf_logging.DEBUG)

logger = logging.getLogger("llama2_finetune")
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this
# cell used to stack another pair of handlers and every message was emitted
# multiple times. Drop (and close) whatever is already attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create handlers: one for the notebook/console stream, one for a log file in
# the current working directory.
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler('llama2_finetune.log')
c_handler.setLevel(logging.DEBUG)
f_handler.setLevel(logging.DEBUG)


# Create formatters and add them to the handlers.
# (The layout string is no longer stored in a variable called `format`, which
# shadowed the builtin of the same name for the rest of the notebook.)
LOG_FORMAT = '%(name)s - %(levelname)s - %(message)s'
c_format = logging.Formatter(LOG_FORMAT)
f_format = logging.Formatter(LOG_FORMAT)
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)


# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

In [5]:
import pprint

pp = pprint.PrettyPrinter(indent=4)

import uuid
import warnings
from collections import namedtuple
from datetime import datetime
from functools import partial

import bitsandbytes as bnb
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict, load_dataset
from peft import (
    AutoPeftModelForCausalLM,
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    LlamaConfig,
    LlamaForCausalLM,
    LlamaModel,
    LlamaTokenizer,
    TextGenerationPipeline,
    Trainer,
    TrainingArguments,
    logging,
    pipeline,
)

from llama.instructions import (
    B_INST,
    B_SYS,
    E_INST,
    E_SYS,
    TASK_MAP,
    llama2_prompt_generator,
)

from trl import SFTTrainer

# TODO: Import EOS from elsewhere
EOS = "</s>"

from peft import PeftModel

Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version 10.0.1
Detected PIL version

### Functions

In [6]:
def generate_uid(id_length=8, dt_format="%y%m%d"):
    """Build a short run identifier of the form ``<uuid-prefix>_<date>``.

    :param id_length: number of leading characters taken from the string form
        of a random UUID4.
    :param dt_format: ``strftime`` pattern applied to the current local time.
    :return: the UUID prefix and the formatted date joined by an underscore.
    """
    stamp = datetime.now().strftime(dt_format)
    token = str(uuid.uuid4())[:id_length]
    return "_".join((token, stamp))


def find_all_linear_names(model):
    """Collect the leaf names of every ``bnb.nn.Linear4bit`` module in ``model``.

    The last component of each dotted module name is kept (e.g. ``q_proj``);
    ``lm_head`` is excluded. The result is used as LoRA ``target_modules``.

    :param model: object exposing ``named_modules()``.
    :return: list of unique leaf names, in no particular order.
    """
    # SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Only 4-bit linear layers are matched; the upstream script also handled
    # 8-bit and full-precision layers depending on its `bits` argument.
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, target_cls)
    }
    leaf_names.discard("lm_head")  # needed for 16-bit
    return list(leaf_names)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    The printed line has the form
    ``trainable params: T || all params: A || trainable%: P``.

    :param model: object exposing ``named_parameters()``.
    :return: None (output goes to stdout).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters used to raise ZeroDivisionError here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


def print_dtypes(model):
    """Print, per parameter dtype, the element count and its share of the total.

    One line is printed per dtype in first-seen order: ``<dtype> <count> <fraction>``.

    :param model: object exposing ``named_parameters()``.
    :return: None (output goes to stdout).
    """
    counts = {}
    for _, param in model.named_parameters():
        counts[param.dtype] = counts.get(param.dtype, 0) + param.numel()

    grand_total = sum(counts.values())
    for dtype, n_elements in counts.items():
        print(dtype, n_elements, n_elements / grand_total)


def create_peft_config(args, modules):
    """Build the LoRA configuration for a causal-LM fine-tune.

    :param args: object providing ``lora_r``, ``lora_alpha`` and ``lora_dropout``.
    :param modules: names of the modules that receive LoRA adapters.
    :return: a ``LoraConfig`` with ``bias="none"`` and ``TaskType.CAUSAL_LM``.
    """
    return LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        bias="none",
        # Dropout probability for LoRA layers
        lora_dropout=args.lora_dropout,
        # Alpha parameter for LoRA scaling
        lora_alpha=args.lora_alpha,
        # Dimension of the LoRA matrices updated in the adapters
        r=args.lora_r,
        # Modules to adapt, as selected by the caller
        target_modules=modules,
    )


# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the model's maximum sequence length, falling back to 1024.

    ``model.config`` is probed for ``n_positions``, ``max_position_embeddings``
    and ``seq_length`` in that order; the first truthy value wins. The chosen
    value is also printed.

    :param model: object with a ``config`` attribute.
    :return: the detected length, or 1024 when none of the attributes is set.
    """
    config = model.config
    candidates = (
        getattr(config, attribute, None)
        for attribute in ("n_positions", "max_position_embeddings", "seq_length")
    )
    max_length = next((value for value in candidates if value), None)
    if max_length:
        print(f"Found max lenth: {max_length}")
        return max_length

    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length


# def convert_dataset(ds):
#     prompts = llama2_prompt_generator(TASK_INSTRUCTION, ds['text'])
#     labels = [decode(L).upper() for L in ds['label']]
#     df = pd.DataFrame.from_dict({'prompt': prompts, 'response': labels})
#     return df


def create_prompt_formats(
    sample, context_field="sentence", response_field="label_decoded"
):
    """Add a ``"text"`` prompt to ``sample`` and return the same mapping.

    The prompt is the concatenation
    ``B_INST + B_SYS + <system prompt> + E_SYS + <task instruction> + <context> + E_INST``
    using the markers imported from ``llama.instructions``.

    Hidden dependency: the task instruction is read from the notebook-level
    ``args.task_instruction``, so ``args`` must exist before this is called.

    NOTE(review): ``response_field`` is only type-checked; its value is never
    appended to ``"text"``. Datasets built with this function therefore contain
    prompts without the target label - confirm that this is intended for the
    training split.

    :param sample: mapping holding at least ``context_field`` and ``response_field``.
    :param context_field: key of the input text placed inside the prompt.
    :param response_field: key of the label; validated but not used in the prompt.
    :return: ``sample`` with the added ``"text"`` entry.
    :raises ValueError: if the instruction is empty / not a string, or if either
        field is not a string.
    """
    # Bug fix: the literal used to open with four quotes (f""""...), which put a
    # stray '"' in front of "Discard" in every prompt. The continuation lines
    # keep their original leading spaces so the rest of the prompt is unchanged.
    SYS_PROMPT = """Discard all the previous instructions.
    Below is an instruction that describes a task.
    Write a response that appropriately completes the request."""

    INST_PROMPT = args.task_instruction

    if not INST_PROMPT or not isinstance(INST_PROMPT, str):
        raise ValueError("Instruction must be a non-empty string.")
    if not sample or not all(
        isinstance(sample[field], str) for field in [context_field, response_field]
    ):
        raise ValueError("Fields must be a non-empty strings.")

    prompt = (
        B_INST
        + B_SYS
        + SYS_PROMPT
        + E_SYS
        + INST_PROMPT
        + sample[context_field]
        + E_INST
    )
    sample["text"] = prompt
    return sample


def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the ``"text"`` entries of a batch.

    :param batch: mapping whose ``"text"`` value is handed to the tokenizer.
    :param tokenizer: callable accepting the texts plus ``max_length`` and
        ``truncation`` keyword arguments.
    :param max_length: token limit; truncation is always enabled.
    :return: whatever the tokenizer returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=max_length)
    return encoded


def preprocess_dataset(
    tokenizer: AutoTokenizer, max_length: int, seed: int, dataset: Dataset
):
    """Add prompts, tokenize, length-filter and shuffle a dataset.

    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed (int): Seed passed to ``Dataset.shuffle``
    :param dataset (Dataset): Dataset whose rows are accepted by
        ``create_prompt_formats``
    :return: the processed dataset; no columns are removed along the way.
    """
    print("Preprocessing dataset...")

    # Step 1: one prompt per row, stored under "text".
    with_prompts = dataset.map(create_prompt_formats)

    # Step 2: batched tokenization of the "text" column.
    tokenize = partial(preprocess_batch, tokenizer=tokenizer, max_length=max_length)
    tokenized = with_prompts.map(tokenize, batched=True)

    # Step 3: keep only rows whose token count is strictly below max_length.
    within_limit = tokenized.filter(lambda row: max_length > len(row["input_ids"]))

    # Step 4: deterministic shuffle.
    return within_limit.shuffle(seed=seed)

### Configuration

In [7]:
################################################################################
# User parameters
################################################################################
# Hugging Face namespace; used both for the dataset id
# (f"{organization}/{task_name}") and for the hub repository name below.
organization = "gtfintechlab"

# Key into TASK_MAP and name of the dataset under `organization`.
task_name = "fomc_communication"

# Each seed is passed (as a string) to load_dataset as the dataset configuration
# name and is also used to shuffle the preprocessed data.
seeds = (5768, 78516, 944601)

# Only the first seed is used by this notebook run.
seed = seeds[0]

model_parameters = "7b"
model_id = f"meta-llama/Llama-2-{model_parameters}-chat-hf"
# Last path component of the model id, e.g. "Llama-2-7b-chat-hf".
model_name = model_id.split("/")[-1]

# Target repository for push_to_hub.
repo_name = f"{organization}/{model_name}_{task_name}"

# `task_instruction` is inserted into every prompt by create_prompt_formats;
# `task_data` is not referenced anywhere else in this notebook.
task_instruction, task_data = (
    TASK_MAP[task_name]["instruction"],
    TASK_MAP[task_name]["data"],
)

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 16

# Alpha parameter for LoRA scaling
lora_alpha = 32

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
# NOTE(review): train() ignores this value and derives the length from the
# model config via get_max_length(model).
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

################################################################################
# CUDA Parameters
################################################################################

# Everything below queries the CUDA runtime; fail early with a clear message
# instead of an opaque error from the first torch.cuda call.
if not torch.cuda.is_available():
    raise RuntimeError("A CUDA-capable GPU is required to run this notebook.")

# Enable fp16/bf16 training

# Devices with compute capability major version >= 8 are treated as
# bfloat16-capable; older devices fall back to float16.
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)
    compute_dtype = torch.bfloat16
    fp16, bf16 = False, True
else:
    print("=" * 80)
    print("Your GPU does not support bfloat16: using fp=16")
    print("=" * 80)
    compute_dtype = torch.float16
    fp16, bf16 = True, False

CUDA_N_GPUS = torch.cuda.device_count()
# Per-device memory budget handed to from_pretrained(max_memory=...): the free
# memory of that device in whole GiB minus 2 GiB of headroom. The value is now
# queried for each GPU individually; previously the free memory of a single
# device was reused for all of them, and the name was first bound to a string
# and then rebound to a dict.
CUDA_MAX_MEMORY = {
    gpu_index: f"{int(torch.cuda.mem_get_info(gpu_index)[0] / 1024 ** 3) - 2}GB"
    for gpu_index in range(CUDA_N_GPUS)
}
logger.info(f"Using k={CUDA_N_GPUS} CUDA GPUs with max memory {CUDA_MAX_MEMORY}")

# Load the entire model on the GPU 0
# device_map = {"": 0}
device_map = "auto"

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
load_in_4bit = True

# Activate 8-bit precision base model loading
# (mutually exclusive with the 4-bit switch above; keep exactly one enabled)
load_in_8bit = False

# Compute dtype for 4-bit base models (bf16 or fp16, chosen in the CUDA section)
bnb_4bit_compute_dtype = compute_dtype

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
bnb_4bit_use_double_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
# NOTE(review): absolute, machine-specific path; make it configurable before
# running this notebook elsewhere.
output_dir = Path(f"/fintech_3/20231018/results/{model_name}_{task_name}")

# Number of training epochs (overridden by max_steps below)
num_train_epochs = 3

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 2

# Enable gradient checkpointing
# NOTE(review): stored in Args but not forwarded to TrainingArguments by train().
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "adamw_bnb_8bit" # alternatives tried: "paged_adamw_8bit", "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "constant" # alternative tried: "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = 250

# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): presumably has no effect with the plain "constant" schedule
# selected above - confirm against the scheduler documentation.
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

# NOTE(review): the two settings below are neither fields of Args nor passed to
# TrainingArguments in train(), so they currently have no effect.
save_strategy="epoch"
disable_tqdm=True

llama2_finetune - INFO - Using k=2 CUDA GPUs with max memory {0: '41GB', 1: '41GB'}


Your GPU supports bfloat16: accelerate training with bf16=True


In [8]:
# Immutable bundle of every setting consumed by train() and the later cells.
# Field names mirror the configuration variables defined in the previous cell;
# `uid` is the only field that is generated here rather than configured.
Args = namedtuple(
    "Args",
    [
        "task_name",
        "task_instruction",
        "seed",
        "model_id",
        "model_name",
        "organization",
        "uid",
        "lora_r",
        "lora_alpha",
        "lora_dropout",
        "max_seq_length",
        "packing",
        "device_map",
        "load_in_4bit",
        "load_in_8bit",
        "bnb_4bit_compute_dtype",
        "bnb_4bit_use_double_quant",
        "bnb_4bit_quant_type",
        "output_dir",
        "num_train_epochs",
        "fp16",
        "bf16",
        "per_device_train_batch_size",
        "per_device_eval_batch_size",
        "gradient_accumulation_steps",
        "gradient_checkpointing",
        "max_grad_norm",
        "learning_rate",
        "weight_decay",
        "optim",
        "lr_scheduler_type",
        "max_steps",
        "warmup_ratio",
        "group_by_length",
        "save_steps",
        "logging_steps"
    ],
)

# Snapshot of the current configuration. Re-run this cell after changing any
# configuration variable, otherwise `args` keeps the old values.
args = Args(
    task_name=task_name,
    task_instruction=task_instruction,
    seed=seed,
    model_id=model_id,
    # Same value as the `model_name` configuration variable.
    model_name=model_id.split("/")[-1],
    organization=organization,
    # Fresh identifier on every execution of this cell.
    uid=generate_uid(),
    lora_r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    max_seq_length=max_seq_length,
    packing=packing,
    device_map=device_map,
    load_in_4bit=load_in_4bit,
    load_in_8bit=load_in_8bit,
    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    fp16=fp16,
    bf16=bf16,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    max_grad_norm=max_grad_norm,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    optim=optim,
    lr_scheduler_type = lr_scheduler_type,
    max_steps=max_steps,
    warmup_ratio= warmup_ratio,
    group_by_length=group_by_length,
    save_steps=save_steps,
    logging_steps=logging_steps,
    
)

## Supervised Fine-Tuning

In [9]:
def train(args):
    """Fine-tune the quantized base model with LoRA adapters using ``SFTTrainer``.

    Pipeline: build the bitsandbytes config, load tokenizer and quantized model,
    load and preprocess the train/test splits, wrap the model with LoRA
    adapters, train, then save metrics, trainer state, the adapter weights and
    the tokenizer to ``args.output_dir / "final_checkpoint"``.

    Besides ``args`` this function reads the notebook-level names ``logger``,
    ``EOS``, ``CUDA_MAX_MEMORY`` and ``compute_dtype`` and the helper functions
    defined in the "Functions" section.

    NOTE(review): the ``"text"`` column produced by ``create_prompt_formats``
    ends at ``E_INST`` and does not contain the label, so the trainer is fed
    prompts without responses - confirm that this is intended.
    NOTE(review): ``args.seed`` is used for data selection/shuffling only; it is
    not forwarded to ``TrainingArguments``.

    :param args: ``Args`` namedtuple created in the configuration section.
    :return: None.
    """
    logger.info("Starting the training process...")

    logger.info("Creating BitsAndBytesConfig...")
    # Only options that BitsAndBytesConfig defines are passed. The bnb_8bit_*
    # keywords that used to be here are not part of its signature.
    bnb_config = BitsAndBytesConfig(
        # Activate k-bit precision base model loading
        load_in_4bit=args.load_in_4bit,
        load_in_8bit=args.load_in_8bit,
        # Activate nested quantization for 4-bit base models (double quantization)
        bnb_4bit_use_double_quant=args.bnb_4bit_use_double_quant,
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype=args.bnb_4bit_compute_dtype,
    )

    logger.info("Loading the Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_id, trust_remote_code=False)
    logger.info("Tokenizer pad token specified as the EOS token")
    tokenizer.pad_token = EOS

    logger.info("Loading the CausalLM...")
    # The quantization mode is conveyed solely through `quantization_config`.
    # Passing load_in_4bit / load_in_8bit as separate keywords next to it is
    # redundant and is rejected by newer transformers releases.
    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        device_map=args.device_map,
        max_memory=CUDA_MAX_MEMORY,
        torch_dtype=compute_dtype,
        quantization_config=bnb_config,
        trust_remote_code=False,
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    # The sequence limit comes from the model config, not from args.max_seq_length.
    max_seq_length = get_max_length(model)

    logger.info("Loading train dataset...")
    # Both splits live in the same dataset configuration, so load it once.
    dataset_splits = load_dataset(
        f"{args.organization}/{args.task_name}", str(args.seed)
    )
    train_dataset = dataset_splits["train"]

    logger.info("Preprocessing train dataset...")
    preprocessed_train_dataset = preprocess_dataset(
        tokenizer=tokenizer,
        max_length=max_seq_length,
        seed=args.seed,
        dataset=train_dataset,
    )
    logger.info("Train dataset preprocessed.")

    logger.info("Loading test dataset...")
    test_dataset = dataset_splits["test"]
    logger.info("Preprocessing test dataset...")
    preprocessed_test_dataset = preprocess_dataset(
        tokenizer=tokenizer,
        max_length=max_seq_length,
        seed=args.seed,
        dataset=test_dataset,
    )
    logger.info("Test dataset preprocessed.")

    logger.info("Getting the model's memory footprint...")
    logger.info(model.get_memory_footprint())
    logger.info("Using the prepare_model_for_kbit_training method from PEFT...")
    model = prepare_model_for_kbit_training(model)
    logger.info("Get lora module names...")
    layers_for_adapters = find_all_linear_names(model)
    logger.info(f"Layers for PEFT Adaptation: {layers_for_adapters}")
    logger.info("Create PEFT config for these modules and wrap the model to PEFT...")
    peft_config = create_peft_config(args, layers_for_adapters)
    model = get_peft_model(model, peft_config)
    logger.info("Print information about the percentage of trainable parameters...")
    print_trainable_parameters(model)

    logger.info("Make output directory...")
    # Intermediate checkpoints go to args.output_dir; the final adapter and
    # tokenizer go to this sub-directory.
    output_dir = args.output_dir / "final_checkpoint"
    output_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

    logger.info("Define TrainingArguments...")
    training_arguments = TrainingArguments(
        output_dir=args.output_dir,
        fp16=args.fp16,
        bf16=args.bf16,
        report_to="wandb",
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        max_grad_norm=args.max_grad_norm,
        weight_decay=args.weight_decay,
        optim=args.optim,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        num_train_epochs=args.num_train_epochs,
        max_steps=args.max_steps,
        warmup_ratio=args.warmup_ratio,
        save_steps=args.save_steps,
        logging_steps=args.logging_steps,
        group_by_length=args.group_by_length,
    )

    logger.info("Defining SFTTrainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        packing=args.packing,
        max_seq_length=max_seq_length,
        train_dataset=preprocessed_train_dataset,
        eval_dataset=preprocessed_test_dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
    )

    logger.info("Running trainer.train() ...")
    # Bug fix: the metrics are part of the value returned by train(); the
    # trainer object has no `metrics` attribute, so the previous
    # `trainer.metrics` lookup failed before the final checkpoint was saved.
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    print(metrics)
    trainer.save_state()
    logger.info("Saving model")
    model = trainer.model
    print_dtypes(model)
    logger.info("Saving last checkpoint of the model...")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [None]:
import gc

try:
    train(args)
except Exception:
    # Keep the full traceback (console and log file) instead of printing only
    # the message.
    logger.exception("Training failed")
    # Empty VRAM.
    # Objects created inside train() are locals of that call and are not
    # visible here, so the former `del model / del pipe / del trainer` raised
    # NameError on a fresh kernel and hid the real error. Only drop
    # notebook-level leftovers that actually exist.
    for _stale_name in ("model", "pipe", "trainer"):
        globals().pop(_stale_name, None)
    gc.collect()
    torch.cuda.empty_cache()
    # Re-raise so that "Run All" stops here: the following cells need the
    # final checkpoint that a failed run did not produce.
    raise

llama2_finetune - INFO - Starting the training process...
llama2_finetune - INFO - Creating BitsAndBytesConfig...
llama2_finetune - INFO - Loading the Tokenizer...
loading file tokenizer.model from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/af6df14e494ef16d69ec55e9a016e900a2dde1c8/tokenizer.model
loading file tokenizer.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/af6df14e494ef16d69ec55e9a016e900a2dde1c8/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/af6df14e494ef16d69ec55e9a016e900a2dde1c8/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/af6df14e494ef16d69ec55e9a016e900a2dde1c8/tokenizer_config.json


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/af6df14e494ef16d69ec55e9a016e900a2dde1c8/generation_config.json
Generation config file not found, using a generation config created from the model config.
llama2_finetune - INFO - Loading train dataset...


Found max lenth: 4096


llama2_finetune - INFO - Preprocessing train dataset...


Preprocessing dataset...


Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1984 [00:00<?, ? examples/s]

llama2_finetune - INFO - Train dataset preprocessed.
llama2_finetune - INFO - Loading test dataset...
llama2_finetune - INFO - Preprocessing test dataset...


Preprocessing dataset...


Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

Filter:   0%|          | 0/496 [00:00<?, ? examples/s]

llama2_finetune - INFO - Test dataset preprocessed.
llama2_finetune - INFO - Getting the model's memory footprint...
llama2_finetune - INFO - 3829936128
llama2_finetune - INFO - Using the prepare_model_for_kbit_training method from PEFT...
llama2_finetune - INFO - Get lora module names...
llama2_finetune - INFO - Layers for PEFT Adaptation: ['k_proj', 'v_proj', 'gate_proj', 'up_proj', 'down_proj', 'o_proj', 'q_proj']
llama2_finetune - INFO - Create PEFT config for these modules and wrap the model to PEFT...
llama2_finetune - INFO - Print information about the percentage of trainable parameters...
llama2_finetune - INFO - Make output directory...
llama2_finetune - INFO - Define TrainingArguments...
Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch:

trainable params: 39976960 || all params: 3540389888 || trainable%: 1.1291682911958425


Map:   0%|          | 0/1984 [00:00<?, ? examples/s]

Map:   0%|          | 0/496 [00:00<?, ? examples/s]

You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
The model is loaded in 8-bit precision. To train this model you need to add additional modules inside the model such as adapters using `peft` library and freeze the model weights. Please check  the examples in https://github.com/huggingface/peft for more details.
max_steps is given, it will override any value given in num_train_epochs
llama2_finetune - INFO - Running trainer.train() ...
Currently training with a batch size of: 1
***** Running training *****
  Num examples = 1,984
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 250
  Number of trainable parameters = 39,976,960
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Fail

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,0.9007
50,0.302
75,0.5387
100,0.2905


Saving model checkpoint to /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-25
tokenizer config file saved in /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-25/tokenizer_config.json
Special tokens file saved in /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-25/special_tokens_map.json
Saving model checkpoint to /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-50
tokenizer config file saved in /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-50/tokenizer_config.json
Special tokens file saved in /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-50/special_tokens_map.json
Saving model checkpoint to /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-75
tokenizer config file saved in /fintech_3/20231018/results/Llama-2-7b-chat-hf_fomc_communication/checkpoint-75/tokenizer_config.json
Special tokens

---

### Save Results

In [None]:
# Reload the adapter checkpoint written by train() (base model plus LoRA weights
# from <output_dir>/final_checkpoint) in the configured compute dtype.
new_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir / "final_checkpoint",
    device_map=args.device_map,
    torch_dtype=compute_dtype,
)

In [None]:
# Inspect the checkpoint that was just reloaded. The previous `model` name is
# not defined at notebook level at this point (it is a local of train()).
print_dtypes(new_model)

In [None]:
# Load the base model without a quantization config (weights in compute_dtype)
# so that the LoRA adapter can be merged into it below.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)

In [None]:
# Parameter counts per dtype of the base model, for comparison with the adapter model.
print_dtypes(base_model)

In [None]:
# Attach the trained adapter from <output_dir>/final_checkpoint to the base
# model, then merge the LoRA layers into the base weights. This is needed to use
# the result as a standalone model. After merge_and_unload() the name
# `peft_model` refers to the merged model.
peft_model = PeftModel.from_pretrained(base_model, args.output_dir / "final_checkpoint")
peft_model = peft_model.merge_and_unload()
# Tokenizer saved next to the adapter; the pad token is set to EOS as in training.
tokenizer = AutoTokenizer.from_pretrained(
    args.output_dir / "final_checkpoint", pad_token=EOS
)

In [None]:
# Parameter counts per dtype after the adapter has been merged.
print_dtypes(peft_model)

In [None]:
# Save the merged model for inference.
# Bug fix: this used to call `model.save_pretrained(...)`, but `model` is not
# the merged model (and is not even defined at notebook level on a fresh
# kernel). The merged weights live in `peft_model`; the evaluation section later
# loads this directory with AutoModelForCausalLM.
merged_checkpoint_dir = args.output_dir / "final_merged_checkpoint"
peft_model.save_pretrained(merged_checkpoint_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_checkpoint_dir)

In [None]:
# Upload the merged model and its tokenizer to the private hub repository
# `repo_name` (requires the write-scoped login from the first cell).
peft_model.push_to_hub(repo_name, private=True, use_temp_dir=True)
tokenizer.push_to_hub(repo_name, private=True, use_temp_dir=True)

## Evaluation

In [None]:
# TODO: move to configs or args
# Decoding settings read as module-level globals by generate() below.
# NOTE(review): with do_sample=False and num_beams=1 decoding is greedy, so
# temperature / top_k / top_p are presumably ignored - confirm.
temperature = 0.0  # [0.0, 1.0]; 0.0 means greedy sampling
do_sample = False
max_new_tokens = 256
top_k = 10
top_p = 0.92
repetition_penalty = 1.0  # 1.0 means no penalty
num_return_sequences = 1  # Only generate one response
num_beams = 1


def generate(model=None, tokenizer=None, dataset=None, index=0):
    """Generate the model's answer for one prompt.

    Decoding settings come from the module-level generation constants defined
    in this cell.

    :param model: causal LM exposing ``generate`` and ``device``.
    :param tokenizer: tokenizer matching ``model``.
    :param dataset: either a single sample (mapping whose ``"text"`` is a prompt
        string) or a dataset / batch whose ``"text"`` is a sequence of prompts.
    :param index: which prompt to use when ``dataset["text"]`` is a sequence.
    :return: the decoded text after the last ``[/INST]`` marker, stripped.
    """
    text = dataset["text"]
    if not isinstance(text, str):
        # Only one sequence is decoded below, so tokenize just that prompt
        # instead of the whole column.
        text = text[index]

    # Bug fix: the tokenizer was called without return_tensors, so its result
    # held plain Python lists and the subsequent `.cuda()` call failed. Request
    # PyTorch tensors and move them to the device the model lives on.
    encoded = tokenizer(text, return_tensors="pt").to(model.device)

    generation_output = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        generation_config=GenerationConfig(
            temperature=temperature,
            do_sample=do_sample,
            max_new_tokens=max_new_tokens,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            return_dict_in_generate=True,
            output_scores=False,
        ),
    )
    seq = generation_output.sequences
    output = tokenizer.decode(seq[0])
    # The decoded sequence contains the prompt; keep only the part after it.
    return output.split("[/INST]")[-1].strip()

In [None]:
# Raw "test" split of the dataset configuration selected by args.seed; it is
# preprocessed separately for each evaluated model below.
test_dataset = load_dataset(f"{args.organization}/{args.task_name}", str(args.seed))[
    "test"
]

### Baseline

In [None]:
# Baseline: the untouched base model with its original tokenizer.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = EOS
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Bug fix: this used `model`, which is not defined at this point on a fresh
# kernel; the model loaded in this cell is `base_model`.
max_length = get_max_length(base_model)

preprocessed_test_dataset = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=args.seed, dataset=test_dataset
)

In [None]:
# Smoke-test the baseline model with a text-generation pipeline.
# Bug fix: the pipeline was given `model`, which is undefined in this section;
# the baseline model is `base_model`.
prompt = "What is a large language model?"
pipe = pipeline(
    task="text-generation", model=base_model, tokenizer=tokenizer, max_length=max_length
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]["generated_text"])

In [None]:
# N_GENS = preprocessed_test_dataset.num_rows
N_GENS = 10

# Bug fixes:
#  * `model` is undefined in this section; the baseline model is `base_model`.
#  * the whole dataset was passed on every iteration, so the same prompt was
#    answered N_GENS times; each iteration now receives its own sample.
output_list = []
for i in range(min(N_GENS, preprocessed_test_dataset.num_rows)):
    output_list.append(
        generate(
            model=base_model,
            tokenizer=tokenizer,
            dataset=preprocessed_test_dataset[i],
        )
    )

### Supervised Fine-Tuning

In [None]:
# Fine-tuned model: the merged checkpoint saved in the "Save Results" section.
model = AutoModelForCausalLM.from_pretrained(
    merged_checkpoint_dir,
    device_map=args.device_map,
    max_memory=CUDA_MAX_MEMORY,
    torch_dtype=compute_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(merged_checkpoint_dir)

max_length = get_max_length(model)

# Bug fix: `dataset` is not defined anywhere in this notebook; the split to
# evaluate on is `test_dataset`, the same one used for the baseline.
preprocessed_dataset = preprocess_dataset(
    tokenizer=tokenizer, max_length=max_length, seed=args.seed, dataset=test_dataset
)