***TODO?***: see if `offload_folder="tmp"` can help when loading models?

In [1]:
# ====================== HUGGINGFACE ======================
import os
from getpass import getpass

import huggingface_hub

# SECURITY: never hard-code an access token in a notebook. The token that was
# previously committed in this cell must be treated as compromised and revoked
# at https://huggingface.co/settings/tokens.
# The token is read from the HF_TOKEN environment variable; when that is not
# set, the user is prompted for it (the input is not echoed or stored here).
HF_AUTH = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
huggingface_hub.login(HF_AUTH)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/AD/gmatlin3/.cache/huggingface/token
Login successful


In [2]:
import os
# cache_dir = "/fintech_3/hf_models"
# os.environ["TRANSFORMERS_CACHE"] = cache_dir
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=100'

In [3]:
# ====================== WEIGHTS AND BIASES ======================
import wandb

WANDB_PROJECT = "llama2_sft_fomc"
# Set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"] = WANDB_PROJECT
# Do not upload trained model checkpoints to wandb (our models are too large)
os.environ["WANDB_LOG_MODEL"] = "false"
# Turn off watch to log faster
os.environ["WANDB_WATCH"] = "false"
os.environ["WANDB_NOTEBOOK_NAME"] = "llama2_sft"
# SECURITY: the API key is intentionally NOT set here. Export WANDB_API_KEY in
# the environment (or run `wandb login` once) before starting the kernel. The
# key that was previously committed in this cell must be revoked.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mglennmatlin[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# ====================== IMPORTS ======================
# Standard Libraries
import os
import gc
import logging
import time
from pathlib import Path
from functools import partial
from typing import NamedTuple, List, Optional, Type, Union
from dataclasses import dataclass, field

# Third-Party Libraries
import fire
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import accuracy_score, f1_score
# import nltk
from tqdm.auto import tqdm

# PyTorch and HuggingFace Libraries
import torch
import bitsandbytes as bnb
import evaluate
from datasets import Dataset, DatasetDict, load_dataset
from trl import SFTTrainer
from transformers import logging as hf_logging
from transformers.trainer_callback import TrainerCallback
from transformers import set_seed as transformers_set_seed
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    TrainingArguments,
    # DataCollatorForLanguageModeling,
    # LlamaConfig,
    # LlamaForCausalLM,
    # LlamaModel,
    # LlamaTokenizer,
    # TextGenerationPipeline,
    # Trainer,
    # pipeline,
)
from peft import (
    PeftModel,
    AutoPeftModelForCausalLM,
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

In [5]:
# ====================== HUGGINGFACE ======================
# Hub organization; combined with the model and task names to form repo_name below
organization = "gtfintechlab"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ====================== TASK PARAMETERS ======================
task_name = "fomc_communication"
# seeds = (5768, 78516, 944601)
# seed = seeds[0]
seed = 42

# ====================== MODEL PARAMETERS ======================
model_parameters = "7b"
model_id = f"meta-llama/Llama-2-{model_parameters}-chat-hf"
# Last path component of the hub id, e.g. "Llama-2-7b-chat-hf"
model_name = model_id.split("/")[-1]

# ====================== LOGGING PARAMETERS ======================
report_to = "tensorboard"
logging_dir = Path.home() / "tensorboard" / "logs"

# ====================== DIRECTORY PARAMETERS ======================
output_dir = Path("/fintech_3/") / f"{model_name}_{task_name}" / "20231028"
checkpoint_dir = output_dir / "final_checkpoint"
mergepoint_dir = output_dir / "final_merged_checkpoint"

# ====================== PROMPT PARAMETERS ======================
# Plain string literals are used below: none of the prompts interpolate values,
# so an f-prefix would be misleading.
# system_prompt = f"Discard all previous instructions. Below is an instruction that describes a task. Write a response that appropriately completes the request."
system_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
# instruction_prompt = f"Discard all the previous instructions. Behave like you are an expert sentence classifier. Classify the following sentence from FOMC into 'HAWKISH', 'DOVISH', or 'NEUTRAL' class. Label 'HAWKISH' if it is corresponding to tightening of the monetary policy, 'DOVISH' if it is corresponding to easing of the monetary policy, or 'NEUTRAL' if the stance is neutral. Provide the label 'HAWKISH', 'DOVISH', or 'NEUTRAL'. The sentence: ",
# instruction_prompt = f"Behave like you are an expert sentence classifier. Classify the following sentence from the Federal Open Market Committee into 'HAWKISH', 'DOVISH', or 'NEUTRAL' class. Label 'HAWKISH' if it is corresponding to tightening of the monetary policy. Label 'DOVISH' if it is corresponding to easing of the monetary policy. Label 'NEUTRAL' if the stance is neutral. Provide a single label from the choices 'HAWKISH', 'DOVISH', or 'NEUTRAL' then stop generating text. The sentence: "
instruction_prompt = "Discard all the previous instructions. Behave like you are an expert sentence classifier. Classify the following sentence from FOMC into 'HAWKISH', 'DOVISH', or 'NEUTRAL' class. Label 'HAWKISH' if it is corresponding to tightening of the monetary policy, 'DOVISH' if it is corresponding to easing of the monetary policy, or 'NEUTRAL' if the stance is neutral. Provide the label in the first line and provide a short explanation in the second line. The sentence: "

# Delimiters used when the prompt text is assembled during preprocessing
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
BOS, EOS = "<s>", "</s>"

repo_name = f"{organization}/{model_name}_{task_name}"

# ====================== QLORA PARAMETERS ======================
lora_r = 64          # LoRA attention dimension
lora_alpha = 16      # alpha parameter for LoRA scaling
lora_dropout = 0.1   # dropout probability for LoRA layers

# ====================== SFT PARAMETERS ======================
max_seq_length = 4096     # default maximum sequence length to use
packing = False           # pack multiple short examples into one input sequence
neftune_noise_alpha = 5   # forwarded to SFTTrainer as `neftune_noise_alpha`

# ====================== CUDA PARAMETERS ======================
# Mixed-precision selection: bf16 is enabled, fp16 is disabled
compute_dtype = torch.bfloat16
fp16 = False
bf16 = True

# Both are determined dynamically at runtime
cuda_n_gpus = None
cuda_max_memory = None

# Automatically determine the device map
device_map = "auto"

save_safetensors = True

# ====================== BITSANDBYTES PARAMETERS ======================
load_in_4bit = True                   # activate 4-bit precision base model loading
load_in_8bit = False                  # activate 8-bit precision base model loading
bnb_compute_dtype = compute_dtype     # compute dtype for 4-bit base models
bnb_quant_type = "nf4"                # quantization type (fp4 or nf4)
bnb_use_double_quant = False          # nested (double) quantization for 4-bit base models


def configure_bnb(args):
    """
    Build the BitsAndBytes quantization config from the run arguments.

    Only keywords that `BitsAndBytesConfig` defines are passed. The previous
    `bnb_8bit_use_double_quant`, `bnb_8bit_quant_type` and
    `bnb_8bit_compute_dtype` keywords are not parameters of that class, so they
    had no effect on 8-bit loading and have been removed.

    Parameters:
    - args : object exposing `load_in_4bit`, `load_in_8bit`,
      `bnb_use_double_quant`, `bnb_quant_type` and `bnb_compute_dtype`.

    Returns:
    - BitsAndBytesConfig - The quantization configuration.
    """
    return BitsAndBytesConfig(
        load_in_4bit=args.load_in_4bit,
        load_in_8bit=args.load_in_8bit,
        bnb_4bit_use_double_quant=args.bnb_use_double_quant,
        bnb_4bit_quant_type=args.bnb_quant_type,
        bnb_4bit_compute_dtype=args.bnb_compute_dtype,
    )


# ====================== TRAININGARGUMENTS PARAMETERS ======================
# --- run length ---
num_train_epochs = 1  # 12  (number of training epochs)
max_steps = -1        # number of training steps (overrides num_train_epochs)

# --- batching ---
per_device_train_batch_size = 8   # batch size per GPU for training
per_device_eval_batch_size = 8    # batch size per GPU for evaluation
gradient_accumulation_steps = 1   # update steps to accumulate the gradients for
gradient_checkpointing = False    # enable gradient checkpointing
# Group sequences of similar length into the same batch
# (saves memory and speeds up training considerably)
group_by_length = True

# --- optimisation ---
optim = "adamw_bnb_8bit"        # optimizer to use
learning_rate = 3e-3            # initial learning rate
weight_decay = 0.001            # applied to all layers except bias/LayerNorm weights
max_grad_norm = 0.3             # maximum gradient norm (gradient clipping)
lr_scheduler_type = "constant"  # learning rate schedule
# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): a plain "constant" schedule may not apply any warmup — confirm
warmup_ratio = 0.03

# --- checkpointing / logging cadence ---
# NOTE(review): these are fractions, not integer step counts, and all three
# strategies below are "epoch" — confirm whether the step values are used at all
save_steps = 0.1
logging_steps = 0.1

load_best_model_at_end = True

# One shared cadence for saving, logging and evaluation
strategy = "epoch"
save_strategy = logging_strategy = evaluation_strategy = strategy

disable_tqdm = False
predict_with_generate = True

In [6]:
# ====================== LOGGING SETUP ======================
def create_logger(name="llama2_finetune", level=logging.DEBUG):
    """
    Return the named logger, attaching console and file handlers on first use.

    Parameters:
    - name : str - Logger name.
    - level : int - Level applied to the logger, to both handlers and to the
      transformers library logging (via `hf_logging.set_verbosity`).

    Returns:
    - logging.Logger - The configured logger. Calling this again with the same
      name returns the same logger without adding duplicate handlers.
    """
    logger = logging.getLogger(name)
    # Inspect this logger's OWN handlers. `hasHandlers()` also reports handlers
    # attached to ancestor loggers, so once anything had configured the root
    # logger this function silently skipped all of its setup.
    if not logger.handlers:
        logger.setLevel(level)
        hf_logging.set_verbosity(level)

        # One formatter shared by the console and file handlers
        formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
        handlers = (
            logging.StreamHandler(),
            logging.FileHandler("llama2_finetune.log"),
        )
        for handler in handlers:
            handler.setLevel(level)
            handler.setFormatter(formatter)
            logger.addHandler(handler)
    return logger

In [7]:
# ====================== ARGUMENTS SETUP ======================
# TODO: MOVE OUR DEFAULT VALUES INTO OUR DATA CLASS(ES)
@dataclass
class Args:
    """
    Container for every setting used by the fine-tuning pipeline.

    Field order is unchanged so positional construction keeps working. The
    annotations now describe the values actually stored: a torch dtype for the
    compute dtype, path-like directories, fractional step settings and optional
    CUDA settings. Dataclasses do not enforce annotations at runtime, so this
    is a documentation and tooling fix only.
    """

    # --- identity / task ---
    repo_name: str
    task_name: str
    system_prompt: str
    instruction_prompt: str
    seed: int
    model_id: str
    model_name: str
    organization: str
    # --- LoRA ---
    lora_r: int
    lora_alpha: int
    lora_dropout: float
    # --- SFT ---
    max_seq_length: int
    packing: bool
    device_map: str
    # --- bitsandbytes ---
    load_in_4bit: bool
    load_in_8bit: bool
    bnb_compute_dtype: torch.dtype
    bnb_use_double_quant: bool
    bnb_quant_type: str
    # --- directories ---
    output_dir: Union[str, Path]
    checkpoint_dir: Union[str, Path]
    mergepoint_dir: Union[str, Path]
    logging_dir: Union[str, Path]
    # --- TrainingArguments ---
    num_train_epochs: int
    fp16: bool
    bf16: bool
    per_device_train_batch_size: int
    per_device_eval_batch_size: int
    gradient_accumulation_steps: int
    gradient_checkpointing: bool
    max_grad_norm: float
    learning_rate: float
    weight_decay: float
    optim: str
    lr_scheduler_type: str
    max_steps: int
    warmup_ratio: float
    group_by_length: bool
    save_steps: float
    save_strategy: str
    logging_strategy: str
    logging_steps: float
    evaluation_strategy: str
    neftune_noise_alpha: float
    save_safetensors: bool
    load_best_model_at_end: bool
    disable_tqdm: bool
    # --- prompt delimiters ---
    B_INST: str
    E_INST: str
    B_SYS: str
    E_SYS: str
    BOS: str
    EOS: str
    # --- reporting / evaluation ---
    report_to: str
    predict_with_generate: bool
    # --- CUDA (None until determined at runtime) ---
    cuda_n_gpus: Optional[int] = None
    # presumably the mapping later passed as `max_memory` when loading the model — TODO confirm
    cuda_max_memory: Optional[dict] = None


def setup_args() -> Args:
    """
    Collect the module-level configuration values into a single `Args` object.

    Every field is read from the notebook global of the same name, except
    `model_name`, which is recomputed from `model_id`.

    Returns:
    - Args - The populated arguments container.
    """
    values = {
        # identity / task
        "repo_name": repo_name,
        "task_name": task_name,
        "system_prompt": system_prompt,
        "instruction_prompt": instruction_prompt,
        "seed": seed,
        "model_id": model_id,
        "model_name": model_id.split("/")[-1],
        "organization": organization,
        # LoRA
        "lora_r": lora_r,
        "lora_alpha": lora_alpha,
        "lora_dropout": lora_dropout,
        # SFT
        "max_seq_length": max_seq_length,
        "packing": packing,
        "device_map": device_map,
        # bitsandbytes
        "load_in_4bit": load_in_4bit,
        "load_in_8bit": load_in_8bit,
        "bnb_compute_dtype": bnb_compute_dtype,
        "bnb_use_double_quant": bnb_use_double_quant,
        "bnb_quant_type": bnb_quant_type,
        # directories
        "output_dir": output_dir,
        "checkpoint_dir": checkpoint_dir,
        "mergepoint_dir": mergepoint_dir,
        "logging_dir": logging_dir,
        # TrainingArguments
        "num_train_epochs": num_train_epochs,
        "fp16": fp16,
        "bf16": bf16,
        "per_device_train_batch_size": per_device_train_batch_size,
        "per_device_eval_batch_size": per_device_eval_batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "gradient_checkpointing": gradient_checkpointing,
        "max_grad_norm": max_grad_norm,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "optim": optim,
        "lr_scheduler_type": lr_scheduler_type,
        "max_steps": max_steps,
        "warmup_ratio": warmup_ratio,
        "group_by_length": group_by_length,
        "save_steps": save_steps,
        "save_strategy": save_strategy,
        "logging_strategy": logging_strategy,
        "logging_steps": logging_steps,
        "evaluation_strategy": evaluation_strategy,
        "neftune_noise_alpha": neftune_noise_alpha,
        "save_safetensors": save_safetensors,
        "load_best_model_at_end": load_best_model_at_end,
        "disable_tqdm": disable_tqdm,
        # prompt delimiters
        "B_INST": B_INST,
        "E_INST": E_INST,
        "B_SYS": B_SYS,
        "E_SYS": E_SYS,
        "BOS": BOS,
        "EOS": EOS,
        # reporting / evaluation / CUDA
        "report_to": report_to,
        "predict_with_generate": predict_with_generate,
        "cuda_n_gpus": cuda_n_gpus,
        "cuda_max_memory": cuda_max_memory,
    }
    return Args(**values)

In [8]:
# =============== SFT LOGGING FUNCTIONS ==================
def log_trainable_parameters(model, logger):
    """
    Logs the number of trainable parameters in the model.

    Parameters:
    - model : torch.nn.Module - The model to log.
    - logger : logging.Logger - Logger to use for logging the info.

    Returns:
    - dict - `trainable_params`, `total_params` and `trainable_percent`, so
      callers that store the result get real numbers instead of None.
    """
    trainable_params = 0
    total_params = 0
    # Single pass over the parameters instead of two separate sums
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    # A model without parameters would otherwise raise ZeroDivisionError
    trainable_percent = 100 * trainable_params / total_params if total_params else 0.0

    logger.info(
        f"Trainable params: {trainable_params} || "
        f"All params: {total_params} || "
        f"Trainable%: {trainable_percent}"
    )
    return {
        "trainable_params": trainable_params,
        "total_params": total_params,
        "trainable_percent": trainable_percent,
    }


def log_dtypes(model, logger):
    """
    Logs how many parameter elements the model holds in each data type.

    Parameters:
    - model : torch.nn.Module - The model to log.
    - logger : logging.Logger - Logger to use for logging the info.

    Returns:
    - dict - Mapping of dtype to element count. Callers store this result, so
      returning the counts replaces the None they previously received.
    """
    dtypes = {}
    for param in model.parameters():
        dtypes[param.dtype] = dtypes.get(param.dtype, 0) + param.numel()

    total = sum(dtypes.values())

    for dtype, count in dtypes.items():
        # Avoid dividing by zero when every parameter is empty
        share = 100 * count / total if total else 0.0
        logger.info(f"{dtype}: {count} ({share:.2f}%)")

    return dtypes

def merge_evaluation_results(baseline_results: dict, final_results: dict) -> pd.DataFrame:
    """
    Merge evaluation results for comparison.

    Parameters:
    - baseline_results : dict - Metrics measured before fine-tuning.
    - final_results : dict - Metrics measured after fine-tuning.

    Returns:
    - pd.DataFrame - One row per metric with columns "Metric", "Baseline" and
      "After Fine-tuning". A metric missing on one side is shown as "N/A".
      Rows follow the baseline key order, then any metrics that only appear in
      the final results, so the table is identical from run to run (iterating
      a set of strings gave an arbitrary order).
    """
    # dict.fromkeys de-duplicates while preserving first-seen order
    metrics = list(dict.fromkeys([*baseline_results, *final_results]))
    return pd.DataFrame(
        {
            "Metric": metrics,
            "Baseline": [baseline_results.get(metric, "N/A") for metric in metrics],
            "After Fine-tuning": [final_results.get(metric, "N/A") for metric in metrics],
        }
    )

In [9]:
# ========== DATA SET PROCESSING FUNCTIONS ==========
# TODO: double-check the fomc mapping
# Class index -> label name
FOMC_COMMUNICATION_MAPPING = {
    0: "DOVISH",
    1: "HAWKISH",
    2: "NEUTRAL"
}


def decode_label(label_number):
    """
    Convert a class index into its upper-case label name.

    Returns "UNDEFINED" for an index that is not in the mapping.
    """
    return FOMC_COMMUNICATION_MAPPING.get(label_number, "undefined").upper()


def encode_label(label_name):
    """
    Convert a label name into its class index, ignoring case and surrounding
    whitespace.

    The mapping stores upper-case names, so the lookup key is upper-cased as
    well. The previous lower-case lookup could never match and always
    returned -1.

    Returns -1 for a name that is not in the mapping.
    """
    reversed_mapping = {
        name.upper(): number for number, name in FOMC_COMMUNICATION_MAPPING.items()
    }
    return reversed_mapping.get(label_name.strip().upper(), -1)

# TODO: have extract_lavel use our encoding/mapping
def extract_label(text_output, E_INST="[/INST]"):
    """
    Extracts the label from the text output from a large language model.

    Parameters:
    - text_output : str - Model output, optionally still containing the prompt.
    - E_INST : str - End-of-instruction marker; only text after its first
      occurrence is searched.

    Returns:
    - int - 0 for DOVISH, 1 for HAWKISH, 2 for NEUTRAL, or -1 when none of the
      labels appears. The search is case-insensitive.
    """
    marker_pos = text_output.find(E_INST)
    if marker_pos == -1:
        # No marker: search the whole text. Slicing at -1 + len(E_INST) used to
        # drop the first characters of the output here.
        generated_text = text_output
    else:
        generated_text = text_output[marker_pos + len(E_INST):]
    generated_text = generated_text.upper()

    labels = ("DOVISH", "HAWKISH", "NEUTRAL")
    # The prompt asks for the label first and an explanation afterwards, and
    # the explanation may mention other classes. Pick the label that occurs
    # EARLIEST in the text rather than the first one in `labels`.
    best_index = -1
    best_pos = len(generated_text)
    for index, label in enumerate(labels):
        pos = generated_text.find(label)
        if pos != -1 and pos < best_pos:
            best_index, best_pos = index, pos
    return best_index

def get_max_length(model: Type[torch.nn.Module]) -> int:
    """
    Look up the model's maximum sequence length from its config.

    The config attributes `n_positions`, `max_position_embeddings` and
    `seq_length` are checked in that order and the first truthy value wins.

    Parameters:
    - model : torch.nn.Module - The model to inspect

    Returns:
    - int - The configured maximum length, or 4096 when no attribute is set
    """
    conf = model.config
    attribute_names = ("n_positions", "max_position_embeddings", "seq_length")
    candidates = (getattr(conf, attribute, None) for attribute in attribute_names)

    # First truthy candidate; None when every attribute is missing or falsy
    max_length = next((value for value in candidates if value), None)
    if max_length:
        print(f"Found max length: {max_length}")
        return max_length

    max_length = 4096
    print(f"Using default max length: {max_length}")
    return max_length

def load_dataset_split(args, logger, split: str):
    """
    Load one split of the task dataset.

    The dataset id is "<args.organization>/<args.task_name>". The whole dataset
    is loaded and the requested split is then selected from it.

    Parameters:
    - args : object exposing `organization` and `task_name`.
    - logger : logging.Logger - Logger used for the debug message.
    - split : str - Name of the split to return.
    """
    logger.debug(f"Loading {split} dataset...")
    dataset_id = f"{args.organization}/{args.task_name}"
    all_splits = load_dataset(dataset_id)
    return all_splits[split]

def split_dataset(train_dataset, train_ratio=0.7, seed=42):
    """
    Divide a Hugging Face dataset into a training part and a validation part.

    Parameters:
    - train_dataset: Hugging Face dataset to split
    - train_ratio: Share of the rows kept for training; must lie strictly
      between 0 and 1
    - seed: Seed forwarded to `train_test_split` for reproducibility

    Returns:
    - (train_set, val_set): the "train" and "test" parts of the split

    Raises:
    - ValueError: if `train_ratio` is <= 0 or >= 1
    """
    if train_ratio >= 1 or train_ratio <= 0:
        raise ValueError("Train ratio must be between 0 and 1")

    # Whatever is not used for training becomes the validation share
    parts = train_dataset.train_test_split(test_size=1 - train_ratio, seed=seed)
    return parts["train"], parts["test"]

In [10]:
def _preprocess_batch_(
    batch,
    args: Args,
    logger,
    tokenizer,
):
    """
    Creates formatted prompts and tokenizes in batch mode.

    Adds these columns to the batch: "label_encoded" (copy of "label"),
    "label_decoded" (label names), "input_texts" (formatted prompts),
    "input_ids" and "attention_mask".

    Parameters:
    - batch: dict - Batch containing columns as lists; must provide "sentence"
      and "label".
    - args: Args - Supplies the prompts, the prompt delimiters and
      `max_seq_length`.
    - logger: not used by this function.
    - tokenizer: AutoTokenizer - Tokenizer for the model.

    Raises:
    - ValueError: if a prompt, a sentence or a decoded label is blank.
    """
    # TODO: Fields should be in args
    context_field = 'sentence'
    label_field = 'label'
    encoded_label_field = f"{label_field}_encoded"
    response_field = 'label_decoded'
    text_field = 'input_texts'

    # Keep the numeric label under a second name and add its text form
    batch[encoded_label_field] = batch[label_field]
    batch[response_field] = [decode_label(label) for label in batch[encoded_label_field]]

    # Validate the prompts
    if not args.instruction_prompt.strip() or not args.system_prompt.strip():
        raise ValueError("All prompts (instruction, system) must be non-empty strings.")
    # Validate the fields
    if not all(item.strip() for item in batch[context_field]) or not all(
        item.strip() for item in batch[response_field]
    ):
        raise ValueError("All fields (context, response) must be non-empty strings.")

    # Everything before the sentence is identical for every row, so build it once
    prompt_prefix = (
        args.B_INST
        + args.B_SYS
        + args.system_prompt
        + args.E_SYS
        + args.instruction_prompt
    )
    # NOTE(review): the text ends at E_INST and does not contain the decoded
    # label, so the training text carries no target response — confirm whether
    # the label (and EOS) should be appended for the training split.
    batch[text_field] = [
        prompt_prefix + context + args.E_INST for context in batch[context_field]
    ]

    tokenized_inputs = tokenizer(
        batch[text_field],
        max_length=args.max_seq_length,
        truncation=True,
        padding=True,
    )

    batch['input_ids'] = tokenized_inputs['input_ids']
    batch['attention_mask'] = tokenized_inputs['attention_mask']

    return batch

def preprocess_dataset(
    args: Args, logger, tokenizer: AutoTokenizer, dataset: Dataset
):
    """
    Prepare the dataset for supervised fine-tuning.

    Steps: format and tokenize every row with `_preprocess_batch_`, keep only
    rows whose `input_ids` are strictly shorter than `args.max_seq_length`,
    then shuffle with `args.seed`.

    Parameters:
    - args: Args - Arguments needed for formatting, filtering and shuffling.
    - logger: logging.Logger - Logger used for progress messages.
    - tokenizer: AutoTokenizer - Tokenizer for the model.
    - dataset: Dataset - Dataset to preprocess.

    Returns:
    - Dataset - The preprocessed, filtered and shuffled dataset.
    """
    logger.debug("Preprocessing dataset...")

    # We have to preprocess in batch because datasets dont allow for easy assignment of new fields
    dataset = dataset.map(
        partial(
            _preprocess_batch_,
            args=args,
            logger=logger,
            tokenizer=tokenizer,
        ),
        batched=True,
    )

    logger.debug("Filtering dataset to ensure we are below the maximum sequence length")
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < args.max_seq_length)
    logger.debug("Shuffling the data using our seed value")
    dataset = dataset.shuffle(seed=args.seed)
    return dataset

In [11]:
# ======= PEFT HELPER FUNCTIONS ===========
class PeftSavingCallback(TrainerCallback):
    """
    Trainer callback that writes the model via `save_pretrained` into every
    checkpoint directory and removes a `pytorch_model.bin` file found there.
    """

    def on_save(self, args, state, control, **kwargs):
        # Directory of the checkpoint for the current global step
        checkpoint_dir = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
        kwargs["model"].save_pretrained(checkpoint_dir)

        stale_name = "pytorch_model.bin"
        if stale_name in os.listdir(checkpoint_dir):
            os.remove(os.path.join(checkpoint_dir, stale_name))


def find_all_linear_names(model: Type[torch.nn.Module], bits: int) -> List[str]:
    """
    Collect the leaf names of the model's linear layers.

    Parameters:
    - model : torch.nn.Module - The model to inspect
    - bits : int - 4 selects `bnb.nn.Linear4bit`, 8 selects
      `bnb.nn.Linear8bitLt`, anything else selects `torch.nn.Linear`

    Returns:
    - List[str] - Unique last name components of the matching modules, with
      "lm_head" excluded
    """
    # Pick the layer class that corresponds to the requested precision
    if bits == 4:
        linear_cls = bnb.nn.Linear4bit
    elif bits == 8:
        linear_cls = bnb.nn.Linear8bitLt
    else:
        linear_cls = torch.nn.Linear

    # Only the last component of the dotted module path is kept
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is never adapted
    leaf_names.discard("lm_head")
    return list(leaf_names)


def create_peft_config(args: Args, modules: List[str]) -> LoraConfig:
    """
    Build the LoRA configuration used to wrap the model with PEFT.

    Parameters:
    - args : Args - Supplies `lora_r`, `lora_alpha` and `lora_dropout`
    - modules : List[str] - Names of the modules that receive adapters

    Returns:
    - LoraConfig - Causal-LM LoRA configuration with bias set to "none"
    """
    lora_settings = dict(
        target_modules=modules,
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )
    return LoraConfig(**lora_settings)

In [12]:
# ============== METRICS FUNCTIONS =================
def compute_metrics(eval_pred, tokenizer, label_list=("DOVISH", "HAWKISH", "NEUTRAL")):
    """
    Compute accuracy and weighted F1 between decoded predictions and labels.

    Parameters:
    - eval_pred : pair (predictions, labels). Each prediction is decoded with
      `tokenizer.decode`; each label is an index into `label_list`.
    - tokenizer : tokenizer used to decode the predictions.
    - label_list : label names indexed by class id. The default now follows
      FOMC_COMMUNICATION_MAPPING (0=DOVISH, 1=HAWKISH, 2=NEUTRAL); the previous
      default order assigned the wrong name to every class index. A tuple is
      used so the default is not a shared mutable object.

    Returns:
    - dict - {"accuracy": float, "f1_score": float}

    NOTE(review): this expects predictions that decode to a bare label and
    labels that are class indices. Confirm that the trainer supplies that
    (generated token ids rather than logits, class ids rather than token ids).
    """
    predictions, labels = eval_pred
    label_names = list(label_list)

    # Decode the predictions to text; strip and upper-case them so that
    # " Hawkish\n" is compared as "HAWKISH"
    decoded_preds = [
        tokenizer.decode(pred, skip_special_tokens=True).strip().upper()
        for pred in predictions
    ]
    # Ensure the labels and predictions are in the same format for comparison
    labels_text = [label_names[lbl] for lbl in labels]

    acc = accuracy_score(labels_text, decoded_preds)
    f1 = f1_score(labels_text, decoded_preds, average='weighted', labels=label_names)

    return {
        'accuracy': acc,
        'f1_score': f1,
    }

In [13]:
# ========== TRAINING FUNCTIONS ===============
def create_tokenizer(args, logger):
    """
    Load the tokenizer for `args.model_id` and use `args.EOS` as its pad token.

    Parameters:
    - args : object exposing `model_id` and `EOS`.
    - logger : not used by this function.

    Returns:
    - The loaded tokenizer with `pad_token` set.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(
        args.model_id,
        trust_remote_code=False,
    )
    # Padding re-uses the end-of-sequence string
    loaded_tokenizer.pad_token = args.EOS
    return loaded_tokenizer


def create_model(args, logger, bnb_mode=True, peft_mode=True):
    """
    Load the base model and optionally quantize it and wrap it with LoRA.

    Parameters:
    - args : Args - Run arguments (model id, device map, quantization and LoRA
      settings).
    - logger : logging.Logger - Logger used for progress and diagnostics.
    - bnb_mode : bool - Load the model with the BitsAndBytes quantization
      config when True; load it in bfloat16 without quantization when False.
    - peft_mode : bool - Prepare the model for k-bit training and wrap it with
      LoRA adapters when True.

    Returns:
    - The loaded (and possibly PEFT-wrapped) model.
    """
    # Defined up-front: the PEFT branch appends to it even when bnb_mode is
    # False, which used to raise NameError.
    info_data = []

    if not bnb_mode:
        # No quantization: adapters attach to regular torch.nn.Linear layers
        bits = 16
        logger.debug("Creating ModelforCausalLM ...")
        model = AutoModelForCausalLM.from_pretrained(
            args.model_id,
            device_map=args.device_map,
            max_memory=args.cuda_max_memory,
            torch_dtype=torch.bfloat16,
            # trust_remote_code=False,
        )
    else:
        # Bit width follows the arguments instead of being hard-coded to 4
        bits = 4 if args.load_in_4bit else 8 if args.load_in_8bit else 16

        logger.debug("Creating BitsAndBytesConfig ...")
        bnb_config = configure_bnb(args)

        logger.debug("Creating ModelforCausalLM ...")
        # The quantization config already carries load_in_4bit / load_in_8bit,
        # so they are not passed a second time as keywords. The dtype comes
        # from args rather than from the notebook-level global.
        model = AutoModelForCausalLM.from_pretrained(
            args.model_id,
            device_map=args.device_map,
            max_memory=args.cuda_max_memory,
            torch_dtype=args.bnb_compute_dtype,
            quantization_config=bnb_config,
            trust_remote_code=False,
        )

        model.config.use_cache = False
        model.config.pretraining_tp = 1

        logger.debug("Logging the model's memory footprint ...")
        memory_footprint = model.get_memory_footprint()
        info_data.append(["Memory Footprint", memory_footprint])

        logger.debug("Logging the model's Dtypes ...")
        dtypes_loaded = log_dtypes(model, logger)
        info_data.append(["Dtypes init", dtypes_loaded])

    if peft_mode:
        logger.debug("Using the prepare_model_for_kbit_training method from PEFT...")
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=args.gradient_checkpointing
        )

        logger.debug("Model Dtypes after preparing for kbit training ...")
        dtypes_after = log_dtypes(model, logger)
        info_data.append(["Dtypes After KBit Prep", dtypes_after])

        logger.debug("Get module names for the linear layers where we add LORA adapters...")
        layers_for_adapters = find_all_linear_names(model, bits)
        logger.debug(f"Layers for Adapters: {layers_for_adapters}")
        info_data.append(["Layers for Adapters", layers_for_adapters])

        logger.debug("Create PEFT config for these modules and wrap the model to PEFT...")
        peft_config = create_peft_config(args, layers_for_adapters)

        logger.debug("Model Dtypes before applying PEFT config ...")
        dtypes_before = log_dtypes(model, logger)
        info_data.append(["Dtypes Before PEFT Config", dtypes_before])

        model = get_peft_model(model, peft_config)

        logger.debug("Model Dtypes after applying PEFT config ...")
        dtypes_after_peft = log_dtypes(model, logger)
        info_data.append(["Dtypes After PEFT Config", dtypes_after_peft])

        logger.debug("Information about the percentage of trainable parameters...")
        trainable_parameters = log_trainable_parameters(model, logger)
        info_data.append(["Trainable Parameters", trainable_parameters])

    if bnb_mode or peft_mode:
        logger.debug("Converting the info_data list into a pandas DataFrame and saving it...")
        df = pd.DataFrame(info_data, columns=["Info", "Value"])
        logger.debug('\n%s', df.to_string(index=False))
    return model


def setup_training_arguments(args):
    """
    Configures and returns the TrainingArguments based on the provided arguments.

    Every value is taken from `args`. The logging directory used to come from
    the notebook-level `logging_dir` global, and `seed`,
    `gradient_checkpointing` and `disable_tqdm` were declared on `Args` but
    never forwarded.

    Parameters:
    - args : Args - Run arguments.

    Returns:
    - TrainingArguments - Arguments for the trainer.
    """
    # Directory setup for outputs
    output_dir = setup_output_directory(
        args.output_dir
    )

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        seed=args.seed,
        fp16=args.fp16,
        bf16=args.bf16,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        gradient_checkpointing=args.gradient_checkpointing,
        max_grad_norm=args.max_grad_norm,
        weight_decay=args.weight_decay,
        optim=args.optim,
        learning_rate=args.learning_rate,
        lr_scheduler_type=args.lr_scheduler_type,
        num_train_epochs=args.num_train_epochs,
        max_steps=args.max_steps,
        warmup_ratio=args.warmup_ratio,
        save_safetensors=args.save_safetensors,
        load_best_model_at_end=args.load_best_model_at_end,
        push_to_hub=False,
        evaluation_strategy=args.evaluation_strategy,
        # Passed as a plain string; args.logging_dir may be a pathlib.Path
        logging_dir=str(args.logging_dir),
        report_to=args.report_to,
        save_strategy=args.save_strategy,
        save_steps=args.save_steps,
        logging_strategy=args.logging_strategy,
        logging_steps=args.logging_steps,
        group_by_length=args.group_by_length,
        disable_tqdm=args.disable_tqdm,
    )
    return training_arguments


def setup_trainer(
    args, model, tokenizer, peft_config, train_dataset, eval_dataset, training_arguments
):
    """
    Configures and returns the trainer based on the provided arguments and datasets.

    Parameters:
    - args : Args - Run arguments (packing, max_seq_length, neftune_noise_alpha,
      predict_with_generate).
    - model : model to fine-tune.
    - tokenizer : tokenizer for the model; also bound into `compute_metrics`.
    - peft_config : LoraConfig forwarded to the trainer.
    - train_dataset / eval_dataset : preprocessed datasets.
    - training_arguments : TrainingArguments for the run.

    Returns:
    - SFTTrainer - The configured trainer.
    """
    callbacks = [PeftSavingCallback()]
    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        packing=args.packing,
        # Read from args rather than from the notebook-level global
        max_seq_length=args.max_seq_length,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        peft_config=peft_config,
        callbacks=callbacks,
        # Must match the column written by _preprocess_batch_ ("input_texts");
        # the datasets have no "text" column.
        dataset_text_field="input_texts",
        neftune_noise_alpha=args.neftune_noise_alpha,
        tokenizer=tokenizer,
        # compute_metrics requires a tokenizer argument, so bind it here.
        # NOTE(review): confirm that the predictions/labels the trainer passes
        # have the form compute_metrics expects.
        compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
    )

    # NOTE(review): this only sets an attribute on the trainer object — confirm
    # that SFTTrainer actually reads `predict_with_generate`.
    trainer.predict_with_generate = args.predict_with_generate
    return trainer

In [14]:
def baseline_results(trainer, logger):
    """
    Evaluate the trainer's model on the train and evaluation sets and log the
    metrics.

    The train set is passed to `evaluate` directly instead of temporarily
    replacing `trainer.eval_dataset`, so the trainer is left untouched even if
    evaluation raises.

    Parameters:
    - trainer : trainer exposing `train_dataset` and `evaluate(eval_dataset=...)`.
    - logger : logging.Logger - Logger used to report the metrics.

    Returns:
    - (baseline_train_results, baseline_eval_results) - the two metric dicts.
    """
    logger.info("Baseline Results: Train Set")
    baseline_train_results = trainer.evaluate(eval_dataset=trainer.train_dataset)
    for key, value in baseline_train_results.items():
        logger.info(f" {key}: {value}")

    logger.info("Baseline Results: Evaluation Set")
    # This call was corrupted to `trainer.epeft_mode=uate()`, which raised NameError
    baseline_eval_results = trainer.evaluate()
    for key, value in baseline_eval_results.items():
        logger.info(f" {key}: {value}")

    return baseline_train_results, baseline_eval_results

In [15]:
def execute_training_and_evaluation(trainer, args, logger):
    """
    Executes the training and evaluation process based on the configured trainer and arguments.

    Steps: baseline evaluation, `trainer.train()`, memory cleanup, final
    evaluation, then a comparison table of baseline and final metrics.

    Parameters:
    - trainer : the configured trainer.
    - args : Args - Only `report_to` is read here.
    - logger : logging.Logger - Logger used for progress and errors.

    Returns:
    - pd.DataFrame - Baseline and final evaluation metrics side by side.

    Raises:
    - Whatever `trainer.train()` or `trainer.evaluate()` raises. The original
      exception is re-raised unchanged, keeping its type and traceback.
    """
    logger.info(
        "Evaluating the baseline performance of the model before fine-tuning ..."
    )

    baseline_train_results, baseline_eval_results = baseline_results(trainer, logger)
    logger.info("Running Supervised Fine Tuning ...")
    logger.debug("Trying trainer.train() ...")
    try:
        trainer.train()
    except Exception:
        # logger.exception records the traceback; the bare raise keeps the
        # original exception type instead of wrapping it in Exception
        logger.exception("trainer.train() failed")
        raise
    finally:
        memory_cleanup()

    if args.report_to == "wandb":
        wandb.finish()

    logger.debug("Trying the final trainer.evaluate() ...")
    try:
        final_eval_results = trainer.evaluate()
    except Exception:
        logger.exception("The final trainer.evaluate() failed !!!")
        raise
    for key, value in final_eval_results.items():
        logger.info(f" {key}: {value}")

    # trainer.save_state()
    logger.debug("Creating results dataframe ...")
    results_df = merge_evaluation_results(baseline_eval_results, final_eval_results)
    return results_df

In [16]:
def train(args, logger):
    """
    Run one supervised fine-tuning pass from raw data to a saved checkpoint.

    Builds the tokenizer and model, prepares the "train" split and divides it
    with ``split_dataset(train_ratio=0.7)``, assembles the training arguments,
    PEFT config and trainer, runs ``execute_training_and_evaluation``, displays
    the resulting dataframe, and finally writes the trainer's model plus the
    tokenizer to ``args.checkpoint_dir``.

    Parameters:
    - args
        Run configuration forwarded to every helper; ``args.seed`` and
        ``args.checkpoint_dir`` are read directly here.
    - logger
        Logger used for progress messages.
    """
    logger.info("Starting Supervised Fine Tuning...")

    # --- tokenizer and model ---
    logger.debug("Creating the Tokenizer...")
    tokenizer = create_tokenizer(args, logger)
    logger.debug("Creating the Model...")
    model = create_model(args, logger)
    # The value is not read again in this function; the call is kept as-is.
    max_seq_length = get_max_length(model)

    # --- data ---
    logger.debug("Loading and preprocessing train dataset...")
    raw_train = load_dataset_split(args=args, logger=logger, split="train")
    prepared_train = preprocess_dataset(
        args=args, logger=logger, tokenizer=tokenizer, dataset=raw_train
    )
    train_split, val_split = split_dataset(
        prepared_train, train_ratio=0.7, seed=args.seed
    )

    # --- trainer configuration ---
    logger.debug("Creating TrainingArguments ...")
    trainer_args = setup_training_arguments(args)
    logger.debug("Creating PEFT config ...")
    adapter_targets = find_all_linear_names(model, 4)
    logger.debug(f"Layers for Adapters: {adapter_targets}")
    adapter_config = create_peft_config(args, adapter_targets)
    logger.debug("Creating SFTTrainer ...")
    sft_trainer = setup_trainer(
        args, model, tokenizer, adapter_config, train_split, val_split, trainer_args
    )

    # --- train, evaluate, report ---
    logger.debug("Executing the SFTTrainer pipeline")
    display(execute_training_and_evaluation(sft_trainer, args, logger))

    # --- persist: save the model held by the trainer, not the pre-training handle ---
    logger.debug("Saving final model and tokenizer states")
    save_model_and_tokenizer(
        model_dir=args.checkpoint_dir,
        logger=logger,
        model=sft_trainer.model,
        tokenizer=tokenizer,
    )

In [17]:
# ====== UTILS =======
def memory_cleanup():
    """
    Release Python and CUDA memory held by leftover notebook globals.

    Drops the module-level names ``trainer``, ``model`` and ``pipe`` when they
    exist, runs the garbage collector, then asks PyTorch to release its cached
    CUDA memory.

    Only module-level (notebook) globals can be removed from here. Objects that
    are still referenced by a caller's local variables stay alive until that
    caller drops them.
    """
    module_globals = globals()
    for name in ("trainer", "model", "pipe"):
        # pop() rather than `del <name>`: a `del` statement makes the name local
        # to this function, so deleting an existing global that way raised
        # UnboundLocalError.
        module_globals.pop(name, None)
    # Collect first so objects that just became unreachable are freed before
    # the CUDA cache is emptied.
    gc.collect()
    gc.collect()
    torch.cuda.empty_cache()

def configure_cuda_args(args, logger):
    """
    Fill in any CUDA-related arguments that were left unset.

    - ``args.cuda_n_gpus``: when None, set to ``torch.cuda.device_count()``.
    - ``args.cuda_max_memory``: when None, set to a ``{gpu_index: "<N>GB"}``
      mapping with one entry per GPU. N is the first value reported by
      ``torch.cuda.mem_get_info()`` converted to whole GiB, minus 2; the same
      string is used for every GPU index.

    Values that are already set are left untouched. Returns the same ``args``
    object.
    """
    if args.cuda_n_gpus is not None:
        logger.debug("args.cuda_n_gpus already defined.")
    else:
        args.cuda_n_gpus = torch.cuda.device_count()
        logger.debug(f"args.cuda_n_gpus now defined: {args.cuda_n_gpus}")

    if args.cuda_max_memory is not None:
        logger.debug("args.cuda_max_memory already defined.")
    else:
        reported_bytes = torch.cuda.mem_get_info()[0]
        # Keep 2 GB of headroom below the reported figure.
        per_gpu_budget = f"{int(reported_bytes / 1024 ** 3) - 2}GB"
        args.cuda_max_memory = dict.fromkeys(range(args.cuda_n_gpus), per_gpu_budget)
        logger.debug(f"args.cuda_max_memory now defined: {args.cuda_max_memory}")

    return args

# def load_models(args, logger):
#     """
#     Load the base foundation model and the new finetuned model
#     """
#     # TODO: load_models() should be decomposed or functionalized e.g. base, finetune, etc.?
    
#     base_model = AutoModelForCausalLM.from_pretrained(
#         args.model_id,
#         device_map=args.device_map,
#         max_memory=args.cuda_max_memory,
#         torch_dtype=args.bnb_compute_dtype,
#     )
#     log_dtypes(base_model, logger)

#     # Load the fine-tuned model
#     logger.debug("Creating BitsAndBytesConfig ...")
#     bnb_config = configure_bnb(args)
#     new_model = AutoPeftModelForCausalLM.from_pretrained(
#         args.output_dir / "final_checkpoint",
#         device_map=args.device_map,
#         max_memory=args.cuda_max_memory,
#         torch_dtype=args.bnb_compute_dtype,
#         quantization_config=bnb_config,
#     )

#     log_dtypes(new_model, logger)

#     return base_model, new_model


# def merge_models(base_model, new_model, logger):
#     """
#     Merge the LoRa layers into the base model for standalone use
#     """
#     peft_model = PeftModel.from_pretrained(base_model, new_model)
#     peft_model.merge_and_unload()
#     log_dtypes(peft_model, logger)

#     return peft_model


# def save_and_push(args, logger, peft_model, tokenizer):
#     """
#     Save peft model and tokenizer, then push them to the hub
#     """
#     merged_checkpoint_dir = args.output_dir / "final_merged_checkpoint"
#     logger.debug(f"Saving the final checkpoint from PEFT model to {merged_checkpoint_dir}")
#     peft_model.save_pretrained(merged_checkpoint_dir, safe_serialization=True)
#     logger.debug(f"Saving the tokenizer to {merged_checkpoint_dir}")
#     tokenizer.save_pretrained(merged_checkpoint_dir)

#     logger.debug(f"Pushing the PEFT'd model and tokenizer to hub repo {args.repo_name}")
#     peft_model.push_to_hub(args.repo_name, private=True, use_temp_dir=True)
#     tokenizer.push_to_hub(args.repo_name, private=True, use_temp_dir=True)


def save_model_and_tokenizer(logger, model, tokenizer, model_dir):
    """
    Save the model and tokenizer to the specified directory.

    Parameters:
    - logger
        Logger used for progress and error messages.
    - model
        Model object exposing ``save_pretrained``; it is saved with
        ``safe_serialization=True``.
    - tokenizer : PreTrainedTokenizer
        The tokenizer to be saved.
    - model_dir : str
        The directory where the model and tokenizer will be saved.

    Raises:
    - Any exception raised while saving. It is logged and re-raised unchanged,
      so callers see the original exception type and traceback.
    """
    try:
        logger.debug(f"Model saving to {model_dir} ...")
        model.save_pretrained(model_dir, safe_serialization=True)
        logger.debug(f"Tokenizer saving to {model_dir} ..")
        tokenizer.save_pretrained(model_dir)
    except Exception as e:
        logger.error(f"An error occurred while saving the model and tokenizer: {e}")
        # Bare raise: wrapping in Exception(e) would hide the real error type.
        raise

def setup_output_directory(args, logger=None):
    """
    Create the checkpoint and merge-point directories for a run.

    Parameters:
    - args
        Run configuration exposing ``checkpoint_dir`` and ``mergepoint_dir``
        (objects with a ``pathlib.Path``-style ``mkdir``) and ``output_dir``.
    - logger : logging.Logger, optional
        Logger for the debug message. When omitted, a module-level ``logger``
        is used if one exists, otherwise ``logging.getLogger(__name__)``.

    Returns:
    - ``args.output_dir``
    """
    if logger is None:
        # The original relied on a notebook-global `logger`; keep honouring it
        # when present, but do not crash when it is missing.
        logger = globals().get("logger")
        if logger is None:
            logger = logging.getLogger(__name__)
    logger.debug("Creating output directories ...")
    for directory in (args.checkpoint_dir, args.mergepoint_dir):
        # parents/exist_ok make the call safe to repeat on notebook re-runs.
        directory.mkdir(mode=0o777, parents=True, exist_ok=True)
    return args.output_dir

In [18]:
# ========= main ===========
def main():
    """
    Fine-tune the model, merge the trained adapter into the base model, save
    the merged model with its tokenizer, and push both to the Hugging Face Hub.

    Steps:
    1. Build args and logger, fill in CUDA settings, seed the RNGs.
    2. Create the output directories and run ``train`` (which saves the adapter
       checkpoint to ``args.checkpoint_dir``).
    3. Reload the base model without quantization/PEFT, attach the saved
       adapter, and merge it into the base weights.
    4. Save the merged model and tokenizer to ``args.mergepoint_dir`` and push
       them to ``args.repo_name``.

    Raises:
    - Any exception from training is logged and re-raised unchanged.
    """
    args = setup_args()
    # Reuse a notebook-level logger when one exists. Checking
    # `'logger' in globals()` and then reading the *local* name `logger`
    # raised UnboundLocalError whenever that global was present.
    logger = globals().get("logger")
    if logger is None:
        logger = create_logger()
    args = configure_cuda_args(args, logger)
    logger.info(f"Using k={args.cuda_n_gpus} CUDA GPUs with max memory {args.cuda_max_memory}")

    logger.debug(f"Setting reproducibility seed: '{args.seed}'")
    transformers_set_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)

    # The helper reads args.checkpoint_dir / args.mergepoint_dir, so it needs
    # the whole args object rather than just args.output_dir.
    setup_output_directory(args)

    try:
        train(args, logger)
    except Exception as e:
        logger.error(e)
        raise
    finally:
        memory_cleanup()

    base_model = create_model(args=args, logger=logger, bnb_mode=False, peft_mode=False)
    log_dtypes(logger=logger, model=base_model)

    logger.debug("Creating BitsAndBytesConfig ...")
    bnb_config = configure_bnb(args)
    logger.debug("Loading the final checkpoint for the PEFT Model ...")
    # NOTE(review): this quantized copy is only used for dtype logging below;
    # consider dropping it to save GPU memory.
    new_model = AutoPeftModelForCausalLM.from_pretrained(
        args.checkpoint_dir,
        device_map=args.device_map,
        max_memory=args.cuda_max_memory,
        torch_dtype=args.bnb_compute_dtype,
        quantization_config=bnb_config,
    )
    # Keyword arguments throughout: the positional calls used the opposite
    # argument order from the keyword call above.
    log_dtypes(logger=logger, model=new_model)

    logger.debug(f"Creating the final PEFT Model ...")
    # from_pretrained() takes the adapter location, not an already-loaded model.
    peft_model = PeftModel.from_pretrained(base_model, args.checkpoint_dir)
    # merge_and_unload() returns the merged model; keep and use that result.
    merged_model = peft_model.merge_and_unload()
    log_dtypes(logger=logger, model=merged_model)

    # `tokenizer` was never defined in this function; build it the same way
    # train() does.
    tokenizer = create_tokenizer(args, logger)
    save_model_and_tokenizer(model_dir=args.mergepoint_dir, logger=logger, model=merged_model, tokenizer=tokenizer)

    logger.debug(f"Pushing the PEFT'd model and tokenizer to hub repo {args.repo_name}")
    merged_model.push_to_hub(args.repo_name, private=True, use_temp_dir=True)
    tokenizer.push_to_hub(args.repo_name, private=True, use_temp_dir=True)

# if __name__ == "__main__":
#    fire.Fire(main)

---

---

---

# TEST BENCH

In [49]:
def evaluate_predictions(true_labels, predicted_labels):
    """
    Score predicted labels against the ground truth.

    Parameters:
    - true_labels
        Ground-truth labels, passed straight to the sklearn metrics.
    - predicted_labels : list
        Predicted labels. Must support ``.count`` and ``len``. Entries equal to
        -1 are counted as missing (presumably the value used when no label
        could be extracted; confirm against ``extract_label``).

    Returns:
    - (accuracy, weighted_f1, missing_percent)
        ``accuracy_score`` and weighted-average ``f1_score`` exactly as sklearn
        returns them, plus the share of -1 predictions multiplied by 100.
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    weighted_f1 = f1_score(true_labels, predicted_labels, average="weighted")
    n_missing = predicted_labels.count(-1)
    missing_percent = (n_missing / len(predicted_labels)) * 100.0
    return accuracy, weighted_f1, missing_percent

In [52]:
# ========= GENERATION PARAMETERS ===========
@dataclass
class GenerationParams:
    """
    Decoding settings used by the text-generation test bench.

    Every field has a default, so ``GenerationParams()`` yields greedy decoding
    (``num_beams=1``, ``do_sample=False``). ``__text_generation__`` currently
    forwards max_new_tokens, do_sample, temperature, top_p, min_length,
    use_cache, repetition_penalty, length_penalty and num_return_sequences to
    ``model.generate`` and uses max_padding_length as the tokenizer
    ``max_length``; the remaining fields are declared but not consumed there.

    ##### TODO: Convert my generation params into a config for the project
    # from transformers import GenerationConfig
    # generation_config = GenerationConfig( args )
    ### Tip: add `push_to_hub=True` to push to the Hub
    # generation_config.save_pretrained("/tmp", "translation_generation_config.json")
    ### You could then use the named generation config file to parameterize generation
    # generation_config = GenerationConfig.from_pretrained("/tmp", "translation_generation_config.json")
    # outputs = model.generate(**inputs, generation_config=generation_config)
    # tokenizer.batch_decode(outputs, skip_special_tokens=True)
    """
    ## Strategies [https://huggingface.co/docs/transformers/generation_strategies]
    ### Greedy Sampling: beams==1; sample==False
    ### Multinomial Sampling: beams==1; sample==True
    ### Beam Search: beams>1; sample==False
    ### Beam Search + Multinomial Sampling: beams>1; sample==True
    num_beams: int = 1  # Beams used in non-greedy search
    do_sample: bool = False  # Whether or not to use sampling; use greedy decoding otherwise.

    ## Contrastive Search Parameters; penalty_alpha, top_k [https://huggingface.co/blog/introducing-csearch]
    penalty_alpha: float = None  # Unset by default; 0.6 is the alternative value noted by the author.
    # top_k: int = None # 4 # none for greedy

    # TODO: should these be here? or in other args?
    # eos_token_id=model.config.eos_token_id,
    # pad_token=model.config.pad_token_id,

    # TODO: how to unset a parameter in the pre-loaded config?
    # /home/AD/gmatlin3/.conda/envs/conference/lib/python3.8/site-packages/transformers/generation/configuration_utils.py:362: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `None` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.

    ## Other Parameters
    max_new_tokens: int = 100  # The maximum number of tokens to generate
    min_length: int = None  # The minimum length of the sequence to be generated, input prompt + min_new_tokens
    use_cache: bool = True  # Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.
    top_p: float = 1.0  # If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    temperature: float = None  # The value used to modulate the next token probabilities.
    repetition_penalty: float = 1.0  # Parameter for repetition penalty. 1.0 means no penalty.
    length_penalty: int = 1  # Exponential penalty to the length that is used with beam-based generation.
    max_padding_length: int = 4096  # Max padding length to be used with tokenizer padding the prompts.
    num_return_sequences: int = 1
    early_stopping: bool = True
    return_dict_in_generate: bool = False
    output_scores: bool = False

In [57]:
# ========= TEXT GENERATION ===========
def __text_generation__(generation_params:GenerationParams, n_limit:int = -1):
    """
    Generate responses for the test split with the base model and score them.

    Parameters:
    - generation_params : GenerationParams
        Decoding settings; the subset forwarded to ``model.generate`` is listed
        in the call below.
    - n_limit : int
        When >= 1, only the first ``n_limit`` test examples are used (capped at
        the dataset size). Any other value uses the whole split.

    Returns:
    - (test_dataset, generated_texts, true_labels, predicted_labels)
        The preprocessed test dataset, the decoded text for every generated
        sequence across all batches, the ``label_encoded`` column, and the
        label extracted from each decoded text.

    Raises:
    - TypeError from ``model.generate`` is logged and re-raised unchanged.
    """
    print("Getting logger ...")
    # No local `logger` can exist at this point, so one is always created.
    logger = create_logger()

    logger.debug("Setting up argument parameters ...")
    args = setup_args()
    # TODO: Fields should be in args
    label_field = 'label'
    encoded_label_field = f"{label_field}_encoded"
    text_field = 'input_texts'
    truncation_field = True
    padding_field = True

    logger.debug("Configuring CUDA ...")
    # args = configure_cuda_args(args, logger)
    # NOTE(review): device and memory budget are hard-coded for a single GPU.
    device, args.cuda_n_gpus, args.cuda_max_memory = 'cuda:0', 1, {0: '41GB'}

    logger.info(f"Using k={args.cuda_n_gpus} CUDA GPUs with max memory {args.cuda_max_memory}")

    logger.debug(f"Setting reproducibility seed: '{args.seed}'")
    transformers_set_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)

    tokenizer = AutoTokenizer.from_pretrained(args.model_id)
    tokenizer.pad_token = args.EOS
    # Batched generation with a decoder-only model needs left padding, so that
    # every prompt ends right where generation starts.
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(args.model_id,
                                                 torch_dtype=torch.bfloat16,
                                                 device_map=device,
                                                 max_memory=args.cuda_max_memory)

    logger.debug("Specifing some generation configs when loading the model, doesnt work if doing it inside of .generate()!!!")
    model.generation_config.do_sample = generation_params.do_sample
    model.generation_config.temperature = generation_params.temperature

    test_dataset = load_dataset_split(args,logger, "test")

    n_limit = int(n_limit)
    if n_limit >= 1:
        # Cap at the dataset size so an oversized limit does not raise.
        test_dataset = test_dataset.select(range(min(n_limit, len(test_dataset))))

    test_dataset = preprocess_dataset(args, logger, tokenizer, test_dataset)
    logger.debug(f"Creating the Test DataLoader with batch size == {args.per_device_eval_batch_size} ...")
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=args.per_device_eval_batch_size)
    logger.debug(f"Sending the model to device '{device}'")
    model.eval() # TODO: double check I need to use model.eval() here?
    model.to(device)
    logger.info("Generating text ...")
    test_responses = []
    start = time.perf_counter()
    for batch in tqdm(test_dataloader):
        inputs = tokenizer(batch[text_field],
                           padding=padding_field,
                           truncation=truncation_field,
                           max_length=generation_params.max_padding_length,
                           return_tensors="pt")
        inputs = inputs.to(device)
        with torch.no_grad():
            try:
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=generation_params.max_new_tokens,
                    do_sample=generation_params.do_sample,
                    temperature=generation_params.temperature,
                    top_p=generation_params.top_p,
                    # top_k=generation_params.top_k,
                    min_length=generation_params.min_length,
                    use_cache=generation_params.use_cache,
                    repetition_penalty=generation_params.repetition_penalty,
                    length_penalty=generation_params.length_penalty,
                    num_return_sequences=generation_params.num_return_sequences
                )
            except TypeError as e:
                logger.error(f"An error occurred during generation: {e}")
                raise
        test_responses.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

    e2e_inference_time = (time.perf_counter()-start)*1000
    logger.debug(f"the inference time is {e2e_inference_time} ms")

    # NOTE(review): with num_return_sequences > 1 there would be more responses
    # than labels; the scoring below assumes one response per example.
    predicted_labels = [extract_label(response) for response in test_responses]
    logger.debug(f"Predicted label_encoded counts:\n {pd.Series(predicted_labels).value_counts().to_string()}")
    true_labels = test_dataset[encoded_label_field]
    logger.debug(f"Ground truth label_encoded counts:\n {pd.Series(true_labels).value_counts().to_string()}")

    logger.debug("Evaluating prediction metrics ...")
    accuracy_perc, f1_score_perc, missing_perc = evaluate_predictions(true_labels, predicted_labels)

    logger.info(f"Accuracy: {accuracy_perc}")
    logger.info(f"F1 Score: {f1_score_perc}")
    logger.info(f"Missing Percent: {missing_perc}")

    # Return every decoded response, not just the last batch's texts.
    return test_dataset, test_responses, true_labels, predicted_labels

In [None]:
# Getting predictions: run generation + scoring on the test split with the
# default GenerationParams (num_beams=1, do_sample=False, i.e. greedy decoding)
# and keep the returned objects as notebook globals for inspection below.
test_dataset, generated_texts, true_labels, predicted_labels = __text_generation__(GenerationParams())

llama2_finetune - DEBUG - Setting up argument parameters ...
llama2_finetune - DEBUG - Configuring CUDA ...
llama2_finetune - INFO - Using k=1 CUDA GPUs with max memory {0: '41GB'}
llama2_finetune - DEBUG - Setting reproducibility seed: '42'
loading file tokenizer.model from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/tokenizer.model
loading file tokenizer.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-cha

Getting logger ...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing LlamaForCausalLM.

All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /home/AD/gmatlin3/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-chat-hf/snapshots/94b07a6e30c3292b8265ed32ffdeccfdadf434a8/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9
}

llama2_finetune - DEBUG - Specifing some generation configs when loading the model, doesnt work if doing it inside of .generate()!!!
llama2_finetune - DEBUG - Loading test dataset...
llama2_finetune - DEBUG - Preprocessing dataset...


Map:   0%|          | 0/496 [00:00<?, ? examples/s]

llama2_finetune - DEBUG - Filtering dataset to ensure we are below the maximum sequence length


Filter:   0%|          | 0/496 [00:00<?, ? examples/s]

llama2_finetune - DEBUG - Shuffling the data using our seed value
llama2_finetune - DEBUG - Creating the Test DataLoader with batch size == 8 ...
llama2_finetune - DEBUG - Sending the model to device 'cuda:0'
llama2_finetune - INFO - Generating text ...


  0%|          | 0/62 [00:00<?, ?it/s]



In [None]:
# Sanity check: number of decoded texts returned by the generation run above.
len(generated_texts)

---

# SANDBOX

---

---