In [None]:
# Get the project root directory and add it to the system path
import os
import sys

# Resolve the parent of the current working directory and prepend it to the
# import path so that modules located there become importable from this notebook.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
sys.path.insert(0, project_root)

In [None]:
import json
import torch
from util import generate_instruction_prompts, initialize_dfs, validation_split, login_to_hf, _config
from datasets import Dataset
from transformers import set_seed, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from pprint import pprint

In [None]:
# Authenticate using the project's `login_to_hf` helper (imported from `util`).
# NOTE(review): presumably logs in to the Hugging Face Hub so the base model can
# be downloaded — confirm against the helper's implementation.
login_to_hf()

In [None]:
# Training configurations.
#
# Every experiment version shares the same base model, sequence length and
# warmup setting; only the values passed explicitly below differ between runs.
def _training_config(
    output_dir,
    *,
    epochs,
    learning_rate,
    batch_size,
    logging_steps,
    completion_only_loss,
    eval_strategy,
    eval_steps=None,
):
    """Build one experiment configuration dictionary.

    The keys (and their order) are the ones read further down when the
    output directory, model, tokenizer and ``SFTConfig`` are set up.
    """
    return {
        "model_name": "mistralai/Mistral-7B-v0.1",
        "output_dir": output_dir,
        "epochs": epochs,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "max_seq_length": 512,
        "logging_steps": logging_steps,
        "completion_only_loss": completion_only_loss,
        "warmup_steps": 0,
        "eval_strategy": eval_strategy,
        "eval_steps": eval_steps,
    }


# v1: single epoch, learning rate 2e-3, loss over the full sequence.
v1 = _training_config(
    "mistral7b_v1",
    epochs=1,
    learning_rate=2e-3,
    batch_size=32,
    logging_steps=20,
    completion_only_loss=False,
    eval_strategy="epoch",
)

# v2: identical to v1 except for the lower learning rate (1e-4).
v2 = _training_config(
    "mistral7b_v2",
    epochs=1,
    learning_rate=1e-4,
    batch_size=32,
    logging_steps=20,
    completion_only_loss=False,
    eval_strategy="epoch",
)

# v3: two epochs, larger batch, completion-only loss, step-based evaluation.
v3 = _training_config(
    "mistral7b_v3",
    epochs=2,
    learning_rate=1e-4,
    batch_size=64,
    logging_steps=32,
    completion_only_loss=True,
    eval_strategy="steps",
    eval_steps=64,
)

model_config = v3  # Change this to v1, v2, or v3 as needed

In [None]:
# Seed handed to `set_seed` further down in the notebook.
seed = 1212

# dataset splits
test_split = 0.1  # fraction passed to `initialize_dfs(test=...)`; intended as 10% of all data
valid_split = 0.05  # fraction passed to `validation_split(validation=...)`; intended as 5% of the training set

# directories
# Per-configuration output folder, located under the project root.
output_dir = os.path.join(project_root, model_config['output_dir'])
# Passed as `cache_dir` when the model and tokenizer are loaded.
cache_dir = os.path.expanduser("~/.cache/huggingface/")
# File the trainer's log history is written to at the end of the notebook.
logs_path = os.path.join(output_dir, "logs.json")

In [None]:
# Show the `_config` object imported from `util` so the settings used for this
# run are recorded in the notebook output.
print("Dataframe configuration :")
pprint(_config)

In [None]:
# Fix the random seed (transformers' `set_seed`) before any data preparation
# or model initialisation takes place.
set_seed(seed)

In [None]:
# Only the first returned value is kept; the second one is discarded.
# NOTE(review): presumably the helper returns (train, test) dataframes with
# `test_split` as the held-out fraction — confirm against `util.initialize_dfs`.
df_train, _ = initialize_dfs(test=test_split)

In [None]:
# Build prompt/completion pairs twice from the same training frame: once with
# shots=0 and once with shots=1 (fuzzy=True). The call order is kept as-is so
# that any seeded randomness inside the helper is consumed in the same sequence.
_, p0, c0 = generate_instruction_prompts(df_train, shots=0)
_, p1, c1 = generate_instruction_prompts(df_train, shots=1, fuzzy=True)

# Concatenate both variants into a single training corpus.
prompts, completions = p0 + p1, c0 + c1

# Wrap the pairs in a `Dataset` and carve out the validation subset.
dataset = validation_split(
    Dataset.from_dict({"prompt": prompts, "completion": completions}),
    validation=valid_split,
)
pprint(dataset)

In [None]:
# bitsandbytes quantization settings used when loading the base model:
# 4-bit weights with the "nf4" quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the base model selected in `model_config` with the 4-bit quantization
# config defined above.
# Troubleshooting: if loading fails because CUDA-related directories cannot be
# found, the easiest fix is to reinstall bitsandbytes.

model = AutoModelForCausalLM.from_pretrained(
    model_config['model_name'],
    device_map="auto",
    quantization_config=nf4_config,
    use_cache=False,
    cache_dir=cache_dir,
    # attn_implementation='flash_attention_2'  # not compatible with current versions of torch and cuda
)

In [None]:
# Load the tokenizer that matches the base model. It is configured to prepend
# BOS but not to append EOS on its own.
tokenizer = AutoTokenizer.from_pretrained(
    model_config["model_name"],
    cache_dir=cache_dir,
    add_bos_token=True,
    add_eos_token=False,
)

# Reuse the EOS token as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# LoRA adapter settings: rank 64, alpha 16, dropout 0.1, no bias training,
# causal language-modelling task type.
peft_config = LoraConfig(
    lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM"
)

# Prepare the quantized model for k-bit training first, then wrap it with the
# LoRA adapters; after this cell `model` is the PEFT-wrapped model.
# NOTE: re-running this cell without reloading the base model wraps the
# already-wrapped `model` again, so reload the model before re-executing it.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
# Sanity check: abort early when no CUDA device is visible, otherwise report
# the name of GPU 0 and the device holding the model's first parameter.
assert torch.cuda.is_available(), "CUDA is not available"
print("CUDA device:", torch.cuda.get_device_name(0))
print("Model device:", next(model.parameters()).device)

In [None]:
# Trainer arguments. Every tunable value is read from the selected
# `model_config`; the remaining settings are fixed for all experiment versions.
sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=model_config["epochs"],
    per_device_train_batch_size=model_config["batch_size"],
    per_device_eval_batch_size=model_config["batch_size"],
    warmup_steps=model_config["warmup_steps"],
    logging_steps=model_config["logging_steps"],
    do_train=True,
    save_strategy="epoch",
    do_eval=True,
    eval_strategy=model_config["eval_strategy"],
    eval_steps=model_config["eval_steps"],
    learning_rate=model_config["learning_rate"],
    bf16=True,
    # NOTE(review): the plain "constant" schedule is understood to apply no
    # warmup, so a non-zero `warmup_steps` would need "constant_with_warmup"
    # — confirm before changing `warmup_steps` from 0.
    lr_scheduler_type="constant",
    # `SFTConfig` renamed `max_seq_length` to `max_length`; the old keyword is
    # deprecated and dropped in later TRL releases. The config dictionary key
    # keeps its original name.
    max_length=model_config["max_seq_length"],
    packing=True,
    completion_only_loss=model_config["completion_only_loss"],
)

In [None]:
# Build the supervised fine-tuning trainer.
#
# `model` has already been wrapped with the LoRA adapters via `get_peft_model`
# in an earlier cell, so `peft_config` is deliberately NOT passed here again.
# Handing an already-wrapped PEFT model plus a `peft_config` to the trainer is
# at best redundant and, depending on the TRL version, may be rejected or make
# the trainer merge and re-create the adapter.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

In [None]:
# Decode the first processed training example to visually verify what the
# trainer will actually see. Index the row first and then the column, so only
# one example is read instead of materialising the whole `input_ids` column.
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

In [None]:
# Run fine-tuning with the arguments in `sft_config`; a checkpoint is saved at
# the end of every epoch (save_strategy="epoch").
trainer.train()

In [None]:
# Persist the trainer's log history as pretty-printed JSON at `logs_path`.
logs = trainer.state.log_history
with open(logs_path, "w") as log:
    json.dump(logs, log, indent=2)