In [None]:
import os
import json
import random
import torch
from util import generate_simple_prompts, initialize_df, split_dataset, _config
from datasets import Dataset
from transformers import set_seed, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from pprint import pprint

In [None]:
# Run the project's login.py script in a subprocess via the IPython shell escape.
# NOTE(review): presumably this authenticates against the Hugging Face Hub so the
# model download below works — confirm against login.py.
!python3 login.py

In [None]:
# --- Reproducibility -------------------------------------------------------
seed = 1212

# --- Dataset split fractions (passed to split_dataset) ---------------------
train_split = 0.85
validation_split = 0.05
test_split = 0.1

# --- Filesystem locations --------------------------------------------------
cache_dir = os.path.expanduser("~/.cache/huggingface/")
output_dir = "./finetuning_mistral7b_v1"
# JSON dump of the trainer's log history, written after training.
logs_path = os.path.join(output_dir, "logs.json")

# --- Base checkpoint and training hyper-parameters -------------------------
model_name = "mistralai/Mistral-7B-v0.1"
epochs = 1
# Used as both the per-device train and per-device eval batch size.
batch_size = 32
max_seq_length = 512
learning_rate = 2e-3
logging_steps = 20

In [None]:
# Display the `_config` object imported from the project's `util` module so the
# dataset settings used for this run are recorded in the notebook output.
print("Dataset configuration :")
pprint(_config)

In [None]:
# Seed the RNGs up front (transformers' set_seed covers Python `random`, NumPy
# and torch), before the prompt shuffle and model/adapter initialisation below.
set_seed(seed)

In [None]:
# Load the source data through the project helper.
# NOTE(review): presumably a pandas DataFrame, judging by the name — confirm in util.
df = initialize_df()

In [None]:
# Build the corpus from two prompt variants of the same source data:
# zero-shot prompts first, then fuzzy one-shot prompts.
zero_shot_prompts = generate_simple_prompts(df, shots=0)
one_shot_prompts = generate_simple_prompts(df, shots=1, fuzzy=True)
prompts = zero_shot_prompts + one_shot_prompts

# In-place shuffle using the global `random` state.
# NOTE(review): both variants derive from the same `df`, so depending on how
# split_dataset partitions, variants of one example could land in different
# splits — confirm this is acceptable for evaluation.
random.shuffle(prompts)

In [None]:
# Sanity check: container type, then the first and last prompts after shuffling.
print(type(prompts))
print(prompts[0], "\n")
print(prompts[-1])

In [None]:
# Wrap the prompts in a single-column ("text") Dataset and hand it straight to
# the project splitter with the fractions from the config cell.
dataset = split_dataset(
    Dataset.from_dict({"text": prompts}),
    train=train_split,
    test=test_split,
    validation=validation_split,
)
pprint(dataset)

In [None]:
# Spot-check one training example (row index 4 of the "train" split).
pprint(dataset['train'][4])

In [None]:
# Quantisation settings for loading the base model: 4-bit weights in the NF4
# format with double quantisation enabled, computing in bfloat16.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Load the base causal-LM checkpoint with the NF4 quantisation config.
# The KV cache is disabled (use_cache=False) for this training run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=nf4_config,
    device_map="auto",
    use_cache=False,
)

In [None]:
# Tokenizer for the same checkpoint: prepend BOS, do not append EOS automatically.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_eos_token=False,
    add_bos_token=True,
    cache_dir=cache_dir,
)

# Pad on the right, reusing the EOS token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Prepare the quantised model for k-bit training, then attach the LoRA adapter.
# Note: this rebinds `model`, so re-running the cell wraps the model again.
model = get_peft_model(prepare_model_for_kbit_training(model), peft_config)

In [None]:
# Fail fast when no GPU is visible, then report which device is in use.
assert torch.cuda.is_available(), "CUDA is not available"

gpu_name = torch.cuda.get_device_name(0)
print("CUDA device:", gpu_name)

# Device of the first model parameter (other parameters may live elsewhere
# because the model was loaded with device_map="auto").
first_param = next(model.parameters())
print("Model device:", first_param.device)

In [None]:
# Training arguments for the supervised fine-tuning run: one checkpoint and one
# evaluation pass per epoch, constant learning rate with no warm-up, bf16
# precision, and packed sequences built from the "text" column.
sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=0,
    logging_steps=logging_steps,
    do_train=True,
    save_strategy="epoch",
    do_eval=True,
    # `evaluation_strategy` is the deprecated spelling of this argument; the
    # Transformers releases that accept `processing_class` in the trainer use
    # `eval_strategy`, and the old name warns or is rejected there.
    eval_strategy="epoch",
    learning_rate=learning_rate,
    bf16=True,
    lr_scheduler_type="constant",
    # NOTE(review): newer TRL releases rename this option to `max_length` —
    # confirm against the installed TRL version.
    max_seq_length=max_seq_length,
    packing=True,
    dataset_text_field="text",
)

In [None]:
# Build the SFT trainer on the train/validation splits.
# `model` is already a PEFT-wrapped model (get_peft_model was applied when the
# LoRA config was created), so `peft_config` is deliberately NOT passed again:
# supplying both is redundant and, depending on the TRL version, is either
# ignored or causes the existing adapter to be merged and a new one created.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

In [None]:
# Decode the first processed training sample to check what the trainer will
# actually see. Index the row first, then the column: this reads a single row
# instead of going through the whole `input_ids` column.
tokenizer.decode(trainer.train_dataset[0]["input_ids"])

In [None]:
# Run the fine-tuning loop; the call's return value is shown as the cell output.
trainer.train()

In [None]:
# Persist the trainer's accumulated log history as pretty-printed JSON.
logs = trainer.state.log_history
with open(logs_path, "w") as log_file:
    serialized_logs = json.dumps(logs, indent=2)
    log_file.write(serialized_logs)