In [None]:
import os
import json
import random
import torch
from util import generate_training_prompts, initialize_dfs, validation_split, _config
from datasets import Dataset
from transformers import set_seed, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from pprint import pprint



In [2]:
!python3 login.py



In [3]:
seed = 1212

# dataset splits
test_split = 0.1  # 10% of the all
valid_split = 0.05  # 5% of the training set

# directories
output_dir = "./finetuning_mistral7b_v1"
cache_dir = os.path.expanduser("~/.cache/huggingface/")
logs_path = os.path.join(output_dir, "logs.json")

# training
model_name = "mistralai/Mistral-7B-v0.1"
epochs = 1
learning_rate = 2e-3
batch_size = 32
max_seq_length = 512
logging_steps = 20

In [4]:
print("Dataset configuration :")
pprint(_config)

Dataset configuration :
{'limit_num_fuzzy_matches': 10,
 'min_fuzzy_score': 20,
 'min_sentence_length': 10,
 'target_path': 'data/all.csv'}


In [5]:
set_seed(seed)

In [6]:
df_train, _ = initialize_dfs(test=test_split)

Constructed dataframe found. Reading source...
Non-empty dataframe read.
Dataframes created: split index = 10119


In [7]:
prompts = generate_training_prompts(df_train, shots=0) + generate_training_prompts(df_train, shots=1, fuzzy=True)
random.shuffle(prompts)

In [8]:
print(type(prompts))
print(prompts[0], "\n")
print(prompts[-1])

<class 'list'>
English: This open.
French: Aussi ouvert.
 

English: "As long as the surge doesn't get too much bigger than they predict, we'll be okay.
French: Tant que l'onde de tempête n'est pas plus forte que ce qu'ils prévoient, tout ira bien.



In [9]:
dataset = Dataset.from_dict({"text": prompts})
dataset = validation_split(dataset, validation=valid_split)
pprint(dataset)

{'train': Dataset({
    features: ['text'],
    num_rows: 19226
}),
 'validation': Dataset({
    features: ['text'],
    num_rows: 1012
})}


In [10]:
pprint(dataset['train'][4])

{'text': "English: He couldn't be trusted to talk to anyone, much less forge "
         'relationships with people.\n'
         'French: On ne pouvait pas lui faire confiance pour parler à qui que '
         'ce soit, et encore moins pour nouer des relations avec les gens.\n'}


In [11]:
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [12]:
# if there is an error, that directories related to cuda are not found, the easiet solution is to reinstall bitsandbytes

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=nf4_config,
    use_cache=False,
    cache_dir=cache_dir,
)

ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    add_bos_token=True,
    add_eos_token=False,
)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
peft_config = LoraConfig(
    lora_alpha=16, lora_dropout=0.1, r=64, bias="none", task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
assert torch.cuda.is_available(), "CUDA is not available"
print("CUDA device:", torch.cuda.get_device_name(0))
print("Model device:", next(model.parameters()).device)

In [None]:
sft_config = SFTConfig(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=0,
    logging_steps=logging_steps,
    do_train=True,
    save_strategy="epoch",
    do_eval=True,
    eval_strategy="epoch",
    learning_rate=learning_rate,
    bf16=True,
    lr_scheduler_type="constant",
    max_seq_length=max_seq_length,
    packing=True,
    dataset_text_field="text",
)

In [None]:
trainer = SFTTrainer(
    model=model,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
)

In [None]:
tokenizer.decode(trainer.train_dataset["input_ids"][0])

In [None]:
trainer.train()

In [None]:
logs = trainer.state.log_history
with open(logs_path, "w") as log:
    log.write(json.dumps(logs, indent=2))