<a href="https://colab.research.google.com/github/jlopetegui98/RebornToBeWilde/blob/main/2-FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Fine tuning Mistral-7B (instruction) with Literary texts**

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, platform, gradio, warnings
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from huggingface_hub import notebook_login
import json
import utils

2024-03-04 11:00:44.532796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 11:00:44.532827: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 11:00:44.533649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-04 11:00:44.537434: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from utils import *

In [3]:
# model to fine-tune
#model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
model_name = "mistralai/Mistral-7B-Instruct-v0.1"

dataset_name = "finetuning_instructions_dataset_POLICIER"

In [4]:
# path to the data
# root_path = '.' # comment if using colab
root_path = './POLICIER' # uncomment if using colab
dir_data = f'{root_path}/data/{dataset_name}.json'
models_path = f'{root_path}/models'

In [6]:
# load the dataset
data_dict = json.load(open(dir_data))
dataset = Dataset.from_dict(data_dict)
dataset = dataset.train_test_split(test_size = 0.1, seed = 42)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 11376
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1264
    })
})

In [7]:
import random

In [8]:
dataset_red = dataset['train'].filter(lambda x:random.random()<0.2)
len(dataset_red)

Filter:   0%|          | 0/11376 [00:00<?, ? examples/s]

2343

In [9]:
# example of the dataset
print(dataset['train']['text'][0])

<s>[INST]These are the first lines of a literary work. Continue the text below, closely following its style and content. -même et je retombai dans mon accablement.  Il se fit. un moment de silence. Etait-ce un reproche ou une plainte de la part de Sylvio ?  A la fin il se leva, vint à moi, et prit une pièce d’or : — Je veux en avoir le cœur net, me dit-il ; où est-elle ? je vais l’acheter.  — Toi, Sylvio ?  — Moi-même ! Que t’importe d’ailleurs qui l’achète, puisque chacun a le droit d’être ton rival ? Insensé [/INST] ! tout à l’heure il se moquait de ma passion vagabonde, et le voilà aujourd’hui brisé sous la honte qu’il n’a pas faite ! Toute la terre peut posséder sa maîtresse, excepté lui ; et il va mourir de rage sur le seuil de cette porte ! Encore s’il n’avait pas d’argent dans sa bourse ! mais, à cette heure, il a de quoi payer vingt fois celle qu’il aime ! Il tient là cette femme vénale sur ce marbre ; il peut acheter, s’il le veut, trois mois de la vie de cette femme, et à la 

In [10]:
# Load base model(Mistral 7B-Instruct)
# quantization config
model = load_model(model_name, adapt = True)
# print the model
#model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map={'':torch.cuda.current_device()})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
tokenizer = load_tokenizer(model_name)

In [12]:
peft_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj","gate_proj"]
    )

In [13]:
#Hyperparamters for fine tuning
training_arguments = TrainingArguments(
    output_dir= "./POLICIER/results",
    num_train_epochs=1,
    per_device_train_batch_size= 4,
    gradient_accumulation_steps= 2,
    optim = "paged_adamw_8bit",
    save_steps= 1000,
    logging_steps= 30,
    learning_rate= 2e-4,
    weight_decay= 0.001,
    fp16= False,
    bf16= False,
    max_grad_norm= 0.3,
    max_steps= -1,
    warmup_ratio= 0.3,
    group_by_length= True,
    lr_scheduler_type= "constant",
    report_to="wandb",
    evaluation_strategy="steps", # Evaluate the model every logging step
    eval_steps=50,               # Evaluate and save checkpoints every 50 steps
    do_eval=True,  
)

 

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_red,
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

Map:   0%|          | 0/2292 [00:00<?, ? examples/s]

Map:   0%|          | 0/1254 [00:00<?, ? examples/s]



In [14]:
# Train the model
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33mjeanbarre[0m ([33mliterary_tuning[0m). Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss
50,2.3449,2.160545
100,2.0978,2.056311
150,2.0367,2.008884
200,1.9865,1.981664
250,1.9519,1.952787


TrainOutput(global_step=286, training_loss=2.0539300091616757, metrics={'train_runtime': 4440.0092, 'train_samples_per_second': 0.516, 'train_steps_per_second': 0.064, 'total_flos': 4.252809500432794e+16, 'train_loss': 2.0539300091616757, 'epoch': 1.0})

In [None]:
trainer.evaluate()

In [16]:
trainer.evaluate()

{'eval_loss': 2.042064666748047,
 'eval_runtime': 268.7846,
 'eval_samples_per_second': 3.166,
 'eval_steps_per_second': 0.398,
 'epoch': 2.0}

In [17]:
# evaluate the model
trainer.evaluate()

{'eval_loss': 1.9740540981292725,
 'eval_runtime': 272.384,
 'eval_samples_per_second': 3.124,
 'eval_steps_per_second': 0.393,
 'epoch': 2.0}

In [16]:
# save fine tuned model
model_save_name = 'Mistral7B_fine_tuned_AVENTURES.pt'
path = f"{models_path}/{model_save_name}"
torch.save(model.state_dict(), path)

In [17]:
print("Model saved successfully!")

Model saved successfully!
