In [2]:
# Import necessary libraries
import mlflow
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, pipeline, logging
from peft import LoraConfig
from trl import SFTTrainer
import torch
import gc

# Name of the directory (under the project's artifact path) that receives the
# fine-tuned LoRA adapter and its tokenizer.
new_model = "llama-2-7b-chat-guanaco"
# Project identifier read from the environment; it is interpolated into every
# /artifacts/mlflow/... path and into the MLflow experiment name.
# A missing DOMINO_PROJECT_ID raises KeyError right here.
project_id = os.environ["DOMINO_PROJECT_ID"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Allocator setting handed to PyTorch through the environment
# (max_split_size_mb of 1024).
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:1024")

# Run a full garbage-collection pass before any large objects are created.
gc.collect()



# Identifiers passed to load_dataset / from_pretrained: the instruction
# dataset to train on and the checkpoint to fine-tune.
guanaco_dataset = "mlabonne/guanaco-llama2-1k"
base_model = "NousResearch/Llama-2-7b-chat-hf"

# Only the "train" split is loaded; it is handed to the trainer unchanged.
dataset = load_dataset(path=guanaco_dataset, split="train")

# 4-bit quantization settings: NF4 quantization type, float16 as the compute
# dtype, double quantization switched off.
compute_dtype = torch.float16
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Load the base checkpoint with the 4-bit quantization config applied.
# Downloaded files are cached under the project's artifact directory, and
# device placement is left to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=quant_config,
    cache_dir=f"/artifacts/mlflow/{project_id}/llama2-model-cache/",
)

# Config overrides applied before training: pretraining_tp is pinned to 1
# and use_cache is turned off.
model.config.pretraining_tp = 1
model.config.use_cache = False

# Load the tokenizer that belongs to the base checkpoint, using the same
# cache directory as the model weights.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    trust_remote_code=True,
    cache_dir=f"/artifacts/mlflow/{project_id}/llama2-model-cache/",
)

# Pad on the right, and reuse the end-of-sequence token as the pad token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# LoRA adapter hyper-parameters for causal language modelling:
# rank 64, alpha 16, 10% dropout on the adapter path, no bias terms trained.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)


# Define training parameters
# One epoch over the dataset (max_steps=-1 leaves the epoch count in charge),
# batches of 8 per device with no gradient accumulation, a constant learning
# rate of 2e-4, and a checkpoint plus a log entry every 25 steps. Checkpoints
# are written to ./results. Both fp16 and bf16 mixed precision are off.
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    # report_to=None does not mean "off": Transformers 4.x resolves None to
    # every installed integration, and has announced that the default will
    # become "none" in v5. Training runs inside an MLflow run, so name MLflow
    # explicitly instead of depending on a version-specific default.
    report_to="mlflow",
)

# Build the supervised fine-tuning trainer. It receives the quantized model,
# the LoRA configuration, and the dataset, whose "text" column holds the
# training samples. Sequence packing is disabled and no explicit maximum
# sequence length is given.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)

# Housekeeping after setup: collect unreachable Python objects first, then
# ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.51s/it]


In [3]:
# Fine-tune inside an MLflow run that belongs to a per-project experiment.
experiment_name = f'llama2-7b-4bit-lora-sft-{project_id}'
exp = mlflow.set_experiment(experiment_name)
print("Fine-tuning model:")

# The adapter weights and the tokenizer are written to the same directory.
adapter_dir = f"/artifacts/mlflow/{project_id}/{new_model}"
with mlflow.start_run() as run:
    trainer.train()
    # Persist the trained model first, then the tokenizer, while the run is
    # still open.
    for component in (trainer.model, trainer.tokenizer):
        component.save_pretrained(adapter_dir)

# Test the model
# Quick qualitative check of the fine-tuned model on a single prompt.
logging.set_verbosity(logging.CRITICAL)

# trainer.train() leaves the model in training mode, and pipeline() does not
# call .eval() on a model object that is passed in. Without this switch the
# LoRA dropout layers (lora_dropout=0.1) would stay active and randomly
# perturb the generated text.
model.eval()

prompt = "Who is Leonardo Da Vinci?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
# The literal "<s>[INST] ... [/INST]" wrapper is kept as written so the
# prompt uses the same chat template as before.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

Fine-tuning model:


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,1.4654
50,1.4289
75,1.3055
100,1.3426
125,1.3948




<s>[INST] Who is Leonardo Da Vinci? [/INST] Leonardo da Vinci (1452-1519) was an Italian polymath, artist, inventor, and scientist. He is widely considered one of the greatest painters of all time, and his inventions and designs were centuries ahead of his time. He is known for his famous works such as the Mona Lisa, The Last Supper, and Vitruvian Man. He also made significant contributions to engineering, anatomy, and mathematics. Da Vinci was a true Renaissance man, and his legacy continues to inspire and influence people around the world.


In [3]:
# Run this cell on a fresh kernel: kill the kernel, run the first cell, then
# run this one.
# Two garbage-collection passes, followed by emptying PyTorch's CUDA cache.
for _ in range(2):
    gc.collect()
torch.cuda.empty_cache()

# Reload model in FP16 and merge it with LoRA weights
from peft import LoraConfig, PeftModel

model_name = "NousResearch/Llama-2-7b-chat-hf"

# Reload the base checkpoint in float16 (no quantization config this time),
# reading from the same cache directory used during training.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    return_dict=True,
    low_cpu_mem_usage=True,
    cache_dir=f"/artifacts/mlflow/{project_id}/llama2-model-cache/",
)

# Attach the saved LoRA adapter to the base model, then merge it in and drop
# the adapter wrappers.
model = PeftModel.from_pretrained(
    base_model, f"/artifacts/mlflow/{project_id}/{new_model}"
).merge_and_unload()

# Write the merged model to its own checkpoint directory, creating the
# directory if it does not exist yet.
output_merged_dir = f"/artifacts/mlflow/{project_id}/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir)

# Reload tokenizer to save it
# The tokenizer is saved next to the merged weights so the checkpoint
# directory can be loaded on its own. It reads from the same cache directory
# as every other from_pretrained call in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=f"/artifacts/mlflow/{project_id}/llama2-model-cache/",
    trust_remote_code=True,
)
# No new '[PAD]' token is registered here: that would add a vocabulary entry
# the merged model's embedding matrix was never resized for. As in the
# training setup, the end-of-sequence token doubles as the pad token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
tokenizer.save_pretrained(output_merged_dir)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.02it/s]

Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.


('/artifacts/mlflow/65cd15a3b25a7b376a398426/final_merged_checkpoint/tokenizer_config.json',
 '/artifacts/mlflow/65cd15a3b25a7b376a398426/final_merged_checkpoint/special_tokens_map.json',
 '/artifacts/mlflow/65cd15a3b25a7b376a398426/final_merged_checkpoint/tokenizer.json')