In [3]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [5]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [7]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("mlabonne/FineTome-100k")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/982 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [6]:
from transformers import TrainerCallback

class GoogleDriveBackupCallback(TrainerCallback):
    def on_save(self, args, state, control, **kwargs):
        # Copy the checkpoint to Google Drive
        local_checkpoint_dir = args.output_dir
        drive_checkpoint_dir = "/content/drive/My Drive/Checkpoints"
        shutil.copytree(local_checkpoint_dir, drive_checkpoint_dir, dirs_exist_ok=True)
        print(f"Checkpoint backed up to {drive_checkpoint_dir}")

In [12]:

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/content/drive/MyDrive/Checkpoints",
        report_to = "none", # Use this for WandB etc
        resume_from_checkpoint= "/content/drive/MyDrive/Checkpoints",
        save_steps = 500,
        save_strategy = "steps",
    ),
)

Map (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [None]:
trainer_stats = trainer.train(resume_from_checkpoint = True)
#resume_from_checkpoint = True

  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 12,500
 "-____-"     Number of trainable parameters = 11,272,192
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss
3501,0.5801
3502,0.9776
3503,0.8208
3504,0.6881
3505,0.7593
3506,0.9709
3507,1.1903
3508,0.5025
3509,0.9825
3510,0.9791


In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import shutil

# Define local and Google Drive paths
local_checkpoint_dir = "outputs"
drive_checkpoint_dir = "/content/drive/My Drive/Checkpoints"

# Copy the checkpoints to Google Drive
shutil.copytree(local_checkpoint_dir, drive_checkpoint_dir, dirs_exist_ok=True)

print(f"Checkpoints uploaded to {drive_checkpoint_dir}")

Checkpoints uploaded to /content/drive/My Drive/Checkpoints


In [22]:
model.push_to_hub("PierreJousselin/llamav2", token = "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") # Online saving
tokenizer.push_to_hub("PierreJousselin/llamav2", token = "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") # Online saving

README.md:   0%|          | 0.00/604 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/PierreJousselin/llamav2


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [8]:
dataset = load_dataset("mlabonne/FineTome-100k", split="train")
dataset = dataset.map(formatting_prompts_func, batched = True,)
inputs_text = [dataset[i]['conversations'][0]['value'] for i in range(0,100)]  # Replace with the correct column name
references = [dataset[i]['conversations'][1]['value'] for i in range(0,100)]   # Replace with the correct column name

In [17]:
checkpoint_path = "/content/drive/MyDrive/Checkpoints/checkpoint-4000"  # Update with your checkpoint path

# Load the model
model, tokenizer = FastLanguageModel.from_pretrained(checkpoint_path)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [18]:
from unsloth.chat_templates import get_chat_template
predictions=[]
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

for text in inputs_text:
  messages = [
      {"role": "user", "content": text},
  ]

  inputs = tokenizer.apply_chat_template(
      messages,
      tokenize = True,
      add_generation_prompt = True, # Must add for generation
      return_tensors = "pt",
  ).to("cuda")

  outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True,
                          temperature = 1.5, min_p = 0.1)
  reponse=tokenizer.batch_decode(outputs)
  cleaned_response = reponse[0]  # Extract the first element if the output is a list
  cleaned_response = cleaned_response.split("<|start_header_id|>assistant<|end_header_id|>")[-1]  # Split at assistant tag
  cleaned_response = cleaned_response.split("<|eot_id|>")[0]  # Stop at the end-of-text tag
  cleaned_response = cleaned_response.strip()
  predictions.append(cleaned_response)

In [19]:
references[0]

'Boolean operators are logical operators used in programming to manipulate boolean values. They operate on one or more boolean operands and return a boolean result. The three main boolean operators are "AND" (&&), "OR" (||), and "NOT" (!).\n\nThe "AND" operator returns true if both of its operands are true, and false otherwise. For example:\n\n```python\nx = 5\ny = 10\nresult = (x > 0) and (y < 20)  # This expression evaluates to True\n```\n\nThe "OR" operator returns true if at least one of its operands is true, and false otherwise. For example:\n\n```python\nx = 5\ny = 10\nresult = (x > 0) or (y < 20)  # This expression evaluates to True\n```\n\nThe "NOT" operator negates the boolean value of its operand. It returns true if the operand is false, and false if the operand is true. For example:\n\n```python\nx = 5\nresult = not (x > 10)  # This expression evaluates to True\n```\n\nOperator precedence refers to the order in which operators are evaluated in an expression. It ensures that 

In [20]:
predictions = [p.strip() for p in predictions if p.strip()]
references = [r.strip() for r in references if r.strip()]

In [21]:
from bert_score import score

# Compute BERTScore
P, R, F1 = score(predictions, references, lang="en", model_type="bert-base-uncased")

# Print results
print(f"Precision: {P.mean().item()}")
print(f"Recall: {R.mean().item()}")
print(f"F1 Score: {F1.mean().item()}")

Precision: 0.6602679491043091
Recall: 0.5098039507865906
F1 Score: 0.5739404559135437
