# LAB 2 FineTuning Lama

## 1. Download and SetUp

In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install datasets


In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.2: Fast Llama patching. Transformers:4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
import requests
import json
from pathlib import Path
import os

def download_github_folder():
    # Convert GitHub URL to API URL for the repository content
    api_url = "https://api.github.com/repos/davidenascivera/LAB2_Scalable/contents/DATA"

    response = requests.get(api_url)
    if response.status_code != 200:
        raise Exception("Failed to fetch repository contents")

    # Create DATA directory if it doesn't exist
    os.makedirs('DATA', exist_ok=True)

    # Download all JSONL files
    for item in response.json():
        if item['name'].endswith('.jsonl'):
            file_url = item['download_url']
            file_content = requests.get(file_url).text

            # Save individual files
            with open(f"DATA/{item['name']}", 'w') as f:
                f.write(file_content)

def merge_jsonl_files():
    # Get all JSONL files in DATA directory
    jsonl_files = list(Path('DATA').glob('*.jsonl'))

    # Merge all files into one
    with open('merged.jsonl', 'w') as outfile:
        for jsonl_file in jsonl_files:
            with open(jsonl_file, 'r') as infile:
                for line in infile:
                    outfile.write(line)

# Execute the script
download_github_folder()
merge_jsonl_files()
print("Files downloaded and merged successfully!")

Files downloaded and merged successfully!


In [4]:
# Load and prepare dataset
def convert_to_conversation(example):
    return {
        "conversations": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": example["prompt"]},
            {"role": "assistant", "content": example["completion"]}
        ]
    }
from datasets import load_dataset
from datasets import Dataset

# Load your dataset
recipe_dataset = load_dataset('json', data_files='/content/merged.jsonl')['train']
recipe_dataset = recipe_dataset.map(
    convert_to_conversation,
    remove_columns=recipe_dataset.column_names
)
dataset = Dataset.from_dict({"conversations": recipe_dataset['conversations']})

# Format prompts
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(
        convo,
        tokenize=False,
        add_generation_prompt=False
    ) for convo in convos]
    return {"text": texts}

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1751 [00:00<?, ? examples/s]

In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.2 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [6]:
from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(dataset)
dataset = dataset.map(formatting_prompts_func, batched = True,)

Standardizing format:   0%|          | 0/1751 [00:00<?, ? examples/s]

Map:   0%|          | 0/1751 [00:00<?, ? examples/s]

In [7]:
dataset[5]["conversations"]

[{'content': 'You are a helpful assistant.', 'role': 'system'},
 {'content': 'What methods can I use to cook applesauce?', 'role': 'user'},
 {'content': '**preparation time**: 4 minutes\n**cooking time**: 3 minutes\n**portions**: 3\n**ingredients**:\n- 2 apples\n- peeled\n- pureed with the blender juice of ½ lemon 1 teaspoon of honey 50ml of milk a handful of pumpkin seeds\n**procedure**:1. toast the seeds pumpkin at 180 degrees in the fryer for 3 minutes\n2. in the meantime, blend the apple pulp with the milk, lemon and honey\n3. pour the puree into glasses, add the pumpkin seeds and serve\n4. Enjoy your meal!\n**nutrients**:\n- fat 2\n- fiber 7\n- carbohydrates 12\n- protein 5\n- calories 142',
  'role': 'assistant'}]

In [8]:
dataset[5]["text"]

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Dec 2024\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat methods can I use to cook applesauce?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n**preparation time**: 4 minutes\n**cooking time**: 3 minutes\n**portions**: 3\n**ingredients**:\n- 2 apples\n- peeled\n- pureed with the blender juice of ½ lemon 1 teaspoon of honey 50ml of milk a handful of pumpkin seeds\n**procedure**:1. toast the seeds pumpkin at 180 degrees in the fryer for 3 minutes\n2. in the meantime, blend the apple pulp with the milk, lemon and honey\n3. pour the puree into glasses, add the pumpkin seeds and serve\n4. Enjoy your meal!\n**nutrients**:\n- fat 2\n- fiber 7\n- carbohydrates 12\n- protein 5\n- calories 142<|eot_id|>'

## 2. Training

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 7, # Set this for 1 full training run.
        #max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/1751 [00:00<?, ? examples/s]

In [10]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

Map:   0%|          | 0/1751 [00:00<?, ? examples/s]

In [11]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 06 Dec 2024\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat methods can I use to cook applesauce?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n**preparation time**: 4 minutes\n**cooking time**: 3 minutes\n**portions**: 3\n**ingredients**:\n- 2 apples\n- peeled\n- pureed with the blender juice of ½ lemon 1 teaspoon of honey 50ml of milk a handful of pumpkin seeds\n**procedure**:1. toast the seeds pumpkin at 180 degrees in the fryer for 3 minutes\n2. in the meantime, blend the apple pulp with the milk, lemon and honey\n3. pour the puree into glasses, add the pumpkin seeds and serve\n4. Enjoy your meal!\n**nutrients**:\n- fat 2\n- fiber 7\n- carbohydrates 12\n- protein 5\n- calories 142<|eot_id|>'

In [12]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

'                                                  \n\n**preparation time**: 4 minutes\n**cooking time**: 3 minutes\n**portions**: 3\n**ingredients**:\n- 2 apples\n- peeled\n- pureed with the blender juice of ½ lemon 1 teaspoon of honey 50ml of milk a handful of pumpkin seeds\n**procedure**:1. toast the seeds pumpkin at 180 degrees in the fryer for 3 minutes\n2. in the meantime, blend the apple pulp with the milk, lemon and honey\n3. pour the puree into glasses, add the pumpkin seeds and serve\n4. Enjoy your meal!\n**nutrients**:\n- fat 2\n- fiber 7\n- carbohydrates 12\n- protein 5\n- calories 142<|eot_id|>'

In [None]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,751 | Num Epochs = 7
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,533
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss


In [None]:
model.save_pretrained("Italian_Cousine_1.3") # Local saving
tokenizer.save_pretrained("Italian_Cousine_1.3")
model.push_to_hub("davnas/Italian_Cousine_1.3", token = "") # Online saving
tokenizer.push_to_hub("davnas/Italian_Cousine_1.3", token = "") # Online saving

README.md:   0%|          | 0.00/595 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

Saved model to https://huggingface.co/davnas/Italian_Cousine_1.3


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

# 3.Inference:


In [None]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
#!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

!pip install bitsandbytes


In [None]:
from transformers import AutoModel, AutoTokenizer

max_seq_length = 2048  # Choose any! We auto support ROPE scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, bFloat16 for Ampere+

model_name_or_path = "davnas/Italian_Cousine_2"

from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name_or_path,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
    # token = "hf_...", #se il nostro modello non è public
    # Use one if using gated models like meta-llama/Llama-2-7b-hf
)


==((====))==  Unsloth 2024.12.1: Fast Llama patching. Transformers:4.46.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


adapter_model.safetensors:   0%|          | 0.00/45.1M [00:00<?, ?B/s]

In [None]:
from unsloth import FastLanguageModel

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "How can I cook an smoothie?"}
]

inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

model.generate(input_ids=inputs, streamer=text_streamer, max_new_tokens=128,
               use_cache=True, temperature=1.5, min_p=0.1)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


**preparation time**: 6 minutes
**cooking time**: 4 minutes
**portions**: 2
**ingredients**:
- 1 banana
- peeled
- pureed 150ml skimmed milk 100ml milk 100ml vegetable juice of 2 lemons 1 tablespoon butter salt to taste.
**procedure**:1. blend all the ingredients in the blender, except the butter
2. heat the butter in a pan until it becomes soft
3. put the smoothie in glasses, pour in the butter, enjoy your meal!
**nutrients**:
- fat 12
- fiber


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   2371,   3799,    220,   2366,     19,    271, 128009, 128006,
            882, 128007,    271,   4438,    649,    358,   4394,    459,  11113,
            648,     30, 128009, 128006,  78191, 128007,    271,    334,   1762,
          36235,    892,  96618,    220,     21,   4520,    198,    334,   1030,
          10979,    892,  96618,    220,     19,   4520,    198,    334,    403,
            919,  96618,    220,     17,    198,    334,  39220,    334,    512,
             12,    220,     16,  44196,    198,     12,  83612,    198,     12,
          10748,    291,    220,   3965,   1029,  79669,   2106,  14403,    220,
           1041,   1029,  14403,    220,   1041,   1029,  36581,  23661,    315,
            220,     17,    514,  24483,    220,     16,  62611,  14432,  12290,
            311,  12945,    

In [None]:
import evaluate
import jsonlines
from unsloth import FastLanguageModel
import torch

ModuleNotFoundError: No module named 'evaluate'

In [None]:
def load_test_data(file_path):
    """Load test data from JSONL file."""
    prompts = []
    completions = []
    with jsonlines.open(file_path) as reader:
        for item in reader:
            prompts.append(item['prompt'])
            completions.append(item['completion'])
    return prompts, completions

In [None]:
def generate_predictions(model, tokenizer, prompts):
    """Generate model predictions for given prompts."""
    predictions = []
    for prompt in prompts:
        messages = [{"role": "user", "content": prompt}]
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to("cuda")

        outputs = model.generate(
            input_ids=inputs,
            max_new_tokens=512,
            use_cache=True,
            temperature=1.5,
            min_p=0.1
        )

        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(pred)
return predictions


In [None]:
# Initialize model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="davnas/Italian_Cousine",
    max_seq_length=2048,
    load_in_4bit=True
)
FastLanguageModel.for_inference(model)

# Load test data
prompts, references = load_test_data("test_data.jsonl")

# Generate predictions
predictions = generate_predictions(model, tokenizer, prompts)

# Evaluate using ROUGE
rouge = evaluate.load('rouge')
results = rouge.compute(
    predictions=predictions,
    references=references,
    use_stemmer=True
)

# Print results
print("\nEvaluation Results:")
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")