<a href="https://colab.research.google.com/github/giustinod/Fine-Tuning-Llama-2LLM/blob/main/Fine_tune_Llama_3_2_1B_Instruct_with_Unsloth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

cfr. https://huggingface.co/blog/mlabonne/sft-llama3

check also: https://medium.com/@amrstech/fine-tuning-made-easy-with-unsloth-and-colab-e0993f3f4c07

In [None]:
# Install the released Unsloth package first so that pip also pulls in its dependencies.
!pip install unsloth
# Then replace it with the current development build from GitHub. --no-deps leaves
# the dependencies installed above untouched; --no-cache-dir bypasses pip's cache.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

from huggingface_hub import login
from google.colab import userdata
import wandb

import torch
from trl import SFTTrainer
from datasets import load_dataset
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported
from datasets import load_dataset

# Hugging Face Hub: read the token from the Colab secret 'HF_TOKEN' and log in.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

# Weights & Biases: read the API key from the Colab secret "wandb", log in,
# and open the run associated with this fine-tuning session.
wb_token = userdata.get("wandb")
wandb.login(key=wb_token)

run = wandb.init(
    job_type="training",
    anonymous="allow",
    # (alternative model noted by the author: DeepSeek-R1-Distill-Llama-8B)
    project='Fine-tune unsloth Llama-3.2-1B-Instruct on AZService',
)

# Model loading and test
# Loader settings; they are forwarded unchanged to from_pretrained below.
max_seq_length, dtype, load_in_4bit = 2048, None, True

# Fetch the base model together with its tokenizer.
# (alternative checkpoint noted by the author: unsloth/DeepSeek-R1-Distill-Llama-8B)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)

# Attach LoRA adapters to the attention and MLP projection layers listed in
# target_modules; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # any value > 0 works; suggested: 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized path
    bias="none",  # any value is supported, but "none" is the optimized path
    # "unsloth" checkpointing uses 30% less VRAM and fits 2x larger batches;
    # pass True or "unsloth" for very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,  # rank-stabilized LoRA is available but not used here
    loftq_config=None,  # LoftQ is available but not used here
)

# Install the ChatML chat template on the tokenizer. The mapping lets messages
# use the keys "from"/"value" (with the speakers "human"/"gpt") in place of
# "role"/"content" (with "user"/"assistant").
# NOTE(review): the training text in this file is produced by
# formatting_prompts_func ("### Question / ### Answer"), not by this template,
# while the inference test goes through apply_chat_template - confirm that
# this mismatch between training and inference prompts is intended.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="chatml",
    mapping={
        "role": "from",
        "content": "value",
        "user": "human",
        "assistant": "gpt",
    },
)

# def apply_template(example):
#   texts = []
#   for p, c in zip(example['premises'], example['conclusion']):
#     conversation = []
#     conversation.append({"role": "user", "content": p})
#     conversation.append({"role": "assistant", "content": c})
#     formatted_conversation = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
#     texts.append(formatted_conversation)
#   return {"text": texts}

def formatting_prompts_func(example):
    """Render dataset rows as "### Question / ### Answer" training prompts.

    Args:
        example: Mapping with the keys ``'premises'`` and ``'conclusion'``.
            Either a batch (each key holds a sequence of strings, one entry
            per row) or a single row (each key holds one string).

    Returns:
        A list of prompt strings: one per row for a batch, or a one-element
        list for a single row.
    """
    premises = example['premises']
    conclusions = example['conclusion']

    # A bare string is a single, non-batched row. Indexing it would walk over
    # its characters, so wrap it to format the row as a whole instead.
    if isinstance(premises, str):
        premises, conclusions = [premises], [conclusions]

    return [
        f"### Question: {premise}\n ### Answer: {conclusion}"
        for premise, conclusion in zip(premises, conclusions)
    ]

# Training split of the question/answer dataset; rows expose the columns
# 'premises' and 'conclusion' consumed by formatting_prompts_func.
dataset = load_dataset("azservice/test-pdfs", split="train", token=hf_token)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    # dataset_text_field="text",
    tokenizer=tokenizer,
    # packing = False, # Can make training 5x faster for short sequences.
    formatting_func=formatting_prompts_func,
    args=TrainingArguments(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="output",
        # A W&B run is opened earlier in this notebook (wandb.init); report to
        # it so that run actually receives the training metrics.
        report_to="wandb",
    ),
)

trainer.train()

# Switch the model to Unsloth's inference mode. The helper is called for its
# effect on `model`; its return value is deliberately not assigned back, so
# `model` stays usable even with a version of the helper that returns None.
FastLanguageModel.for_inference(model)

# Single-turn test prompt, using the "from"/"value" keys configured on the
# tokenizer's chat template.
messages = [
    {"from": "human", "value": "What about design specification for route protection in the rail network?"},
]

# return_dict=True yields both input_ids and attention_mask. Passing the mask
# to generate() avoids the "attention mask is not set" warning, which appears
# because the pad token equals the EOS token and the mask cannot be inferred.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to("cuda")

# Stream the generated tokens to stdout as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128, use_cache=True)

# Write the merged 16-bit model and the tokenizer to the local directory
# "TestLogica-Pdfs".
model.save_pretrained_merged(
    "TestLogica-Pdfs",
    tokenizer,
    save_method="merged_16bit",
)
# Upload the merged 16-bit model to the Hugging Face Hub - replace the
# username part of the repository id as required.
model.push_to_hub_merged(
    "azservice/TestLogica-Pdfs",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)


Found existing installation: unsloth 2025.2.12
Uninstalling unsloth-2025.2.12:
  Successfully uninstalled unsloth-2025.2.12
Collecting git+https://github.com/unslothai/unsloth.git
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-req-build-bdkev26j
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-bdkev26j
  Resolved https://github.com/unslothai/unsloth.git to commit d1d15f1d14f1168837d29b9c08e9b6d63945d469
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2025.2.12-py3-none-any.whl size=187179 sha256=750f3e4123f533827045731510c820129f507c634a33277204be1478bf83b13c
  Stored in directory: /tmp/pip-ephem-wheel-cache-zs1zi2tj/wheels/d1



==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Generating train split: 0 examples [00:00, ? examples/s]

Applying formatting function to train dataset (num_proc=2):   0%|          | 0/3469 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/3469 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/3469 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/3469 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 3,469 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,4.6575
2,4.1689
3,3.6135
4,4.6143
5,3.5548
6,3.966
7,2.7053
8,3.2976
9,3.515
10,3.2011


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|im_start|>user
What about design specification for route protection in the rail network?<|im_end|>
<|im_start|>assistant
This is a standard question in the rail network. The design specification for route protection in the rail network is a critical aspect of ensuring the safe and efficient operation of the rail network. Here are some possible answers to this question:

1. **Design requirements**: The design specification should include requirements for the type and frequency of route protection, such as the type of protection device, its installation location, and its parameters.
2. **Route protection device**: The design specification should include details on the type and installation of the route protection device, such as a protective head, a protective arm, or a protective track.
3. **Route protection parameters**:


Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.1 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 38.98it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving TestLogica-Pdfs/pytorch_model.bin...
Done.


Da: https://huggingface.co/spaces/Thziin/Tutorial/blob/5ace8366196059399d4387bb3d3b969007e5dfdc/app.py

Con il modello originario (unsloth/Meta-Llama-3.1-8B-bnb-4bit) l'inferenza è molto lenta ma il modello risponde; dopo il fine-tuning, invece, la chiamata restituisce:
Error during inference: 504 Server Error: Gateway Timeout for url: https://api-inference.huggingface.co/models/azservice/TestLogica-Pdfs/v1/chat/completions (Request ID: 0Tusix)
Model azservice/TestLogica-Pdfs time out

In [None]:
%pip install gradio

import gradio as gr
from huggingface_hub import InferenceClient

# Safely initialize the inference client
def initialize_client():
    """Create the inference client for the fine-tuned model.

    A progress message is printed before and after the client is built. Any
    exception raised along the way is printed and swallowed.

    Returns:
        The ``InferenceClient`` for ``azservice/TestLogica-Pdfs``, or ``None``
        when construction failed.
    """
    inference_client = None
    try:
        print("Initializing inference client...")
        inference_client = InferenceClient("azservice/TestLogica-Pdfs")
        print("Inference client initialized successfully!")
    except Exception as e:
        print(f"Error initializing inference client: {e}")
        return None
    return inference_client

client = initialize_client()

# Chatbot response logic
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Answer one chat message through the module-level inference client.

    Args:
        message: The new user message.
        history: Earlier turns as (user text, assistant text) pairs; falsy
            entries of a pair are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Returns:
        The content of the first completion choice, or an error string when
        the client is unavailable or the request raises.
    """
    if not client:
        return "Error: Inference client not initialized."

    # The conversation always opens with the system message.
    messages = [{"role": "system", "content": system_message}]

    # Replay earlier turns: position 0 of a pair is the user, position 1 the assistant.
    for turn in history:
        for role, position in (("user", 0), ("assistant", 1)):
            if turn[position]:
                messages.append({"role": role, "content": turn[position]})

    # The new user message goes last.
    messages.append({"role": "user", "content": message})

    try:
        print("Sending request to model...")
        completion = client.chat_completion(
            messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        reply = completion.choices[0].message.content
        print("Response received successfully!")
        return reply
    except Exception as e:
        # Report the failure on stdout and surface the same text in the chat.
        error_text = f"Error during inference: {e}"
        print(error_text)
        return error_text

# Gradio interface
def launch_demo():
    """Build the Gradio chat UI around ``respond`` and launch it.

    The extra inputs (system message, max new tokens, temperature, top-p) are
    passed to ``respond`` after the message and the history. Any exception
    raised while building or launching the app is printed and swallowed.
    """
    try:
        system_box = gr.Textbox(
            value="You are a friendly Chatbot. Your name is Juninho.",
            label="System message",
        )
        max_tokens_slider = gr.Slider(
            minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"
        )
        temperature_slider = gr.Slider(
            minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"
        )
        top_p_slider = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        )

        demo = gr.ChatInterface(
            respond,
            additional_inputs=[
                system_box,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
            ],
        )
        demo.launch()
    except Exception as e:
        print(f"Error launching Gradio app: {e}")

# Launch the demo only when this code runs as the main module.
if __name__ == "__main__":
    launch_demo()

Initializing inference client...
Inference client initialized successfully!




Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d95e63666286a8adc4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
