In [3]:
import json

file = []
with open("/content/dataset_audit.jsonl", "r") as f:
    for line in f:
        file.append(json.loads(line))
print(file[1])

{'instruction': 'Genera un reporte de auditoría completo basado en esta transcripción, evaluando las reglas R1 a R9.', 'input': 'Cliente: Tengo una duda.\nAsesor: Nombre y DNI.\nCliente: Ana Torres, 99001122.\nAsesor: ¿En qué le ayudo?\nAsesor: No veo nada de bajas anteriores en mi pantalla.\nAsesor: Listo, chau.', 'output': '{"rule_analysis": {"R1_validacion_datos": {"cumple": true, "razon": "Identificación básica suficiente para gestión no sensible.", "score": 10}, "R2_empatia_claridad": {"cumple": true, "razon": "Mantiene comunicación clara y respetuosa.", "score": 10}, "R3_ofertas_adecuadas": {"cumple": true, "razon": "No realizó ofertas fuera de contexto.", "score": 10}, "R6A_consulta_baja_previa": {"cumple": false, "razon": "No asistió con información de gestión anterior.", "score": 0}, "R7_tiempo_espera_justificado": {"cumple": true, "razon": "Tiempo en espera razonable.", "score": 10}, "R5_formalizacion_cierre": {"cumple": false, "razon": "No entregó código ni explicó plazos.",

In [4]:
!pip install unsloth trl peft accelerate bitsandbytes

Collecting unsloth
  Downloading unsloth-2026.1.2-py3-none-any.whl.metadata (66 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.6/66.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.26.2-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2026.1.2 (from unsloth)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.3-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting trl
  D

In [5]:
# For GPU check
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

CUDA available: True
GPU: Tesla T4


In [13]:
from unsloth import FastLanguageModel
import torch

#model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"

max_seq_length = 2048  # Choose sequence length
dtype = None  # Auto detection

# Load model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)

==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [15]:
from datasets import Dataset

def format_prompt(example):
    #return f"### Input: {example['input']}\n### Output: {json.dumps(example['output'])}<|endoftext|>"
    return f"### Input: {example['input']}\n### Output: {json.dumps(example['output'])}<|endoftext|>"

formatted_data = [format_prompt(item) for item in file]
dataset = Dataset.from_dict({"text": formatted_data})

In [16]:
# Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # LoRA rank - higher = more capacity, more memory
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=128,  # LoRA scaling factor (usually 2x rank)
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",     # Supports any, but = "none" is optimized
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized version
    random_state=3407,
    use_rslora=False,  # Rank stabilized LoRA
    loftq_config=None, # LoftQ
)

In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Training arguments optimized for Unsloth
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # Effective batch size = 8
        warmup_steps=10,
        num_train_epochs=3,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        save_strategy="epoch",
        save_total_limit=2,
        dataloader_pin_memory=False,
        report_to="none", # Disable Weights & Biases logging
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [18]:
# Train the model
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,000 | Num Epochs = 3 | Total steps = 1,125
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 167,772,160 of 8,198,033,408 (2.05% trained)


Step,Training Loss
25,0.4676
50,0.0388
75,0.0318
100,0.0321
125,0.0299
150,0.0303
175,0.0295
200,0.0289
225,0.0281
250,0.0295


In [20]:
# Test the fine-tuned model
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# The original 'messages' variable format is not compatible with tokenizer.apply_chat_template
# when tokenizer.chat_template is not explicitly set for this custom format.
# We will manually construct the prompt string in the format the model was trained on.

# Use the structure of the training prompt (from format_prompt function):
# f"### Input: {example['input']}\n### Output:"
# Where example['input'] was the raw conversation, and 'instruction' was a separate field
# in the dataset. For inference, we should combine the instruction and the input (conversation).

# Construct the prompt for inference based on the training format
# The training format was `f"### Input: {example['input']}\n### Output: {json.dumps(example['output'])}<|endoftext|>"`
# Here, `example['input']` was the 'input' field from the original dataset (the conversation).
# We should provide the full query including instruction and input to the model for inference.

# Let's adjust the prompt to explicitly include the instruction for clarity, even if the training format didn't originally combine them inside '### Input:'.
# This assumes the model learned to handle the 'instruction' as context.
# A safer approach is to reconstruct the *exact* input string the model saw during training for the 'Input' part.
# Based on `file[1]` and `format_prompt`, `example['input']` was just the raw conversation.
# So, the inference prompt should be: `### Input: {conversation_string}\n### Output:`

# Extract the input conversation from the example
conversation_text = "Cliente: Quiero la baja.\nAsesor: Por favor valide: nombre, dni, fecha, lugar y monto.\nCliente: Ana Torres, 55667788... espere...\n(Llamada cortada por cliente)\nAsesor: ¿Le interesa la oferta 1?\nCliente: No gracias.\nAsesor: ¿Le interesa la oferta 2?\nCliente: No gracias.\nAsesor: ¿Le interesa la oferta 3?\nCliente: No gracias.\nAsesor: Insisto con la oferta 5.\nCliente: Ya dije que no.\n(Espera silenciosa de 7 minutos sin motivo)\nAsesor: Gestión realizada. Código: ID-54, plazo 24h."

# If the instruction was meant to be part of the 'Input' to the model during training,
# it should be concatenated here. Looking at the `format_prompt` from `FIdADxFWXToO`,
# it used `f"### Input: {example['input']}\n### Output: ..."` where `example['input']`
# was just the conversation. The 'instruction' field was separate in the JSONL but not
# explicitly added to the training prompt text. This might lead to suboptimal results
# if the model wasn't trained with the instruction. For now, let's keep the inference
# prompt aligned with the *structure* of the training data's input part.

# If we want the model to use the instruction:
# prompt_text = f"### Instruction: Genera un reporte de auditoría completo basado en esta transcripción, evaluando las reglas R1 a R9.\n### Input: {conversation_text}\n### Output:"

# However, strictly following the `format_prompt` where only `example['input']` was used:
prompt_text = f"### Input: {conversation_text}\n### Output:"

# Tokenize the prompt directly
inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")

# Generate response
outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=256,
    use_cache=True,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
)

# Decode and print
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response)


### Input: Cliente: Quiero la baja.
Asesor: Por favor valide: nombre, dni, fecha, lugar y monto.
Cliente: Ana Torres, 55667788... espere...
(Llamada cortada por cliente)
Asesor: ¿Le interesa la oferta 1?
Cliente: No gracias.
Asesor: ¿Le interesa la oferta 2?
Cliente: No gracias.
Asesor: ¿Le interesa la oferta 3?
Cliente: No gracias.
Asesor: Insisto con la oferta 5.
Cliente: Ya dije que no.
(Espera silenciosa de 7 minutos sin motivo)
Asesor: Gestión realizada. Código: ID-54, plazo 24h.
### Output: "{\"rule_analysis\": {\"R1_validacion_datos\": {\"cumple\": true, \"razon\": \"Asesor aplic\u00f3 protocolo de baja antes del corte.\", \"score\": 10}, \"R9_falta_informacion\": {\"cumple\": \"NO APLICA\", \"razon\": \"Corte por cliente impide terminar.\", \"score\": 0}, \"R2_empatia_claridad\": {\"cumple\": true, \"razon\": \"Mantiene comunicaci\u00f3n clara y respetuosa.\", \"score\": 10}, \"R3_ofertas_adecuadas\": {\"cumple\": false, \"razon\": \"Super\u00f3 el l\u00edmite de 3 ofertas.\", 

In [8]:
model.save_pretrained_gguf("gguf_model_supervisor", tokenizer, quantization_method="q4_k_m")

NameError: name 'model' is not defined

In [4]:
from google.colab import files
import os

gguf_files = [f for f in os.listdir("gguf_model_supervisor") if f.endswith(".gguf")]
if gguf_files:
    gguf_file = os.path.join("gguf_model_supervisor", gguf_files[0])
    print(f"Downloading: {gguf_file}")
    files.download(gguf_file)

In [10]:
# First, install llama-cpp-python if not already installed
!pip install llama-cpp-python



In [13]:
from llama_cpp import Llama
import os
import json

# Define the path to your GGUF model
gguf_model_name = "Meta-Llama-3.1-8B.Q4_K_M.gguf" # This is the name of the file created by the save_pretrained_gguf command
# Correcting the path as the file was saved directly in /content/
gguf_model_path = os.path.join("/content/", gguf_model_name)

# Load the GGUF model
#print(f"Loading GGUF model from: {gguf_model_path}")
llm = Llama(model_path=gguf_model_path, n_ctx=2048, n_gpu_layers=-1)

# Prepare an inference prompt based on the training format
conversation_text = "Cliente: Quiero la baja.\nAsesor: Por favor valide: nombre, dni, fecha, lugar y monto.\nCliente: Ana Torres, 55667788... espere...\n(Llamada cortada por cliente)\nAsesor: ¿Le interesa la oferta 1?\nCliente: No gracias.\nAsesor: ¿Le interesa la oferta 2?\nCliente: No gracias.\nAsesor: ¿Le interesa la oferta 3?\nCliente: No gracias.\nAsesor: Insisto con la oferta 5.\nCliente: Ya dije que no.\n(Espera silenciosa de 7 minutos sin motivo)\nAsesor: Gestión realizada. Código: ID-54, plazo 24h."

#

t:`
# For chat completion with llama_cpp, we can structure it using roles.
# If the instruction was implicitly part of the model's understanding from training, we can simplify.
# Let's create a messages list that aligns with how chat models typically expect input.
# Given how the training prompt was structured with '### Input:', we can put the combined
# instruction and conversation directly into the user message.

full_prompt_for_gguf = f"### Instruction: Genera un reporte de auditoría completo basado en esta transcripción, evaluando las reglas R1 a R9.\n### Input: {conversation_text}\n### Output:"

messages = [
    {
        "role": "user",
        "content": full_prompt_for_gguf
    }
]

# Run inference
print("Running inference with GGUF model...")
output = llm.create_chat_completion(
    messages=messages,
    max_tokens=256,
    temperature=0.7,
    top_p=0.9,
    stop=["<|endoftext|>"], # Stop generation at the end of text token
)

# Decode and print the response
response_content = output["choices"][0]["message"]["content"]
print("\nGenerated Response from GGUF model:")
print(response_content)


gguf_init_from_file_impl: invalid magic characters: '????', expected 'GGUF'
llama_model_load: error loading model: llama_model_loader: failed to load model from /content/Meta-Llama-3.1-8B.Q4_K_M.gguf
llama_model_load_from_file_impl: failed to load model


ValueError: Failed to load model from file: /content/Meta-Llama-3.1-8B.Q4_K_M.gguf