In [1]:
import os
import pathlib

# Keep Hugging Face downloads inside the project: HF_HOME is pointed at a
# "models_cache" folder under the current working directory. This cell runs
# before the Hugging Face / mlx_lm imports in the following cells.
project_root = pathlib.Path.cwd()
cache_dir = project_root.joinpath("models_cache")
cache_dir.mkdir(exist_ok=True)  # no error if the folder is already there

os.environ["HF_HOME"] = str(cache_dir)
print(f"La variable de entorno HF_HOME se ha establecido en: {os.environ['HF_HOME']}")


La variable de entorno HF_HOME se ha establecido en: /Users/deimagjas/machinelearning/gemma3-finetunning/models_cache


In [2]:
import json
from typing import Dict, List, Tuple, Union

import mlx.optimizers as optim
from mlx.utils import tree_flatten
from mlx_lm import load, generate
from mlx_lm.tuner import TrainingArgs, linear_to_lora_layers, train

In [3]:
# Show the installed mlx-lm package metadata (version, location, dependencies).
!uv pip show mlx_lm

Name: mlx-lm
Version: 0.28.0
Location: /Users/deimagjas/machinelearning/gemma3-finetunning/.venv/lib/python3.13/site-packages
Requires: jinja2, mlx, numpy, protobuf, pyyaml, transformers
Required-by:


In [4]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub; in the notebook this renders an
# interactive login widget.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Carga de modelo desde HF

La razón por la que el código funciona con google/gemma-3-270m-it pero no con google/gemma-3-270m se debe a la diferencia entre los
  dos tipos de modelos:

   1. `google/gemma-3-270m-it`: El sufijo "-it" significa "Instruction Tuned" (ajustado para instrucciones). Este modelo ha sido
      específicamente entrenado para entender y seguir instrucciones en un formato de chat o de pregunta-respuesta. Su tokenizador
      incluye una "plantilla de chat" (chat template) que formatea la entrada de manera que el modelo la entienda.

   2. `google/gemma-3-270m`: Este es el modelo base. Es un modelo de lenguaje pre-entrenado que es bueno para predecir la siguiente
      palabra en un texto, pero no ha sido ajustado para seguir instrucciones o para conversar. Su tokenizador no tiene una plantilla de
      chat predeterminada.

In [5]:
from mlx_lm.sample_utils import make_sampler

# Instruction-tuned checkpoint: its tokenizer ships the chat template that the
# generation cells rely on.
model_path = "google/gemma-3-270m-it"
model, tokenizer = load(model_path)

# Sampling settings shared by the generate() calls that pass `sampler`.
sampler = make_sampler(top_k=50, top_p=0.95, temp=0.7)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [6]:
# Wrap the question as a single user turn, render it with the tokenizer's chat
# template (as text, with the generation prompt appended) and sample a reply.
messages = [
    {
        "role": "user",
        "content": "create a list of steps in order to help someone that has an anxiety attack",
    }
]
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
response = generate(model, tokenizer, prompt=prompt, sampler=sampler, kv_bits=8, verbose=True)

Okay, here's a list of steps to help someone who has an anxiety attack. It's important to remember that this is a general guideline, and it's crucial to work with a qualified mental health professional for personalized advice and treatment. **Please don't hesitate to reach out to a mental health professional for support and guidance.**

**Important Considerations:**

*   **Be Kind and Patient:** Anxiety attacks are common, and it's okay to feel nervous. Allow yourself to feel the anxiety without judgment.
*   **Stay Hydrated and Nourished:** Dehydration and food can worsen anxiety symptoms.
*   **Take Deep Breaths:** Practice deep breathing exercises, meditation, or other relaxation techniques.
*   **Limit Caffeine and Alcohol:** These can exacerbate anxiety.
*   **Avoid Over-the-Counter Medications:** They may not be effective for severe anxiety.
*   **Seek Professional Help:** If you're struggling to manage your anxiety, don't hesitate to seek professional help from a therapist, coun

In [8]:
from mlx_lm import convert

repo = model_path
quantized_model_dir = "./gemma3-mlx"

# convert() raises if the destination path already exists, which used to break
# "Restart & Run All" on every run after the first one. Only convert when the
# 4-bit checkpoint is not on disk yet; delete the folder to force a rebuild.
if os.path.exists(quantized_model_dir):
    print(f"Reusing existing quantized model at {quantized_model_dir}")
else:
    convert(
        repo,
        quantize=True,
        dtype="float16",
        q_bits=4,
        mlx_path=quantized_model_dir,
    )

[INFO] Loading


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

[INFO] Using dtype: float16
[INFO] Quantizing
[INFO] Quantized model with 4.502 bits per weight.


In [9]:
# Rebind `model` / `tokenizer` to the locally converted checkpoint; every later
# cell that uses `model` (including the LoRA cells) works on this version.
model, tokenizer = load("./gemma3-mlx")

# Same question as before, to compare the quantized model with the original.
messages = [
    {
        "role": "user",
        "content": "create a list of steps in order to help someone that has an anxiety attack",
    }
]
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
response = generate(
    model,
    tokenizer,
    prompt=prompt,
    sampler=sampler,
    kv_bits=8,
    verbose=True,
)

There is nothing in the list that can be directly applied to an anxiety attack. Anxiety is a complex issue with multiple factors, including physical, emotional, and social. There is no single "one-size-fits-all" solution. What is a good approach for someone with an anxiety attack is to engage in a personalized exploration. This can involve a combination of activities, behaviors, and communication.

Here's a sample list of steps to help someone explore the process of finding relief:

**Step-by-Step:**

1.  **Identify Your Feelings:**
    *   What are the specific emotions associated with the anxiety?
    *   Are there any triggers?
    *   Are there any patterns in your experience?
    *   Is there a specific event that triggered the anxiety?
    *   What are the feelings of powerlessness or being in control?
    *   What are the feelings of fear or distress?
    *   What are the feelings of sadness or other intrusive thoughts?
    *   What are the feelings of guilt or shame?
    *   Wh

# Creando Adaptador

In [None]:
# Folder that holds the LoRA adapter artifacts; created on first run.
adapter_path = "adapters_gemma3"
os.makedirs(adapter_path, exist_ok=True)

# Config (JSON) and weights (safetensors) both live inside that folder.
adapter_config_path, adapter_file_path = (
    os.path.join(adapter_path, filename)
    for filename in ("adapter_config.json", "adapters.safetensors")
)

# Lora config
Aquí se ajustan los hiperparámetros para el entrenamiento

In [None]:
# LoRA hyper-parameters. The same dict is written to adapter_config.json and
# passed to linear_to_lora_layers().
lora_config = dict(
    num_layers=4,
    lora_parameters=dict(rank=4, scale=20.0, dropout=0.0),
)

In [None]:
# Persist the LoRA configuration next to the adapter weights as indented JSON.
serialized_config = json.dumps(lora_config, indent=4)
with open(adapter_config_path, "w") as config_file:
    config_file.write(serialized_config)

In [None]:
# Settings for the training run; the adapter weights go to `adapter_file_path`.
training_args = TrainingArgs(
    iters=200,
    batch_size=1,
    steps_per_eval=50,
    grad_checkpoint=True,
    adapter_file=adapter_file_path,
)

# Parameters and adapter
La razón por la que ves 163,840 parámetros entrenables en lugar de los 270 millones del modelo completo es porque no estás
  re-entrenando el modelo entero. Estás utilizando una técnica de ajuste fino de alta eficiencia de parámetros (PEFT) llamada LoRA 
  (Low-Rank Adaptation).

  Así es como funciona en tu notebook:

   1. Congelar el modelo base: En la celda con el id: a3b86f5c, la primera línea es model.freeze(). Esto "congela" todos los 270 millones
      de parámetros del modelo Gemma, haciendo que no sean entrenables.

   2. Inyectar adaptadores LoRA: La siguiente línea, linear_to_lora_layers(...), añade pequeños "adaptadores" o capas de bajo rango a
      ciertas partes del modelo (en tu caso, a 4 capas, según el valor de num_layers definido en lora_config).

   3. Entrenar solo los adaptadores: Solo se entrenan los parámetros de estos nuevos y pequeños adaptadores. El número 163,840 es la suma
      de todos los parámetros de estas nuevas capas LoRA que se han añadido.

  En resumen:

   * 270 Millones: Es el tamaño total del modelo base, que permanece sin cambios.
   * 163,840: Es el número de parámetros nuevos y adicionales que estás entrenando. Estos parámetros son los que "aprenden" la nueva
     tarea (en este caso, responder a los prompts del dataset nvidia/HelpSteer) y adaptan el conocimiento del modelo original.

  Esta es la gran ventaja de LoRA: te permite especializar un modelo enorme en una tarea específica de forma muy rápida y con muchos
  menos recursos computacionales, ya que solo necesitas entrenar una fracción minúscula (<0.1%) de los parámetros totales.

In [None]:
# Order matters: freeze the existing weights first, then inject the LoRA
# layers, so that only the newly added adapter parameters are trainable.
model.freeze()
linear_to_lora_layers(
    model,
    lora_config["num_layers"],
    lora_config["lora_parameters"],
)

# Count the elements of every trainable tensor in the (flattened) parameter tree.
trainable_leaves = tree_flatten(model.trainable_parameters())
num_train_params = sum(tensor.size for _name, tensor in trainable_leaves)
print(f"Number of trainable parameters: {num_train_params}")

# Left as the last expression so the notebook displays its result.
model.train()

In [None]:
class Metrics:
    """In-memory recorder for the losses reported during training.

    Exposes the two reporting hooks used in this notebook; each call stores
    one ``(iteration, loss)`` pair, in the order the reports arrive.
    """

    def __init__(self) -> None:
        # Histories start empty and grow by one tuple per report.
        self.train_losses: List[Tuple[int, float]] = []
        self.val_losses: List[Tuple[int, float]] = []

    def on_train_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        """Store ``info["train_loss"]`` together with ``info["iteration"]``.

        Raises:
            KeyError: if either key is missing from ``info``.
        """
        step = info["iteration"]
        loss = info["train_loss"]
        self.train_losses.append((step, loss))

    def on_val_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        """Store ``info["val_loss"]`` together with ``info["iteration"]``.

        Raises:
            KeyError: if either key is missing from ``info``.
        """
        step = info["iteration"]
        loss = info["val_loss"]
        self.val_losses.append((step, loss))

In [None]:
# Loss recorder that is handed to train() through its `training_callback` argument.
metrics = Metrics()

# load data

In [None]:
import types
from mlx_lm.tuner.datasets import load_custom_hf_dataset

# Dataset description: a small slice of nvidia/HelpSteer, using 1% of the
# train split for training and the final 1% of the same split for validation.
helpsteer_spec = {
    "path": "nvidia/HelpSteer",
    "train_split": "train[:1%]",
    "valid_split": "train[-1%:]",
    "prompt_feature": "prompt",
    "completion_feature": "response",
    "config": {},
}

# load_custom_hf_dataset() reads its options from an argparse-style object,
# so the settings are packed into a SimpleNamespace.
args = types.SimpleNamespace(
    hf_dataset=helpsteer_spec,
    train=True,
    test=False,
    mask_prompt=False,
)

train_set, val_set, test_set = load_custom_hf_dataset(args=args, tokenizer=tokenizer)

In [None]:
# Report the size of each split and peek at the first two test records.
print(
    f"Test set size: {len(test_set)}",
    f"Validation set size: {len(val_set)}",
    f"Training set size: {len(train_set)}",
    f"test set: {test_set[:2]}",
    sep="\n",
)

In [None]:
from mlx_lm.tuner.datasets import CacheDataset

# Wrap both splits in CacheDataset before handing them to the trainer.
train_dataset, val_dataset = CacheDataset(train_set), CacheDataset(val_set)

optimizer = optim.Adam(learning_rate=1e-5)

# `metrics` collects the train/validation losses reported while training runs.
train(
    model,
    optimizer,
    train_dataset,
    val_dataset,
    args=training_args,
    training_callback=metrics,
)


## Fusionar modelo base con adaptador
**TODO (revisar):** al parecer la ruta del modelo a fusionar está fijada a un valor concreto; ¿debería ser así?

In [None]:
! python -m mlx_lm fuse  --model ./models_cache/hub/models--google--gemma-3-270m-it/snapshots/ac82b4e820549b854eebf28ce6dedaf9fdfa17b3 --adapter-path ./adapters_gemma3 --save-path ./new_gemma3 

# Subir modelo a HF
Utilizando la API de HF se sube el modelo a deimagjas/gemma-3-270m-it-sft

In [None]:
from huggingface_hub import create_repo, upload_folder

repo_id = "deimagjas/gemma-3-270m-it-sft"

# upload_folder() needs the target repository to exist. Create it on the first
# run; exist_ok=True makes this a no-op when it is already there.
create_repo(repo_id, exist_ok=True)

# Push the fused model folder to the Hub repository.
upload_folder(
    folder_path="./new_gemma3",
    repo_id=repo_id,
)


## Test HF model

In [None]:
# Use a dedicated name for the fine-tuned repo. Rebinding `model_path` here
# would make a later re-run of the convert cell (which reads `model_path`)
# silently operate on the fine-tuned model instead of the base checkpoint.
sft_model_path = "deimagjas/gemma-3-270m-it-sft"
model_sft, tokenizer_sft = load(sft_model_path)

In [None]:
# Single user turn rendered with the fine-tuned tokenizer's chat template,
# then generated with the default sampling settings (no custom sampler).
messages = [
    {
        "role": "user",
        "content": "generate an SQL query to find all users who registered in the last 30 days",
    }
]
prompt = tokenizer_sft.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
response = generate(
    model_sft,
    tokenizer_sft,
    prompt=prompt,
    verbose=True,
)

# Conclusión

El fine-tuning en este caso falló: el modelo presenta fallas en la inferencia. ¿Por qué?
Además, consume aproximadamente 5 veces más memoria.