In [1]:
import os
import pathlib

project_root = pathlib.Path.cwd()

cache_dir = project_root / "models_cache"
cache_dir.mkdir(exist_ok=True)
os.environ['HF_HOME'] = str(cache_dir)
print(f"La variable de entorno HF_HOME se ha establecido en: {os.environ['HF_HOME']}")


La variable de entorno HF_HOME se ha establecido en: /Users/deimagjas/qubits.cloud/machinelearning/gemma3-finetunning/models_cache


In [2]:
import json
from typing import Dict, List, Tuple, Union

import mlx.optimizers as optim
from mlx.utils import tree_flatten
from mlx_lm import load, generate
from mlx_lm.tuner import TrainingArgs, linear_to_lora_layers, train

In [3]:
!uv pip show mlx_lm

Name: mlx-lm
Version: 0.28.2
Location: /Users/deimagjas/qubits.cloud/machinelearning/gemma3-finetunning/.venv/lib/python3.13/site-packages
Requires: jinja2, mlx, numpy, protobuf, pyyaml, transformers
Required-by:


In [4]:
from huggingface_hub import login

login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Carga de modelo desde HF

La razón por la que el código funciona con google/gemma-3-270m-it pero no con google/gemma-3-270m se debe a la diferencia entre los
  dos tipos de modelos:

   1. `google/gemma-3-270m-it`: El sufijo "-it" significa "Instruction Tuned" (ajustado para instrucciones). Este modelo ha sido
      específicamente entrenado para entender y seguir instrucciones en un formato de chat o de pregunta-respuesta. Su tokenizador
      incluye una "plantilla de chat" (chat template) que formatea la entrada de manera que el modelo la entienda.

   2. `google/gemma-3-270m`: Este es el modelo base. Es un modelo de lenguaje pre-entrenado que es bueno para predecir la siguiente
      palabra en un texto, pero no ha sido ajustado para seguir instrucciones o para conversar. Su tokenizador no tiene una plantilla de
      chat predeterminada.

In [5]:
from mlx_lm.sample_utils import make_sampler

sampler = make_sampler(temp=0.2)
model_path = "google/functiongemma-270m-it"
model, tokenizer = load(model_path)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/63.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

In [6]:
prompt = "create a list with what you can do"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
response = generate(
        model,
        tokenizer,
        prompt=prompt,
        verbose=True,
        sampler=sampler         
    )

I can assist with a wide range of tasks related to information and assistance. My capabilities include:
*   Providing knowledge and information
*   Translating languages
*   Writing
*   Brainstorming ideas
*   Scheduling
*   Managing tasks

I am ready to assist with whatever you need.<end_of_turn>
<end_of_turn>
<end_of_turn>
<start_function_call> model비로 답변할 수 있습니다.<end_of_turn>
<start_function_call> unknown<end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_function_call> unknown<escape><end_of_turn>
<start_functio

In [7]:
from mlx_lm import convert

repo = model_path 

convert(
        repo,
        quantize=True,
        dtype="float16",
        q_bits=4,
        mlx_path="./gemma3f-mlx")

[INFO] Loading


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

[INFO] Using dtype: float16
[INFO] Quantizing
[INFO] Quantized model with 4.502 bits per weight.


README.md:   0%|          | 0.00/21.2k [00:00<?, ?B/s]

In [8]:
model, tokenizer = load("./gemma3f-mlx")
prompt = "create a list with what you can do"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
)
response = generate(model, tokenizer, prompt=prompt, verbose=True, sampler=sampler)

I'm a specialized AI model created by the creators of 'Zero.AI'. My purpose is to provide helpful and informative responses within a specific topic. I cannot directly list all possible capabilities. My purpose is to use this capability for a specific task.<end_of_turn>
I am a specialized AI model created by the creators of 'Zero.AI'. My purpose is to provide helpful and informative responses within a specific topic. I cannot directly list all possible capabilities.<end_of_turn><end_of_turn>
I cannot assist with this request. My purpose is to use this capability for a specific task.<end_of_turn>
<end_of_turn>
I cannot provide a list of possible capabilities. My purpose is to use this capability for a specific task.<end_of_turn>
<end_of_turn>
I am a specialized AI model created by the creators of 'Zero.AI'. My purpose is to provide helpful and informative responses within a specific topic. I cannot directly list all possible capabilities.<end_of_turn><end_of_turn>
<end_of_turn>
I cannot 

# Creando Adaptador

In [9]:
adapter_path = "adaptersf_gemma3"
os.makedirs(adapter_path, exist_ok=True)
adapter_config_path = os.path.join(adapter_path, "adapter_config.json")
adapter_file_path = os.path.join(adapter_path, "adapters.safetensors")

# Lora config
Aquí se ajustan los hyperparámetros para el entrenamiento

In [10]:
lora_config = {
    "num_layers": 4,
    "lora_parameters": {
        "rank": 4,
        "scale": 20.0,
        "dropout": 0.0,
    },
}

In [11]:
with open(adapter_config_path, "w") as f:
    json.dump(lora_config, f, indent=4)

In [12]:
training_args = TrainingArgs(
    batch_size=1,
    adapter_file=adapter_file_path,
    iters=200,
    steps_per_eval=50,
    grad_checkpoint=True,
)

# Parameters and adapter
La razón por la que ves 163,840 parámetros entrenables en lugar de los 270 millones del modelo completo es porque no estás
  re-entrenando el modelo entero. Estás utilizando una técnica de ajuste fino de alta eficiencia de parámetros (PEFT) llamada LoRA 
  (Low-Rank Adaptation).

  Así es como funciona en tu notebook:

   1. Congelar el modelo base: En la celda con el id: a3b86f5c, la primera línea es model.freeze(). Esto "congela" todos los 270 millones
      de parámetros del modelo Gemma, haciendo que no sean entrenables.

   2. Inyectar adaptadores LoRA: La siguiente línea, linear_to_lora_layers(...), añade pequeños "adaptadores" o capas de bajo rango a
      ciertas partes del modelo (en tu caso, a 8 capas, según se define en lora_config).

   3. Entrenar solo los adaptadores: Solo se entrenan los parámetros de estos nuevos y pequeños adaptadores. El número 163,840 es la suma
      de todos los parámetros de estas nuevas capas LoRA que se han añadido.

  En resumen:

   * 270 Millones: Es el tamaño total del modelo base, que permanece sin cambios.
   * 163,840: Es el número de parámetros nuevos y adicionales que estás entrenando. Estos parámetros son los que "aprenden" la nueva
     tarea (en este caso, generar consultas SQL) y adaptan el conocimiento del modelo original.

  Esta es la gran ventaja de LoRA: te permite especializar un modelo enorme en una tarea específica de forma muy rápida y con muchos
  menos recursos computacionales, ya que solo necesitas entrenar una fracción minúscula (<0.1%) de los parámetros totales.

In [13]:
model.freeze()
linear_to_lora_layers(model, lora_config["num_layers"], lora_config["lora_parameters"])
num_train_params = sum(v.size for _, v in tree_flatten(model.trainable_parameters()))
print(f"Number of trainable parameters: {num_train_params}")
model.train()

Number of trainable parameters: 210944


Model(
  (model): Gemma3Model(
    (embed_tokens): QuantizedEmbedding(262144, 640, group_size=64, bits=4, mode=affine)
    (layers.0): TransformerBlock(
      (self_attn): Attention(
        (q_proj): QuantizedLinear(input_dims=640, output_dims=1024, bias=False, group_size=64, bits=4, mode=affine)
        (k_proj): QuantizedLinear(input_dims=640, output_dims=256, bias=False, group_size=64, bits=4, mode=affine)
        (v_proj): QuantizedLinear(input_dims=640, output_dims=256, bias=False, group_size=64, bits=4, mode=affine)
        (o_proj): QuantizedLinear(input_dims=1024, output_dims=640, bias=False, group_size=64, bits=4, mode=affine)
        (q_norm): RMSNorm()
        (k_norm): RMSNorm()
        (rope): RoPE(256, traditional=False)
      )
      (mlp): MLP(
        (gate_proj): QuantizedLinear(input_dims=640, output_dims=2048, bias=False, group_size=64, bits=4, mode=affine)
        (down_proj): QuantizedLinear(input_dims=2048, output_dims=640, bias=False, group_size=64, bits=4, mod

In [14]:
class Metrics:
    def __init__(self) -> None:
        self.train_losses: List[Tuple[int, float]] = []
        self.val_losses: List[Tuple[int, float]] = []

    def on_train_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        self.train_losses.append((info["iteration"], info["train_loss"]))

    def on_val_loss_report(self, info: Dict[str, Union[float, int]]) -> None:
        self.val_losses.append((info["iteration"], info["val_loss"]))

In [15]:
metrics = Metrics()

# load data

In [16]:
import types
from mlx_lm.tuner.datasets import load_custom_hf_dataset

args = types.SimpleNamespace(
    hf_dataset={
        "path": "google/mobile-actions",
        "train_split": "train[:70%]",
        "valid_split": "train[-10%:]",        
        "prompt_feature": "prompt",
        "completion_feature": "response",
        "config": {},                        
    },
    mask_prompt=False,                       
    train=True,                              
    test=False                                
)
train_set, val_set, test_set = load_custom_hf_dataset(
    args=args,
    tokenizer=tokenizer
    
)

Loading Hugging Face dataset google/mobile-actions.


README.md: 0.00B [00:00, ?B/s]

dataset.jsonl:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9654 [00:00<?, ? examples/s]

In [17]:
print(f"Test set size: {len(test_set)}")
print(f"Validation set size: {len(val_set)}")
print(f"Training set size: {len(train_set)}")
print(f"test set: {test_set[:2]}")

Test set size: 0
Validation set size: 965
Training set size: 6758
test set: []


In [18]:
from mlx_lm.tuner.datasets import CacheDataset

train_dataset = CacheDataset(train_set)
val_dataset = CacheDataset(val_set)

train(
    model,
    optim.Adam(learning_rate=1e-5),
    train_dataset,
    val_dataset,
    args=training_args,
    training_callback=metrics
)


Starting training..., iters: 200


Calculating loss...: 100%|██████████| 25/25 [00:02<00:00,  8.79it/s]

Iter 1: Val loss 3.526, Val took 2.846s





Iter 10: Train loss 2.822, Learning Rate 1.000e-05, It/sec 1.992, Tokens/sec 2965.436, Trained Tokens 14888, Peak mem 4.242 GB
Iter 20: Train loss 1.734, Learning Rate 1.000e-05, It/sec 4.337, Tokens/sec 6504.059, Trained Tokens 29884, Peak mem 4.299 GB
Iter 30: Train loss 1.188, Learning Rate 1.000e-05, It/sec 4.367, Tokens/sec 6325.429, Trained Tokens 44369, Peak mem 4.299 GB
Iter 40: Train loss 0.987, Learning Rate 1.000e-05, It/sec 4.490, Tokens/sec 6636.340, Trained Tokens 59149, Peak mem 4.299 GB


Calculating loss...: 100%|██████████| 25/25 [00:02<00:00,  9.49it/s]

Iter 50: Val loss 0.711, Val took 2.639s





Iter 50: Train loss 0.850, Learning Rate 1.000e-05, It/sec 4.262, Tokens/sec 6454.452, Trained Tokens 74292, Peak mem 4.299 GB
Iter 60: Train loss 0.671, Learning Rate 1.000e-05, It/sec 4.465, Tokens/sec 6619.040, Trained Tokens 89116, Peak mem 4.299 GB
Iter 70: Train loss 0.548, Learning Rate 1.000e-05, It/sec 4.490, Tokens/sec 6619.233, Trained Tokens 103857, Peak mem 4.299 GB
Iter 80: Train loss 0.456, Learning Rate 1.000e-05, It/sec 4.435, Tokens/sec 6521.536, Trained Tokens 118562, Peak mem 4.299 GB
Iter 90: Train loss 0.413, Learning Rate 1.000e-05, It/sec 4.493, Tokens/sec 6577.972, Trained Tokens 133203, Peak mem 4.299 GB


Calculating loss...: 100%|██████████| 25/25 [00:02<00:00,  9.55it/s]

Iter 100: Val loss 0.315, Val took 2.621s





Iter 100: Train loss 0.414, Learning Rate 1.000e-05, It/sec 4.381, Tokens/sec 6590.693, Trained Tokens 148246, Peak mem 4.299 GB
Iter 100: Saved adapter weights to adaptersf_gemma3/adapters.safetensors and adaptersf_gemma3/0000100_adapters.safetensors.
Iter 110: Train loss 0.349, Learning Rate 1.000e-05, It/sec 4.394, Tokens/sec 6574.233, Trained Tokens 163207, Peak mem 4.299 GB
Iter 120: Train loss 0.340, Learning Rate 1.000e-05, It/sec 4.392, Tokens/sec 6512.432, Trained Tokens 178034, Peak mem 4.300 GB
Iter 130: Train loss 0.270, Learning Rate 1.000e-05, It/sec 4.480, Tokens/sec 6589.036, Trained Tokens 192742, Peak mem 4.300 GB
Iter 140: Train loss 0.258, Learning Rate 1.000e-05, It/sec 4.471, Tokens/sec 6597.955, Trained Tokens 207499, Peak mem 4.300 GB


Calculating loss...: 100%|██████████| 25/25 [00:02<00:00,  9.27it/s]

Iter 150: Val loss 0.268, Val took 2.698s





Iter 150: Train loss 0.257, Learning Rate 1.000e-05, It/sec 4.466, Tokens/sec 6580.067, Trained Tokens 222234, Peak mem 4.300 GB
Iter 160: Train loss 0.251, Learning Rate 1.000e-05, It/sec 4.379, Tokens/sec 6523.166, Trained Tokens 237129, Peak mem 4.327 GB
Iter 170: Train loss 0.229, Learning Rate 1.000e-05, It/sec 4.448, Tokens/sec 6545.368, Trained Tokens 251845, Peak mem 4.327 GB
Iter 180: Train loss 0.252, Learning Rate 1.000e-05, It/sec 4.358, Tokens/sec 6532.730, Trained Tokens 266834, Peak mem 4.327 GB
Iter 190: Train loss 0.221, Learning Rate 1.000e-05, It/sec 4.403, Tokens/sec 6530.304, Trained Tokens 281665, Peak mem 4.327 GB


Calculating loss...: 100%|██████████| 25/25 [00:02<00:00,  9.48it/s]

Iter 200: Val loss 0.197, Val took 2.639s





Iter 200: Train loss 0.246, Learning Rate 1.000e-05, It/sec 4.390, Tokens/sec 6549.904, Trained Tokens 296586, Peak mem 4.327 GB
Iter 200: Saved adapter weights to adaptersf_gemma3/adapters.safetensors and adaptersf_gemma3/0000200_adapters.safetensors.
Saved final weights to adaptersf_gemma3/adapters.safetensors.


## Fusionar modelo base con adaptador
revisar, por que al parecer se está fijando el valor del modelo a fucionar, esto debería ser así?

In [19]:
! python -m mlx_lm fuse  --model ./models_cache/hub/models--google--functiongemma-270m-it/snapshots/ead2a1f9df8d6431408ccff6c9e5e60028addde0 --adapter-path ./adaptersf_gemma3/ --save-path ./new_functiongemma3 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loading pretrained model


# Subir modelo a HF
Utilizando el API de HF se sube el modelo a deimagjas/Phi-3.5-mini-instruct-4bit-sft

In [21]:
from huggingface_hub import  upload_folder

repo_id = "deimagjas/functiongemma-3-270m-it-sft"

upload_folder(
    folder_path="./new_functiongemma3",
    repo_id=repo_id
)


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/deimagjas/functiongemma-3-270m-it-sft/commit/49f96ee07caaddbbb80796161c8d9f08cd15bc13', commit_message='Upload folder using huggingface_hub', commit_description='', oid='49f96ee07caaddbbb80796161c8d9f08cd15bc13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/deimagjas/functiongemma-3-270m-it-sft', endpoint='https://huggingface.co', repo_type='model', repo_id='deimagjas/functiongemma-3-270m-it-sft'), pr_revision=None, pr_num=None)

## Test HF model

In [22]:
model_path = "deimagjas/functiongemma-3-270m-it-sft"
model_sft, tokenizer_sft = load(model_path)

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/706 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/17.2k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/176 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

In [23]:
prompt = "create a list with what you can do"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer_sft.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
response = generate(model_sft, tokenizer_sft, prompt=prompt, verbose=True)

I can help with a wide variety of tasks that can be organized into a structured list. Could you tell me what you would like me to do?<end_of_turn>
I am ready at your earliest convenience.<end_of_turn>
<start_function_call>callakimullounces{query:bookshelf bookshelf description and features<escape>}<end_function_call>
Prompt: 17 tokens, 4.288 tokens-per-sec
Generation: 61 tokens, 351.848 tokens-per-sec
Peak memory: 4.327 GB


# Conclusión

El fine tunning en este caso fue exitoso. No obstante, se sigue presentando que el Peak memory es superior despues del fine tunning, posiblemente reduciendo la portabilidad de este modelo. Por què sucede esto?