# Fine-tuning FunctionGemma pour Home Assistant

Ce notebook permet d'entraîner **FunctionGemma-270m-it** sur Google Colab.

**Pattern multi-turn:**
1. L'utilisateur demande une action
2. Le modèle appelle `get_entities` pour récupérer les entités du domaine concerné
3. L'outil renvoie les entités disponibles
4. Le modèle appelle l'action avec la bonne entité

**Instructions:**
1. Activez le GPU: Runtime → Change runtime type → GPU
2. Exécutez les cellules dans l'ordre

## 1. Installation des dépendances

In [None]:
# Install the training stack with pinned, compatible versions (IPython shell magic).
# NOTE(review): the checkpoint loaded later is google/functiongemma-270m-it; it may
# need a newer transformers release than 4.42.0 — confirm the pin against the model card.
!pip install -q transformers==4.42.0 peft==0.11.1 accelerate==0.30.0 datasets bitsandbytes huggingface_hub

In [None]:
import torch
# Report the PyTorch build and, when CUDA is usable, the first device's name and memory.
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"GPU disponible: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is converted from bytes with 1024**3 before printing.
    vram_total = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"VRAM: {vram_total:.1f} GB")

## 2. Configuration Hugging Face

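FunctionGemma est un modèle à accès restreint (« gated ») ; avant d'exécuter la cellule suivante :
1. Acceptez les conditions d'utilisation sur https://huggingface.co/google/functiongemma-270m-it
2. Créez un token d'accès sur https://huggingface.co/settings/tokens
3. (Optionnel) Enregistrez ce token comme secret Colab nommé `HF_TOKEN` ; sinon, une invite de connexion s'affichera.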

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Authenticate against the Hugging Face Hub so the gated checkpoint can be downloaded.
# The Colab secret named HF_TOKEN is tried first; if it cannot be read or the login
# with it fails, fall back to the interactive login prompt.
try:
    hf_token = userdata.get('HF_TOKEN')
    login(token=hf_token)
    print("Connecté via secret Colab")
except Exception as err:
    # Catch Exception instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still stop the cell, and show why the secret path was skipped.
    print(f"Secret HF_TOKEN inutilisable ({type(err).__name__}: {err}) - connexion interactive")
    login()

## 3. Upload du dataset

In [None]:
from google.colab import files
import os

# Make sure the target folder exists, then ask the user for the two JSONL files.
os.makedirs("data", exist_ok=True)

print("Uploadez train.jsonl et val.jsonl")
uploaded = files.upload()

# Each key of `uploaded` is used as a path relative to the working directory
# and the corresponding file is moved into data/.
for filename in uploaded:
    destination = f"data/{filename}"
    os.rename(filename, destination)
    print(f"  → {destination}")

In [None]:
import json

def count_lines(filepath):
    """Return the number of lines in the text file at *filepath*.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding (the dataset contains accented text).

    Args:
        filepath: Path of the file to read.

    Returns:
        The line count as an int; 0 for an empty file.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        # Stream the file line by line instead of loading it in memory.
        return sum(1 for _ in f)

# Report the size of both splits, then show the first raw training example.
train_count = count_lines("data/train.jsonl")
val_count = count_lines("data/val.jsonl")

print("Dataset:")
print(f"  Train: {train_count} exemples")
print(f"  Validation: {val_count} exemples")

# Preview: read only the first record. Decode explicitly as UTF-8 so accented
# characters do not depend on the platform's default encoding.
with open("data/train.jsonl", 'r', encoding='utf-8') as f:
    first_line = f.readline()

if first_line.strip():
    example = json.loads(first_line)
    print(f"\nExemple:")
    print(example['text'])
else:
    # An empty file or blank first line would otherwise fail inside json.loads
    # with an opaque decoding error.
    print("\nAucun exemple à afficher: data/train.jsonl est vide ou commence par une ligne vide")

## 4. Configuration

In [None]:
# Central hyper-parameter table read by the later cells (model loading, LoRA,
# tokenization and training arguments).
CONFIG = dict(
    model_name="google/functiongemma-270m-it",
    max_length=512,  # longer, to fit multi-turn conversations

    # LoRA adapter settings
    lora_r=32,  # raised for more capacity
    lora_alpha=64,
    lora_dropout=0.05,
    lora_target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],

    # Training (tuned for an A100 40GB)
    batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
    num_epochs=10,
    warmup_ratio=0.1,
    weight_decay=0.01,

    # Output location and step intervals
    output_dir="./output",
    save_steps=50,
    logging_steps=10,
)

# Echo every setting so the run is self-documenting in the notebook output.
print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

## 5. Chargement du modèle

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

# Load the base checkpoint in bfloat16 together with its tokenizer.
checkpoint = CONFIG["model_name"]
print(f"Chargement de {checkpoint}...")

model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# When the tokenizer has no pad token, reuse EOS for padding and mirror the
# id in the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

param_count = model.num_parameters()
print(f"Modèle chargé! Paramètres: {param_count:,}")

In [None]:
# Build the LoRA adapter configuration from CONFIG and wrap the base model with it.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=CONFIG["lora_target_modules"],
    r=CONFIG["lora_r"],
    lora_alpha=CONFIG["lora_alpha"],
    lora_dropout=CONFIG["lora_dropout"],
    bias="none",
)
model = get_peft_model(model, lora_config)

# Print the trainable-parameter summary of the wrapped model.
model.print_trainable_parameters()

## 6. Préparation du dataset

In [None]:
from datasets import load_dataset

# Load both JSONL splits through the "json" loader.
split_files = {
    "train": "data/train.jsonl",
    "validation": "data/val.jsonl",
}
dataset = load_dataset("json", data_files=split_files)

n_train = len(dataset["train"])
n_val = len(dataset["validation"])
print(f"Train: {n_train} | Val: {n_val}")

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated and padded to CONFIG["max_length"], so every
    returned sequence has the same length.

    Args:
        examples: Batch mapping that provides a "text" entry.

    Returns:
        Whatever the global `tokenizer` returns for that text.
    """
    texts = examples["text"]
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": CONFIG["max_length"],
    }
    return tokenizer(texts, **tokenizer_options)

# Tokenize every split in batched mode; the raw "text" column is dropped so
# only the tokenizer outputs remain.
print("Tokenization...")
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print("OK!")

## 7. Entraînement

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Training arguments, grouped by concern. All values come from CONFIG except
# the evaluation/checkpoint policy spelled out below.
training_args = TrainingArguments(
    # Where checkpoints go and how many are kept
    output_dir=CONFIG["output_dir"],
    save_steps=CONFIG["save_steps"],
    save_total_limit=2,

    # Schedule and batch sizes
    num_train_epochs=CONFIG["num_epochs"],
    per_device_train_batch_size=CONFIG["batch_size"],
    per_device_eval_batch_size=CONFIG["batch_size"],
    gradient_accumulation_steps=CONFIG["gradient_accumulation_steps"],

    # Optimisation
    learning_rate=CONFIG["learning_rate"],
    warmup_ratio=CONFIG["warmup_ratio"],
    weight_decay=CONFIG["weight_decay"],
    bf16=True,

    # Evaluate at the same interval as checkpointing and reload the
    # checkpoint with the lowest eval_loss at the end.
    eval_strategy="steps",
    eval_steps=CONFIG["save_steps"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Logging / misc
    logging_steps=CONFIG["logging_steps"],
    report_to="none",
    remove_unused_columns=False,
)

# mlm=False: collate for causal (non-masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

run_summary = " | ".join([
    f"Epochs: {CONFIG['num_epochs']}",
    f"Batch: {CONFIG['batch_size']}",
    f"LR: {CONFIG['learning_rate']}",
])
print(run_summary)

In [None]:
# Start fine-tuning with the `trainer` object built in the previous cell.
print("Entraînement...")
trainer.train()

## 8. Sauvegarde

In [None]:
# Write the trained model and the tokenizer to the same "<output_dir>/final" folder.
final_path = CONFIG["output_dir"] + "/final"
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(final_path)
print(f"Sauvegardé: {final_path}")

In [None]:
import shutil
# Zip the saved folder and pass the archive to Colab's download helper.
archive_stem = "functiongemma-ha"
shutil.make_archive(archive_stem, "zip", final_path)
files.download(f"{archive_stem}.zip")

## 9. Test du modèle

**IMPORTANT:** Le format de test doit correspondre EXACTEMENT au format d'entraînement.

In [None]:
def test_model(query: str) -> str:
    """Generate the fine-tuned model's reply to a single user request.

    The prompt uses the same turn markup as the training data.

    Args:
        query: The user request, inserted verbatim into the user turn.

    Returns:
        The text of the first generated model turn, stripped, cut at the
        first "<end_of_turn>" marker.
    """
    # Exact prompt format of the training dataset.
    text = f"<start_of_turn>user\n{query}<end_of_turn>\n<start_of_turn>model\n"

    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # The model may still be in training mode after trainer.train(); eval mode
    # disables LoRA dropout so greedy decoding is deterministic.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,  # greedy decoding - more stable
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Decoding prompt + generation and
    # keeping the last "<start_of_turn>model" segment would return a later,
    # hallucinated turn whenever the model keeps writing past its first answer.
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=False)

    # Keep what precedes the first end-of-turn marker (no-op if it is absent).
    response = response.split("<end_of_turn>")[0]

    return response.strip()

# Sample requests similar to those in the dataset.
test_queries = [
    "Allume la lumière du salon",
    "Éteins la lumière de la cuisine",
    "Mets le chauffage à 21 degrés",
    "Active le thermostat",
    "Éteins ESPresense",
    "Active la scène cinéma",
]

# Print each request followed by the model's answer and a blank separator line.
print("Tests du modèle fine-tuné:\n")
for query in test_queries:
    print(f"User: {query}")
    print(f"Model: {test_model(query)}")
    print()

## 10. Test multi-turn complet

Simuler le flow complet: query → get_entities → tool response → action

In [None]:
def _generate_turn(prompt: str, max_new_tokens: int = 80) -> str:
    """Greedily generate one model turn for *prompt* and return its text.

    Only the newly generated tokens are decoded, and the text is cut at the
    first "<end_of_turn>" marker, so turns the model may keep writing after
    its answer are discarded.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    # The model may still be in training mode after trainer.train(); eval mode
    # disables LoRA dropout so greedy decoding is deterministic.
    model.eval()
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    completion = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=False)
    return completion.split("<end_of_turn>")[0].strip()


def test_multiturn(query: str, fake_entities: str):
    """Run and print the full two-step flow for one request.

    Step 1 sends the user request and prints the model's first turn.
    Step 2 replays the conversation with that turn plus a tool turn holding
    *fake_entities*, and prints the model's follow-up turn.

    Args:
        query: The user request.
        fake_entities: Text injected as the tool's answer in step 2.

    Returns:
        None; results are printed.
    """
    # Step 1: user request -> first model turn.
    text1 = f"<start_of_turn>user\n{query}<end_of_turn>\n<start_of_turn>model\n"
    resp1 = _generate_turn(text1)

    print(f"User: {query}")
    print(f"Model (step 1): {resp1}")

    # Step 2: inject the tool answer -> final model turn. resp1 must hold only
    # the first model turn, otherwise the replayed history would be wrong.
    text2 = (
        f"<start_of_turn>user\n{query}<end_of_turn>\n"
        f"<start_of_turn>model\n{resp1}<end_of_turn>\n"
        f"<start_of_turn>tool\n{fake_entities}<end_of_turn>\n"
        f"<start_of_turn>model\n"
    )
    resp2 = _generate_turn(text2)

    # Append an ellipsis only when the preview was actually truncated.
    if len(fake_entities) > 100:
        tool_preview = f"{fake_entities[:100]}..."
    else:
        tool_preview = fake_entities
    print(f"Tool: {tool_preview}")
    print(f"Model (step 2): {resp2}")
    print()

# Run the two-step flow on one light request and one climate request, each
# paired with the tool answer to inject.
print("=== Test Multi-Turn ===")
multiturn_cases = [
    (
        "Allume la lumière du salon",
        "Entités light disponibles: light.salon, light.cuisine, light.chambre",
    ),
    (
        "Mets le chauffage à 22 degrés",
        "Entités climate disponibles: climate.thermostat_salon, climate.thermostat_bureau",
    ),
]
for user_query, tool_answer in multiturn_cases:
    test_multiturn(user_query, tool_answer)