# Entrenamiento MLflow (serverless)

Notebook listo para lanzar un experimento con `models/train_databricks_mlflow.py` en un cluster serverless. Ajusta los parámetros y ejecuta las celdas en orden.

In [None]:
# Install dependencies from this repo (serverless does not use init scripts)
import os
import subprocess
import sys
from pathlib import Path

# Locate the repo root: the working directory itself, or else the nearest
# ancestor directory that contains requirements.txt.
repo_root = Path.cwd()
if not (repo_root / "requirements.txt").exists():
    for parent in repo_root.parents:
        if (parent / "requirements.txt").exists():
            repo_root = parent
            break

req_path = repo_root / "requirements.txt"
if not req_path.exists():
    # Neither the working directory nor any ancestor has the file: fail here
    # with an explicit message instead of handing pip a nonexistent path.
    raise FileNotFoundError(
        f"No se encontro requirements.txt en {repo_root} ni en sus directorios padre"
    )

print(f"Instalando dependencias desde {req_path}")
# Use the current interpreter's pip so packages are installed for this Python.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "-r",
        str(req_path),
        "--upgrade",
        "--no-cache-dir",
    ]
)


In [None]:
# Experiment parameters and paths
# Path is imported here so the repo_root fallback below also works when the
# dependency-install cell (which is the only other place importing it) was skipped.
from pathlib import Path

# Fall back to the current working directory when no earlier cell defined repo_root.
if "repo_root" not in globals():
    repo_root = Path.cwd()

experiment_path = "/Shared/airline-satisfaction"
run_name = "rf-serverless"
register_model_name = "airline_sat_rf"  # Leave as "" to skip registering the model
n_estimators = 300
max_depth = 12  # Use None for no limit
test_size = 0.2
random_state = 42

# Input data and output locations, all relative to the repo root.
train_path = repo_root / "data" / "train.csv"
test_path = repo_root / "data" / "test.csv"
artifacts_dir = repo_root / "models" / "artifacts"

# Quick sanity report of which expected files are actually present.
print(f"repo_root: {repo_root}")
print(f"train_path exists: {train_path.exists()}")
print(f"test_path exists: {test_path.exists()}")
print(f"script exists: {(repo_root / 'models' / 'train_databricks_mlflow.py').exists()}")

In [None]:
# Run the training through the script (MLflow logs to Databricks)
import shlex
import subprocess
import sys

script_path = repo_root / "models" / "train_databricks_mlflow.py"

# Arguments that are always passed, in command-line order.
cmd = [sys.executable, str(script_path)]
cmd.extend(("--experiment-path", experiment_path))
cmd.extend(("--run-name", run_name))
cmd.extend(("--n-estimators", str(n_estimators)))
cmd.extend(("--test-size", str(test_size)))
cmd.extend(("--random-state", str(random_state)))
cmd.extend(("--train-path", str(train_path)))
cmd.extend(("--artifacts-dir", str(artifacts_dir)))

# --max-depth is omitted entirely when max_depth is None.
if max_depth is not None:
    cmd.extend(("--max-depth", str(max_depth)))

# Only pass test_path when the file exists; otherwise the script does its own split.
if test_path.exists():
    cmd.extend(("--test-path", str(test_path)))

# An empty register_model_name means the flag is not passed.
if register_model_name:
    cmd.extend(("--register-model-name", register_model_name))

print("Ejecutando:", " ".join(map(shlex.quote, cmd)))
# Output is not captured, so the child process writes straight to the cell.
result = subprocess.run(cmd, text=True)
# Raise CalledProcessError if the script exited with a non-zero status.
result.check_returncode()
