# Entrenamiento MLflow (serverless)

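Notebook listo para lanzar un experimento con `models/train_databricks_mlflow.py` en un clúster serverless. Solo ajusta los parámetros en la celda de configuración y ejecuta las celdas en orden: las celdas posteriores reutilizan los módulos importados en la primera (`os`, `sys`, `subprocess`), y todas las rutas se resuelven a partir del directorio de trabajo actual (`os.getcwd()`), que debe ser la raíz del repositorio.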

In [None]:
# Install the dependencies listed in this repo (serverless does not use init scripts).
# The modules imported here are reused by the cells that follow.
import os
import subprocess
import sys

# The requirements file is looked up in the current working directory.
repo_root = os.getcwd()
req_path = os.path.join(repo_root, "requirements.txt")
print(f"Instalando dependencias desde {req_path}")

# pip is invoked as a module of the interpreter running this notebook;
# check_call raises CalledProcessError if pip exits with a non-zero status.
pip_args = ["install", "-r", req_path, "--upgrade", "--no-cache-dir"]
subprocess.check_call([sys.executable, "-m", "pip", *pip_args])

In [None]:
# Experiment parameters: edit the values in this cell before launching a run.

# MLflow settings
experiment_path = "/Shared/airline-satisfaction"  # MLflow experiment path
run_name = "rf-serverless"
register_model_name = "airline_sat_rf"  # Set to "" to skip model registration

# Hyperparameters forwarded to the training script
n_estimators = 300
max_depth = 12  # Use None for no depth limit
test_size = 0.2
random_state = 42

# Every path below is anchored at the current working directory.
_workdir = os.getcwd()

# Input data locations
_data_dir = os.path.join(_workdir, "data")
train_path = os.path.join(_data_dir, "train.csv")
test_path = os.path.join(_data_dir, "test.csv")

# Local folder where artifacts are stored (inside the repo workspace)
artifacts_dir = os.path.join(_workdir, "models", "artifacts")

In [None]:
# Run the training through the script (MLflow logs automatically on Databricks).
# Relies on os, sys and subprocess from the install cell and on the values
# defined in the parameters cell.
import shlex

script_path = os.path.join(os.getcwd(), "models", "train_databricks_mlflow.py")

# (flag, value) pairs in the order they are passed to the script. Numeric
# settings are converted to text because subprocess arguments must be strings;
# str() also renders a None max_depth as the literal "None".
cli_options = [
    ("--experiment-path", experiment_path),
    ("--run-name", run_name),
    ("--n-estimators", str(n_estimators)),
    ("--max-depth", str(max_depth)),
    ("--test-size", str(test_size)),
    ("--random-state", str(random_state)),
    ("--train-path", train_path),
    ("--test-path", test_path),
    ("--artifacts-dir", artifacts_dir),
]

# Flatten the pairs behind the interpreter and script path.
cmd = [sys.executable, script_path]
cmd.extend(token for pair in cli_options for token in pair)

# The registration flag is only passed when a model name was provided.
if register_model_name:
    cmd.extend(["--register-model-name", register_model_name])

# Echo a shell-quoted version of the command, then run it; check_call raises
# CalledProcessError if the script exits with a non-zero status.
print("Ejecutando:", shlex.join(cmd))
subprocess.check_call(cmd)