# Entrenamiento MLflow (serverless)

Notebook listo para lanzar un experimento con `models/train_databricks_mlflow.py` en un clúster serverless. Ejecuta las celdas en orden.

In [None]:
# Install this repo's dependencies with pip (serverless does not use init scripts).
#
# Defines `repo_root` and `req_path` as notebook globals, then runs
# `pip install -r requirements.txt --upgrade --no-cache-dir` with the
# interpreter that is executing this notebook.
import os
import subprocess
import sys
from pathlib import Path

repo_root = Path.cwd()
# When launched from the notebooks/ folder, the repo root is one level up.
if repo_root.name == "notebooks":
    repo_root = repo_root.parent

# Fallback: walk up the directory tree until a requirements.txt is found.
if not (repo_root / "requirements.txt").exists():
    repo_root = next(
        (
            candidate
            for candidate in repo_root.parents
            if (candidate / "requirements.txt").exists()
        ),
        repo_root,  # nothing found: keep the current guess and fail below
    )

req_path = repo_root / "requirements.txt"
print(f"Repo root detected: {repo_root}")

# Fail with an actionable message instead of letting pip exit with an opaque
# CalledProcessError when the requirements file could not be located.
if not req_path.exists():
    raise FileNotFoundError(
        f"No se encontró requirements.txt en {repo_root} ni en sus directorios padre; "
        "ejecuta el notebook desde dentro del repositorio."
    )

print(f"Instalando dependencias desde {req_path}")
# sys.executable makes pip install into the environment of the running kernel.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "-r",
        str(req_path),
        "--upgrade",
        "--no-cache-dir",
    ]
)



In [None]:
# Experiment parameters and input/output paths.
from pathlib import Path

if "repo_root" not in globals():
    # The dependency-install cell was not run in this session, so locate the
    # repo root here using the same rules that cell applies: step out of
    # notebooks/, then walk up until a requirements.txt is found.
    repo_root = Path.cwd()
    if repo_root.name == "notebooks":
        repo_root = repo_root.parent
    if not (repo_root / "requirements.txt").exists():
        repo_root = next(
            (
                candidate
                for candidate in repo_root.parents
                if (candidate / "requirements.txt").exists()
            ),
            repo_root,  # nothing found: keep the current guess
        )

# Databricks Community Edition: use a simple experiment name.
# Per the original note, the experiment is created automatically in your
# user folder.
experiment_path = "airline-satisfaction"

print(f"Experiment Path: {experiment_path}")

# NOTE(review): confirm which cells consume experiment_path, run_name and
# register_model_name before relying on them to name the MLflow run/model.
run_name = "rf-community-edition"
register_model_name = "airline_sat_rf"

# Random-forest hyperparameters and split settings.
n_estimators = 300
max_depth = 12
test_size = 0.2
random_state = 42

# Data inputs and the directory where local artifacts are written.
train_path = repo_root / "data" / "train.csv"
test_path = repo_root / "data" / "test.csv"
artifacts_dir = repo_root / "models" / "artifacts"

# Quick sanity report so a wrong repo_root is visible before training starts.
print(f"repo_root: {repo_root}")
print(f"train_path exists: {train_path.exists()}")
print(f"test_path exists: {test_path.exists()}")
print(f"script exists: {(repo_root / 'models' / 'train_databricks_mlflow.py').exists()}")


In [None]:
# Direct training (compatible with Databricks Community Edition).
#
# Reads train_path / test_path, fits a preprocessing + RandomForest pipeline,
# evaluates it, saves the model and metrics under artifacts_dir, and then
# tries to log and register the pipeline in MLflow.
#
# Expects these names from the parameters cell: train_path, test_path,
# artifacts_dir, n_estimators, max_depth, test_size, random_state.
import json
import time

import mlflow
import mlflow.sklearn
import pandas as pd
from joblib import dump
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

TARGET_COL = "satisfaction"
# Text label -> binary target.
LABEL_MAP = {"satisfied": 1, "neutral or dissatisfied": 0}


def _encode_target(df, source):
    """Map the text labels in TARGET_COL to 1/0 in place and return df.

    Frames without TARGET_COL are returned untouched. Labels missing from
    LABEL_MAP (including empty cells) raise ValueError here, instead of
    turning into NaN and failing later inside scikit-learn.
    """
    if TARGET_COL not in df.columns:
        return df
    encoded = df[TARGET_COL].map(LABEL_MAP)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, TARGET_COL].astype(str).unique())
        raise ValueError(
            f"Etiquetas de '{TARGET_COL}' no reconocidas en {source}: {unknown}"
        )
    df[TARGET_COL] = encoded.astype(int)
    return df


print("✓ Iniciando entrenamiento (Community Edition)\n")

# --- DATA PREPARATION ---
print("Cargando datos...")
train_df = pd.read_csv(train_path)

# Drop index-like columns ("Unnamed: ..." and "id").
drop_cols = [
    col
    for col in train_df.columns
    if col.lower().startswith("unnamed") or col.lower() == "id"
]
train_df = _encode_target(train_df.drop(columns=drop_cols, errors="ignore"), train_path)

# Use test.csv when it exists; otherwise hold out a stratified split of train.
if test_path.exists():
    test_df = pd.read_csv(test_path)
    test_df = _encode_target(test_df.drop(columns=drop_cols, errors="ignore"), test_path)
else:
    train_df, test_df = train_test_split(
        train_df,
        test_size=test_size,
        random_state=random_state,
        stratify=train_df[TARGET_COL],
    )

X_train = train_df.drop(columns=[TARGET_COL])
y_train = train_df[TARGET_COL]
X_test = test_df.drop(columns=[TARGET_COL])
y_test = test_df[TARGET_COL]

# Split features by dtype: non-numeric columns are treated as categorical.
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()
num_cols = X_train.select_dtypes(include="number").columns.tolist()

print(f"Features: {len(X_train.columns)} ({len(num_cols)} numéricas, {len(cat_cols)} categóricas)")
print(f"Train: {len(X_train)} | Test: {len(X_test)}\n")

# --- TRAIN ---
print("🚀 Entrenando modelo...")
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        # handle_unknown="ignore": categories unseen at fit time do not raise.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ],
    remainder="drop",
)

model = RandomForestClassifier(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=random_state,
    n_jobs=-1,
)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])

pipeline.fit(X_train, y_train)
print("✓ Modelo entrenado\n")

# --- EVALUATE ---
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

# Single source of truth for what is written to JSON and logged to MLflow.
metrics = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
params = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "random_state": random_state,
}

print("📊 Métricas:")
print(f"  Accuracy:  {accuracy:.4f}")
print(f"  Precision: {precision:.4f}")
print(f"  Recall:    {recall:.4f}")
print(f"  F1:        {f1:.4f}\n")

# --- SAVE LOCALLY ---
artifacts_dir.mkdir(parents=True, exist_ok=True)
# One timestamp for the model file, the metrics file and the MLflow run name,
# so the three can always be matched to each other.
timestamp = int(time.time())
model_path = artifacts_dir / f"model_{timestamp}.joblib"
metrics_path = artifacts_dir / f"metrics_{timestamp}.json"

dump(pipeline, model_path)
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump({**metrics, **params}, f, indent=2)

print(f"✅ Modelo guardado en: {model_path}")
print(f"✅ Métricas guardadas en: {metrics_path}\n")

# --- MLFLOW REGISTRATION ---
# Per the original author's note: a simple model name with no registry_uri
# configured targets the legacy Model Registry ("Models" tab).
NOMBRE_MODELO_REGISTRO = "Pipeline_Satisfaccion_Final"
EXPERIMENT_NAME = "/MiPipeline_Satisfaccion"

# Make sure the experiment is the active one before starting the run.
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"1. Iniciando Run en el experimento: {EXPERIMENT_NAME}")

# Registration is best-effort: the locally saved artifacts above remain valid
# even when the workspace rejects the registry call, so report and continue.
try:
    with mlflow.start_run(run_name=f"Run_Pipeline_{timestamp}") as run:
        # Log the same params/metrics that were written to the local JSON.
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Model signature built from the training features and the pipeline's
        # predictions.
        signature = infer_signature(X_train, predictions)

        # Register the whole scikit-learn Pipeline (preprocessing + model),
        # not just the bare estimator.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="full_pipeline",
            registered_model_name=NOMBRE_MODELO_REGISTRO,
            signature=signature,
        )

        run_id = run.info.run_id

        print("\n✅ ¡ÉXITO! Modelo registrado en la pestaña 'Models'.")
        print(f"   -> Nombre del Modelo: {NOMBRE_MODELO_REGISTRO}")
        print(f"   -> Run ID: {run_id}")

except Exception as e:
    print(f"\n❌ REGISTRO FALLIDO. La causa es: {e}")
    print("Si el error CONFIG_NOT_AVAILABLE persiste, solo queda la opción HTTP/PAT.")