In [2]:
import pandas as pd
import os
import yaml # Necesario para leer archivos .dvc (YAML)
import mlflow
import mlflow.data
import mlflow.sklearn

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# --- CAMBIO CLAVE: Importar GradientBoostingClassifier ---
from sklearn.ensemble import GradientBoostingClassifier 

# --- IMPORTACIONES DE MÉTRICAS ---
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score
)

# --- PATH CONFIGURATION ---
# Location of the cleaned interim dataset (relative to the working directory)
# and the logical name under which it is registered in MLflow.
DATASET_PATH = '../data/interim/student_interim_clean_for_model_2.csv'
DATASET_NAME = 'student_entry_clean'

# --- 1. DATA LOADING ---
# Nothing downstream can run without the dataset, so a missing file aborts
# the run after reporting the path that was tried.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"ERROR: Archivo no encontrado en {DATASET_PATH}. Revisa la ruta.")
    # `exit()` is a helper injected by the `site` module for interactive
    # sessions and, called without arguments, terminates with status 0
    # (success). Raising SystemExit with a non-zero code is always available
    # and lets callers/CI detect the failure.
    raise SystemExit(1)
else:
    print(f"Dataset cargado desde: {DATASET_PATH}")

# --- 2. DVC LOGIC: LOOK UP THE DATASET HASH ---
# Best-effort lookup: `dvc_digest` stays None when there is no sidecar
# `.dvc` file next to the dataset, when that file cannot be read or parsed,
# or when it lists no outputs.
dvc_file_path = DATASET_PATH + ".dvc"
dvc_digest = None
if os.path.exists(dvc_file_path):
    try:
        with open(dvc_file_path, 'r') as dvc_file:
            dvc_data = yaml.safe_load(dvc_file)
        if 'outs' in dvc_data and dvc_data['outs']:
            first_out = dvc_data['outs'][0]
            # Prefer the 'md5' field; fall back to 'checksum' when 'md5' is
            # missing or empty.
            dvc_digest = first_out.get('md5') or first_out.get('checksum')
        print(f"DVC Digest encontrado: {dvc_digest}")
    except Exception as err:
        # Any failure here is reported as a warning and the run continues
        # without a digest.
        print(f"ADVERTENCIA: No se pudo leer el archivo DVC. Error: {err}")

# --- 3. DATA PREPARATION AND SPLIT ---
# Categorical feature columns handed to the one-hot encoder.
cat_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Class_ X_Percentage',
    'Class_XII_Percentage',
    'Father_occupation',
    'Mother_occupation',
]

# Target column and feature matrix (every column except the target).
y = df['Performance']
X = df.drop(columns=['Performance'])

# Encode the class labels as integers; `le` is kept so the original class
# names can be recovered through `le.classes_`.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 80/20 hold-out split, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)


# ---------------------------------------------------------------------
# --- 4. BASE PIPELINE AND SEARCH GRID ---
# ---------------------------------------------------------------------

# One-hot encode the categorical columns; all remaining columns are dropped.
# Categories not seen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    [('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols)],
    remainder='drop',
)

# Base estimator: preprocessing followed by a gradient-boosting classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', GradientBoostingClassifier(random_state=888)),
])

# Hyper-parameter grid for the classifier step (2 x 3 x 2 = 12 candidates).
param_grid = dict(
    clf__n_estimators=[50, 100],          # number of boosting stages
    clf__learning_rate=[0.05, 0.1, 0.2],  # learning rate
    clf__max_depth=[3, 5],                # depth of each tree
)

# Exhaustive search scored by weighted F1 over 5 cross-validation folds;
# verbose=2 prints one progress line per fit.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=1,
)

# --- 5. TRAINING (runs the grid search on the training split) ---
print("\nIniciando Grid Search...")
grid_search.fit(X_train, y_train)
print("Grid Search finalizado.")


# ---------------------------------------------------------------------
# --- 6. LOG EVERY GRID-SEARCH CANDIDATE TO MLFLOW ---
# ---------------------------------------------------------------------
# For each hyper-parameter combination evaluated by the grid search, this
# section refits the pipeline on the full training split, scores it on the
# held-out test split and records, in its own MLflow run: the parameters,
# the CV and test metrics, the fitted pipeline, the dataset reference and a
# classification report. The candidate selected by the grid search is tagged
# with best_run=True.

mlflow.set_tracking_uri("http://127.0.0.1:5001")
EXPERIMENT_NAME = "gradientboosting_GridSearch"
mlflow.set_experiment(EXPERIMENT_NAME)

# The dataset descriptor does not depend on the candidate, so build it once
# and attach the same object to every run.
mlflow_dataset = mlflow.data.from_pandas(
    df=df, source=DATASET_PATH, targets=y.name, name=DATASET_NAME, digest=dvc_digest)

# LabelEncoder maps the classes to 0..n_classes-1. Passing these labels
# explicitly keeps `target_names` aligned even if a class happens to be
# absent from both the test labels and the predictions.
class_labels = list(range(len(le.classes_)))

cv_results = grid_search.cv_results_
for i, (mean_score, std_score, params) in enumerate(zip(
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
)):
    run_number = i + 1

    # NOTE(review): no parent run is opened in this section, so nested=True
    # presumably yields ordinary top-level runs; confirm whether a parent
    # run was intended.
    with mlflow.start_run(run_name=f"run_{run_number}_GBC_GridSearch", nested=True):
        print(f"Registrando corrida {run_number} con par치metros: {params}")

        # Grid-search keys carry the pipeline step prefix ('clf__...');
        # strip it so they can be passed to the classifier constructor.
        clf_params = {k.replace('clf__', ''): v for k, v in params.items()}

        current_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('clf', GradientBoostingClassifier(random_state=888, **clf_params)),
        ])

        # GridSearchCV only keeps the refit best estimator, so every
        # candidate is retrained here on the complete training split.
        current_pipeline.fit(X_train, y_train)

        # --- Predictions and metrics on the held-out test split ---
        y_pred_test = current_pipeline.predict(X_test)

        # zero_division=0 yields the same values as the default behaviour
        # but without emitting an undefined-metric warning per call.
        # Micro-averaged F1 is not computed: for this single-label target it
        # equals accuracy.
        acc_test = accuracy_score(y_test, y_pred_test)
        f1_macro = f1_score(y_test, y_pred_test, average='macro', zero_division=0)
        f1_weighted = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)
        report_text = classification_report(
            y_test,
            y_pred_test,
            labels=class_labels,
            target_names=le.classes_,
            zero_division=0,
        )

        metrics = {
            "cv_f1_weighted_mean": mean_score,
            "cv_f1_weighted_std": std_score,
            "test_acc": acc_test,
            "test_f1_weighted": f1_weighted,
            "test_f1_macro": f1_macro,
        }

        # --- MLflow logging ---
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Fitted pipeline stored as a model artifact.
        mlflow.sklearn.log_model(current_pipeline, "gradient_boosting_pipeline")

        # Dataset reference (with the DVC digest when one was found).
        mlflow.log_input(mlflow_dataset, context="training")

        # Classification report stored directly as a text artifact under
        # "report/"; no temporary file is left behind if logging fails.
        mlflow.log_text(
            report_text,
            f"report/classification_report_run_{run_number}.txt",
        )

        # Identify the winner by its index in cv_results_ rather than by
        # comparing float scores, so tied candidates are not all tagged.
        if i == grid_search.best_index_:
            print(f"!!! Este es el mejor modelo (F1-Weighted CV: {mean_score:.4f}) !!!")
            mlflow.set_tag("best_run", "True")


print("\n--- RESUMEN FINAL DE GRID SEARCH ---")
print(f"El mejor F1-Weighted (CV) es: {grid_search.best_score_:.4f}")
print(f"Los mejores par치metros son: {grid_search.best_params_}")

Dataset cargado desde: ../data/interim/student_interim_clean_for_model_2.csv
DVC Digest encontrado: 76db7197326a5942db9c5b100349f69b

Iniciando Grid Search...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=50; total time=   0.1s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=50; total time=   0.1s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=50; total time=   0.1s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=50; total time=   0.1s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=50; total time=   0.1s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=100; total time=   0.2s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=100; total time=   0.2s
[CV] END clf__learning_rate=0.05, clf__max_depth=3, clf__n_estimators=100; total time=   0.2s
[CV] END clf__learning_rate=0.05,

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:01:21 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_1_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/2fae6eb3d9ff40e9b5b1b47799a5c7d6.
2025/11/02 20:01:21 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 2 con par치metros: {'clf__learning_rate': 0.05, 'clf__max_depth': 3, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:01:41 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_2_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/89b104be9b6441ed8d912a55460c4b0e.
2025/11/02 20:01:41 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 3 con par치metros: {'clf__learning_rate': 0.05, 'clf__max_depth': 5, 'clf__n_estimators': 50}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:02:00 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_3_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/456a242e9a854eff892dac411bf3db19.
2025/11/02 20:02:00 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


!!! Este es el mejor modelo (F1-Weighted CV: 0.5051) !!!
Registrando corrida 4 con par치metros: {'clf__learning_rate': 0.05, 'clf__max_depth': 5, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:02:18 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_4_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/5a232e143c744da1b797d5236bea755c.
2025/11/02 20:02:18 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 5 con par치metros: {'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 50}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:02:36 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_5_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/bc456972dd494500b4cfbd538c75a9ee.
2025/11/02 20:02:36 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 6 con par치metros: {'clf__learning_rate': 0.1, 'clf__max_depth': 3, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:02:54 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_6_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/efcf54062ba54a34955d20dcfc714383.
2025/11/02 20:02:54 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 7 con par치metros: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 50}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:03:12 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_7_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/a4018a330e2b4073b9d97f971ab8c2eb.
2025/11/02 20:03:12 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 8 con par치metros: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:03:30 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_8_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/70dc92f5586c41e380808c2a37187e76.
2025/11/02 20:03:30 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 9 con par치metros: {'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__n_estimators': 50}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:03:47 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_9_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/4911f377b942465485213757b593134a.
2025/11/02 20:03:47 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 10 con par치metros: {'clf__learning_rate': 0.2, 'clf__max_depth': 3, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:04:06 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_10_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/ad0d667e43254d4485518f8feb90f23d.
2025/11/02 20:04:06 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 11 con par치metros: {'clf__learning_rate': 0.2, 'clf__max_depth': 5, 'clf__n_estimators': 50}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:04:24 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_11_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/724056b09fbb4ebba94764e1a77b203a.
2025/11/02 20:04:24 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.


Registrando corrida 12 con par치metros: {'clf__learning_rate': 0.2, 'clf__max_depth': 5, 'clf__n_estimators': 100}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:04:42 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_12_GBC_GridSearch at: http://127.0.0.1:5001/#/experiments/972473276069550808/runs/69da643cac494681a915779922dd951c.
2025/11/02 20:04:42 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/972473276069550808.



--- RESUMEN FINAL DE GRID SEARCH ---
El mejor F1-Weighted (CV) es: 0.5051
Los mejores par치metros son: {'clf__learning_rate': 0.05, 'clf__max_depth': 5, 'clf__n_estimators': 50}
