In [1]:
import pandas as pd
import os
import yaml 
import mlflow
import mlflow.data
import mlflow.sklearn

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# --- CAMBIO CLAVE 1: Importar SVC (Support Vector Classifier) ---
from sklearn.svm import SVC 

# --- Metric imports (unchanged) ---
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score
)

# --- Path configuration ---
# DATASET_PATH is the CSV read below; DATASET_NAME is the name passed to
# MLflow when the dataset is registered as a run input.
DATASET_PATH = '../data/raw/student_entry_performance_original.csv'
DATASET_NAME = 'student_entry_clean'

# --- 1. Data loading ---
# Only the read itself sits inside the `try`, so an unrelated failure cannot be
# mistaken for a missing file.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"ERROR: Archivo no encontrado en {DATASET_PATH}. Revisa la ruta.")
    # Abort with a non-zero status: the bare `exit()` helper is injected by
    # `site`/IPython (not guaranteed to exist) and reports success (status 0).
    raise SystemExit(1) from None
else:
    print(f"Dataset cargado desde: {DATASET_PATH}")

# --- 2. DVC: read the dataset hash ---
# Best-effort lookup: if "<DATASET_PATH>.dvc" exists it is parsed as YAML and
# the hash of its first `outs` entry is taken ('md5', falling back to
# 'checksum'). Any failure only prints a warning and leaves dvc_digest as None.
dvc_digest = None
dvc_file_path = DATASET_PATH + ".dvc"
if os.path.exists(dvc_file_path):
    try:
        with open(dvc_file_path, 'r') as dvc_file:
            dvc_data = yaml.safe_load(dvc_file)
        if 'outs' in dvc_data and dvc_data['outs']:
            first_out = dvc_data['outs'][0]
            # An empty or missing 'md5' falls through to 'checksum'.
            dvc_digest = first_out.get('md5') or first_out.get('checksum')
        print(f"DVC Digest encontrado: {dvc_digest}")
    except Exception as e:
        print(f"ADVERTENCIA: No se pudo leer el archivo DVC. Error: {e}")

# --- 3. Data preparation and train/test split ---
# 'Performance' is the target; every remaining column stays in X.
y = df['Performance']
X = df.drop(columns=['Performance'])

# Columns that are one-hot encoded by the preprocessing step.
cat_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Class_ X_Percentage',
    'Class_XII_Percentage',
    'Father_occupation',
    'Mother_occupation',
]

# Integer-encode the target; `le` is kept so the original class names can be
# used later (le.classes_).
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Hold out 20% of the rows, stratified on the encoded target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc,
)


# ---------------------------------------------------------------------
# --- 4. Base pipeline and search grid ---
# ---------------------------------------------------------------------

# Preprocessing: one-hot encode the columns in cat_cols (categories unseen at
# fit time are ignored at transform time); all other columns are dropped.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('ohe', one_hot, cat_cols)],
    remainder='drop',
)

# Classifier: SVC. probability=True enables probability estimates at the cost
# of slower training.
svc = SVC(random_state=888, probability=True)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('clf', svc)])

# Search grid for the 'clf' step: 3 x 2 x 2 = 12 candidates. Kept small
# because grid-searching an SVC can be slow.
param_grid = dict(
    clf__C=[0.1, 1, 10],           # regularization parameter
    clf__kernel=['rbf', 'poly'],   # kernel type
    clf__gamma=['scale', 'auto'],  # kernel coefficient
)

# 5-fold cross-validated search scored by weighted F1, run in a single process.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1_weighted',
    n_jobs=1,
    cv=5,
    verbose=2,
)

# --- 5. Training (runs the grid search) ---
print("\nIniciando Grid Search...")
grid_search.fit(X_train, y_train)
print("Grid Search finalizado.")


# ---------------------------------------------------------------------
# --- 6. Log every grid-search candidate to MLflow ---
# ---------------------------------------------------------------------
# For each parameter combination tried by the grid search, a fresh pipeline is
# fitted on the full training split, evaluated on the held-out test split, and
# logged as its own MLflow run (params, CV + test metrics, model, dataset
# reference and classification report).

mlflow.set_tracking_uri("http://127.0.0.1:5001")
EXPERIMENT_NAME = "svc_GridSearch"
mlflow.set_experiment(EXPERIMENT_NAME)

# The dataset descriptor does not depend on the candidate, so build it once
# instead of once per run.
mlflow_dataset = mlflow.data.from_pandas(
    df=df, source=DATASET_PATH, targets=y.name, name=DATASET_NAME, digest=dvc_digest)

cv_results = grid_search.cv_results_

# Iterate over every cross-validation result.
for i, (mean_score, std_score, params) in enumerate(zip(
    cv_results['mean_test_score'],
    cv_results['std_test_score'],
    cv_results['params'],
)):

    # nested=True lets the run start even if another run is already active
    # (it is then recorded as a child of that run).
    with mlflow.start_run(run_name=f"run_{i+1}_SVC_GridSearch", nested=True):
        print(f"Registrando corrida {i+1} con par치metros: {params}")

        # Strip the pipeline step prefix so the values can be passed to SVC.
        clf_params = {k.replace('clf__', ''): v for k, v in params.items()}

        # Rebuild the pipeline with this candidate's SVC parameters.
        current_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('clf', SVC(random_state=888, probability=True, **clf_params)),
        ])

        # Fit on the COMPLETE training split (not on a CV fold).
        current_pipeline.fit(X_train, y_train)

        # --- Predictions and metrics on the test split ---
        y_pred_test = current_pipeline.predict(X_test)

        # zero_division=0 yields the same scores as the default for classes
        # that are never predicted, without emitting a warning per call.
        acc_test = accuracy_score(y_test, y_pred_test)
        f1_macro = f1_score(y_test, y_pred_test, average='macro', zero_division=0)
        f1_weighted = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)
        report_text = classification_report(
            y_test, y_pred_test, target_names=le.classes_, zero_division=0)

        metrics = {
            "cv_f1_weighted_mean": mean_score,
            "cv_f1_weighted_std": std_score,
            "test_acc": acc_test,
            "test_f1_weighted": f1_weighted,
            "test_f1_macro": f1_macro,
        }

        # --- MLflow logging ---
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)

        # Store the fitted pipeline as a model artifact.
        mlflow.sklearn.log_model(current_pipeline, "svc_pipeline")

        # Attach the dataset reference to the run.
        mlflow.log_input(mlflow_dataset, context="training")

        # Upload the classification report through a temporary file; the
        # `finally` guarantees the file is removed even if the upload fails.
        temp_report_path = f"classification_report_run_{i+1}.txt"
        try:
            with open(temp_report_path, "w", encoding="utf-8") as f:
                f.write(report_text)
            mlflow.log_artifact(temp_report_path, artifact_path="report")
        finally:
            if os.path.exists(temp_report_path):
                os.remove(temp_report_path)

        # Tag the winning candidate. Comparing against best_index_ (rather
        # than float equality on the score) tags exactly one run, the one the
        # grid search itself selected, even when several candidates tie.
        if i == grid_search.best_index_:
            print(f"!!! Este es el mejor modelo (F1-Weighted CV: {mean_score:.4f}) !!!")
            mlflow.set_tag("best_run", "True")


print("\n--- RESUMEN FINAL DE GRID SEARCH ---")
print(f"El mejor F1-Weighted (CV) es: {grid_search.best_score_:.4f}")
print(f"Los mejores par치metros son: {grid_search.best_params_}")

Dataset cargado desde: ../data/raw/student_entry_performance_original.csv
DVC Digest encontrado: fe5c6bdf2fe1d5c36afc295a345fccab

Iniciando Grid Search...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ......clf__C=0.1, clf__gamma=scale, clf__kernel=rbf; total time=   0.0s
[CV] END ......clf__C=0.1, clf__gamma=scale, clf__kernel=rbf; total time=   0.0s
[CV] END ......clf__C=0.1, clf__gamma=scale, clf__kernel=rbf; total time=   0.0s
[CV] END ......clf__C=0.1, clf__gamma=scale, clf__kernel=rbf; total time=   0.0s
[CV] END ......clf__C=0.1, clf__gamma=scale, clf__kernel=rbf; total time=   0.0s
[CV] END .....clf__C=0.1, clf__gamma=scale, clf__kernel=poly; total time=   0.0s
[CV] END .....clf__C=0.1, clf__gamma=scale, clf__kernel=poly; total time=   0.0s
[CV] END .....clf__C=0.1, clf__gamma=scale, clf__kernel=poly; total time=   0.0s
[CV] END .....clf__C=0.1, clf__gamma=scale, clf__kernel=poly; total time=   0.0s
[CV] END .....clf__C=0.1, clf__gamma=scale, clf__kerne

2025/11/02 20:09:45 INFO mlflow.tracking.fluent: Experiment with name 'svc_GridSearch' does not exist. Creating a new experiment.


Grid Search finalizado.
Registrando corrida 1 con par치metros: {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:10:04 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_1_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/0c4f2fb718064402886dc2584d75c681.
2025/11/02 20:10:04 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 2 con par치metros: {'clf__C': 0.1, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:10:18 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_2_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/14aacd9f4cc14b35bafcd8348b3ffa55.
2025/11/02 20:10:18 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 3 con par치metros: {'clf__C': 0.1, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:10:33 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_3_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/4f843a9852ec4d50892e3a748f2daabd.
2025/11/02 20:10:33 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 4 con par치metros: {'clf__C': 0.1, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:10:50 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_4_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/e2ebc1673b7f449cb37cf806d8464b1c.
2025/11/02 20:10:50 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.


Registrando corrida 5 con par치metros: {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:11:09 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_5_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/72658f8a8a8a4fb49a45be5c57b1e98c.
2025/11/02 20:11:09 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.


!!! Este es el mejor modelo (F1-Weighted CV: 0.4786) !!!
Registrando corrida 6 con par치metros: {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:11:25 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_6_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/77ae9a8833d845808a3923a628920b7d.
2025/11/02 20:11:25 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 7 con par치metros: {'clf__C': 1, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:11:44 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_7_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/191ae9b8f60d4fe086e4bde4b359e763.
2025/11/02 20:11:44 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 8 con par치metros: {'clf__C': 1, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:12:01 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_8_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/55af6a3787ee401da66bac4fba898e92.
2025/11/02 20:12:01 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.


Registrando corrida 9 con par치metros: {'clf__C': 10, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:12:19 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_9_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/f6232b526bdd49b5ae57bac78f89d6c4.
2025/11/02 20:12:19 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.


Registrando corrida 10 con par치metros: {'clf__C': 10, 'clf__gamma': 'scale', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:12:35 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_10_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/c4483a1ed41346c2b3de6342f0bea679.
2025/11/02 20:12:35 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.


Registrando corrida 11 con par치metros: {'clf__C': 10, 'clf__gamma': 'auto', 'clf__kernel': 'rbf'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:12:52 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_11_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/548e5002fbcd4fe59ec4a7b55af625d6.
2025/11/02 20:12:52 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Registrando corrida 12 con par치metros: {'clf__C': 10, 'clf__gamma': 'auto', 'clf__kernel': 'poly'}


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
2025/11/02 20:13:09 INFO mlflow.tracking._tracking_service.client: 游끢 View run run_12_SVC_GridSearch at: http://127.0.0.1:5001/#/experiments/953003479690098145/runs/6cb6f1a0017748a18c68e433e1a6d0ce.
2025/11/02 20:13:09 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: http://127.0.0.1:5001/#/experiments/953003479690098145.



--- RESUMEN FINAL DE GRID SEARCH ---
El mejor F1-Weighted (CV) es: 0.4786
Los mejores par치metros son: {'clf__C': 1, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}
