In [2]:
import mlflow
import mlflow.data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import mlflow.sklearn
import pandas as pd
import os
import yaml # Necesario para leer archivos .dvc (YAML)

# --- PATH CONFIGURATION ---
# DATASET_NAME is the label given to the dataset when it is logged to MLflow.
# DATASET_PATH points at the raw CSV; adjust it if the script is executed
# from a different working directory.
DATASET_NAME = "student_entry_raw"
DATASET_PATH = "../data/raw/student_entry_performance_original.csv"

# --- 1. DATA LOADING ---
# Read the raw CSV into `df`. Every later step needs the DataFrame, so a
# missing file is fatal: report it and stop with a non-zero exit status.
try:
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError:
    print(f"ERROR: Archivo no encontrado en {DATASET_PATH}. Revisa la ruta.")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # injected by the `site` module for interactive sessions (it may be
    # absent), and called without arguments it reports success (status 0)
    # even though this is the error path.
    raise SystemExit(1) from None
else:
    print(f"Dataset cargado desde: {DATASET_PATH}")

# --- 2. DVC LOGIC: GET THE HASH (DIGEST) ---
# Best-effort lookup of the dataset version recorded by DVC. The pointer file
# is expected next to the dataset as "<dataset>.dvc". Any problem here only
# produces a warning; `dvc_digest` then stays None and the script continues.
dvc_digest = None
dvc_file_path = DATASET_PATH + ".dvc"

if os.path.exists(dvc_file_path):
    try:
        with open(dvc_file_path, 'r', encoding='utf-8') as f:
            dvc_data = yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        print(f"ADVERTENCIA: No se pudo leer el archivo DVC. Error: {e}")
    else:
        # safe_load returns None for an empty file and may return a scalar or
        # list for malformed content, so validate the shape before indexing.
        outs = dvc_data.get('outs') if isinstance(dvc_data, dict) else None
        first_out = outs[0] if isinstance(outs, list) and outs else None
        if isinstance(first_out, dict):
            # Prefer 'md5'; fall back to 'checksum' when 'md5' is missing
            # or empty.
            dvc_digest = first_out.get('md5') or first_out.get('checksum')

        if dvc_digest:
            print(f"DVC Digest encontrado: {dvc_digest}")
        else:
            # Do not claim a digest was found when the file carries none.
            print(
                "ADVERTENCIA: El archivo DVC no contiene un hash "
                "'md5' ni 'checksum' en 'outs'."
            )

# --- 3. DATA PREPARATION AND SPLIT ---
# Features are every column except the target; the target is 'Performance'.
X = df.drop(columns='Performance')
y = df['Performance']

# Columns that are fed to the one-hot encoder further down.
cat_cols = [
    'Gender',
    'Caste',
    'coaching',
    'time',
    'Class_ten_education',
    'twelve_education',
    'medium',
    'Class_ X_Percentage',
    'Class_XII_Percentage',
    'Father_occupation',
    'Mother_occupation',
]

# Turn the target labels into integer codes; `le` keeps the label mapping.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# 80/20 split, stratified on the encoded target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    random_state=42,
    stratify=y_enc,
    test_size=0.2,
)

# --- 4. PIPELINE AND HYPERPARAMETER SETUP ---
# Preprocessing: one-hot encode the columns in `cat_cols` (categories unseen
# during fit are ignored at transform time) and drop every other column.
ohe = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([('ohe', ohe, cat_cols)], remainder='drop')

# Random forest hyperparameters, kept in a dict so the same values can be
# passed to the classifier and logged to the tracking server.
params = dict(
    n_estimators=100,
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=4,
    bootstrap=True,
    oob_score=False,
    random_state=888,
)

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', RandomForestClassifier(**params)),
])

# --- 5. TRAINING AND PREDICTIONS ---
pipeline.fit(X_train, y_train)

# Predict on the training split first, then on the held-out test split.
y_pred_train, y_pred_test = (
    pipeline.predict(features) for features in (X_train, X_test)
)

# Accuracy on both splits, gathered in a dict for logging later on.
metrics = {
    "acc_train": accuracy_score(y_train, y_pred_train),
    "acc_test": accuracy_score(y_test, y_pred_test),
}
acc_train = metrics["acc_train"]
acc_test = metrics["acc_test"]

print("Acc train:", acc_train)
print("Acc test :", acc_test)
# Per-class report on the test split, labelled with the original class names.
print(
    "\nClassification report (test):\n",
    classification_report(y_test, y_pred_test, target_names=le.classes_),
)

# --- 6. START THE MLFLOW RUN AND LOG EVERYTHING ---
# The tracking server defaults to the local instance on port 5001 and can be
# overridden with the MLFLOW_TRACKING_URI environment variable (an empty value
# falls back to the default).
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:5001"
mlflow.set_tracking_uri(tracking_uri)

try:
    # First call that actually contacts the tracking server.
    rf_experiment = mlflow.set_experiment("randomforest_Models_1")
except mlflow.exceptions.MlflowException:
    # Give an actionable hint, then re-raise so the original error details
    # (and the failure itself) are not hidden.
    print(
        f"ERROR: No se pudo configurar el experimento en {tracking_uri}. "
        "Verifica que el servidor de tracking de MLflow esté en ejecución "
        "o define MLFLOW_TRACKING_URI."
    )
    raise

run_name = "student_rf_test_1_1_con_dvc"
# NOTE(review): `artifact_path` is never used below; the model is logged under
# "random_forest_pipeline". Confirm which name is intended.
artifact_path = "rf_student"

with mlflow.start_run(run_name=run_name) as run:

    # Log hyperparameters.
    mlflow.log_params(params)

    # Log the train/test accuracy metrics in one call.
    mlflow.log_metrics(metrics)

    # Save the fitted pipeline (preprocessing + classifier) as the model.
    mlflow.sklearn.log_model(pipeline, "random_forest_pipeline")

    # Build the MLflow Dataset object from the loaded DataFrame. Passing the
    # DVC hash as the digest ties the run to the DVC-tracked data version;
    # `dvc_digest` may be None when no DVC hash was found.
    mlflow_dataset = mlflow.data.from_pandas(
        df=df,
        source=DATASET_PATH,
        targets=y.name,  # name of the target column
        name=DATASET_NAME,
        digest=dvc_digest,
    )

    # Attach the dataset to the run as an input.
    mlflow.log_input(mlflow_dataset, context="training")

    print(f"Modelo, métricas, y dataset registrados en MLflow con run_id: {run.info.run_id}")
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    print("Verifica el campo 'Dataset' en la interfaz de MLflow para el digest de DVC.")

Dataset cargado desde: ../data/raw/student_entry_performance_original.csv
Acc train: 0.6372180451127819
Acc test : 0.5447761194029851

Classification report (test):
               precision    recall  f1-score   support

     Average       0.76      0.88      0.81        32
   Excellent       0.67      0.10      0.17        20
        Good       0.43      0.50      0.46        42
          Vg       0.49      0.55      0.52        40

    accuracy                           0.54       134
   macro avg       0.59      0.51      0.49       134
weighted avg       0.56      0.54      0.52       134



MlflowException: API request to http://127.0.0.1:5001/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5001): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=randomforest_Models_1 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000201BEFA7D10>: Failed to establish a new connection: [WinError 10061] No se puede establecer una conexión ya que el equipo de destino denegó expresamente dicha conexión'))