# Módulo 1.2: Pipelines de Clasificación con MLflow

## Objetivos
- Crear pipelines de preprocesamiento reproducibles
- Tracking de pipelines completos con MLflow
- Comparar múltiples algoritmos de clasificación
- Nested runs para organizar experimentos

In [None]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Send all tracking calls in this notebook to the MLflow server at
# localhost:5000.
# NOTE(review): assumes a tracking server is already listening there - confirm
# before running the cells below.
mlflow.set_tracking_uri("http://localhost:5000")
# Make "sklearn-classification-pipelines" the active experiment for the runs
# started later in the notebook.
mlflow.set_experiment("sklearn-classification-pipelines")

## 1. Dataset: Breast Cancer Wisconsin

Dataset de clasificación binaria para la detección de cáncer de mama (tumores malignos vs. benignos).

In [None]:
# Load the Breast Cancer Wisconsin dataset and split it into the feature
# matrix X and the label vector y.
data = load_breast_cancer()
X, y = data.data, data.target

# Exploratory DataFrame: one column per feature, then the numeric target and a
# readable diagnosis label derived from it (0 -> malignant, 1 -> benign).
df = pd.DataFrame(X, columns=data.feature_names).assign(
    target=y,
    diagnosis=lambda frame: frame['target'].map({0: 'malignant', 1: 'benign'}),
)

print(f"Dataset shape: {df.shape}")

# Absolute class counts first, then the same counts as proportions.
print("\nClass distribution:")
print(df['diagnosis'].value_counts())
class_balance = df['diagnosis'].value_counts(normalize=True)
print(f"\nClass balance: {class_balance}")

# Summary statistics for every numeric column.
print("\nFeature statistics:", df.describe(), sep="\n")

In [None]:
# Hold out 20% of the samples as a test set. The split is stratified on y and
# uses a fixed seed so it is the same on every run.
split_kwargs = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Partition sizes.
print(f"Train set: {X_train.shape}", f"Test set: {X_test.shape}", sep="\n")

# Mean of the 0/1 labels, i.e. the fraction of samples labelled 1 in each
# partition.
print(
    f"Train positive rate: {y_train.mean():.3f}",
    f"Test positive rate: {y_test.mean():.3f}",
    sep="\n",
)

## 2. Pipelines con scikit-learn

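Un pipeline encadena el preprocesamiento y el modelo en un solo objeto, de modo que el escalado se ajusta únicamente con los datos de entrenamiento y se aplica de forma idéntica al predecir.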

In [None]:
def create_pipeline(scaler_type='standard', model=None):
    """Build a two-step scikit-learn pipeline: a feature scaler, then a classifier.

    Args:
        scaler_type: Name of the scaler used for the ``'scaler'`` step. One of
            ``'standard'`` (StandardScaler), ``'minmax'`` (MinMaxScaler) or
            ``'robust'`` (RobustScaler).
        model: Estimator placed in the ``'classifier'`` step. The instance is
            stored as-is (it is not copied), so fitting the pipeline fits this
            very object.

    Returns:
        A new ``Pipeline`` whose steps are named ``'scaler'`` and
        ``'classifier'``.

    Raises:
        KeyError: If ``scaler_type`` is not one of the supported names.
    """
    # Map names to classes (not instances) so that only the requested scaler
    # is ever constructed.
    scaler_classes = {
        'standard': StandardScaler,
        'minmax': MinMaxScaler,
        'robust': RobustScaler,
    }

    try:
        scaler_cls = scaler_classes[scaler_type]
    except KeyError:
        # Keep the exception type callers would have seen before, but say
        # which names are actually accepted.
        supported = ', '.join(repr(name) for name in scaler_classes)
        raise KeyError(
            f"Unknown scaler_type {scaler_type!r}; expected one of: {supported}"
        ) from None

    return Pipeline([
        ('scaler', scaler_cls()),
        ('classifier', model),
    ])

In [None]:
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv=5):
    """Compute train/test and cross-validated metrics for a fitted pipeline.

    The pipeline must already be fitted (``predict`` is called directly) and
    must expose ``predict_proba``. Precision, recall and F1 are computed with
    the scorers' default settings and only probability column 1 is used, so
    this is intended for binary targets.

    Args:
        pipeline: Fitted classifier pipeline to evaluate.
        X_train: Training features, used for the train accuracy and for
            cross-validation.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.
        cv: Cross-validation strategy forwarded to ``cross_val_score`` (number
            of folds or a splitter object). Defaults to 5.

    Returns:
        A tuple ``(metrics, y_pred_test, y_pred_proba)`` where ``metrics`` is a
        dict with the keys ``train_accuracy``, ``test_accuracy``,
        ``precision``, ``recall``, ``f1_score``, ``roc_auc``,
        ``cv_accuracy_mean`` and ``cv_accuracy_std``; ``y_pred_test`` holds the
        predicted test labels and ``y_pred_proba`` the column-1 probabilities
        for the test set.
    """
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)
    # Column 1 of predict_proba: the score fed to ROC-AUC (and returned so the
    # caller can draw a ROC curve).
    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

    metrics = {
        'train_accuracy': accuracy_score(y_train, y_pred_train),
        'test_accuracy': accuracy_score(y_test, y_pred_test),
        'precision': precision_score(y_test, y_pred_test),
        'recall': recall_score(y_test, y_pred_test),
        'f1_score': f1_score(y_test, y_pred_test),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
    }

    # Cross-validate on the training data only, so the test set stays unseen.
    cv_scores = cross_val_score(
        pipeline, X_train, y_train, cv=cv, scoring='accuracy'
    )
    metrics['cv_accuracy_mean'] = cv_scores.mean()
    metrics['cv_accuracy_std'] = cv_scores.std()

    return metrics, y_pred_test, y_pred_proba

## 3. Experimento 1: Comparar Scalers

Vamos a comparar diferentes métodos de escalado con el mismo modelo.

In [None]:
# Experiment 1: same classifier, three different scalers. One parent run groups
# the comparison; each scaler gets its own nested child run.
base_model = LogisticRegression(max_iter=1000, random_state=42)
scalers_to_test = ['standard', 'minmax', 'robust']

with mlflow.start_run(run_name="scaler_comparison") as parent_run:

    mlflow.set_tag("experiment_type", "scaler_comparison")
    mlflow.log_param("base_model", "LogisticRegression")

    scaler_results = []

    for scaler_name in scalers_to_test:
        with mlflow.start_run(run_name=f"scaler_{scaler_name}", nested=True):

            # The same base_model instance is placed in every pipeline and is
            # refit on each iteration; the model is logged inside the loop,
            # before the next refit.
            pipeline = create_pipeline(scaler_type=scaler_name, model=base_model)
            pipeline.fit(X_train, y_train)

            # Batch the params and the metrics: one logging call each instead
            # of one call per value.
            mlflow.log_params({
                "scaler": scaler_name,
                "model": "LogisticRegression",
            })

            metrics, y_pred, y_proba = evaluate_model(
                pipeline, X_train, y_train, X_test, y_test
            )
            mlflow.log_metrics(metrics)

            # Confusion-matrix heatmap saved locally, then attached to the run.
            cm = confusion_matrix(y_test, y_pred)
            cm_path = f'cm_{scaler_name}.png'
            plt.figure(figsize=(8, 6))
            try:
                sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
                plt.title(f'Confusion Matrix - {scaler_name} scaler')
                plt.ylabel('True')
                plt.xlabel('Predicted')
                plt.savefig(cm_path)
                mlflow.log_artifact(cm_path)
            finally:
                # Always release the figure, even if saving or logging fails.
                plt.close()

            mlflow.sklearn.log_model(pipeline, f"pipeline_{scaler_name}")

            scaler_results.append({
                'scaler': scaler_name,
                **metrics
            })

            print(f"{scaler_name}: Accuracy={metrics['test_accuracy']:.4f}, "
                  f"ROC-AUC={metrics['roc_auc']:.4f}")

    # Summary table for the whole comparison, attached to the parent run.
    results_df = pd.DataFrame(scaler_results)
    results_df.to_csv('scaler_comparison.csv', index=False)
    mlflow.log_artifact('scaler_comparison.csv')

    print("\nComparación de Scalers:")
    print(results_df[['scaler', 'test_accuracy', 'roc_auc', 'f1_score']])

## 4. Experimento 2: Comparar Múltiples Algoritmos

Ahora comparemos diferentes algoritmos de clasificación.

In [None]:
# Experiment 2: same scaler (standard), seven different classifiers. One parent
# run groups the comparison; each classifier gets its own nested child run.
models_to_test = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    # probability=True so the SVM exposes predict_proba, which evaluate_model calls.
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'NaiveBayes': GaussianNB()
}

with mlflow.start_run(run_name="model_comparison") as parent_run:

    mlflow.set_tag("experiment_type", "model_comparison")
    mlflow.log_params({
        "scaler": "standard",
        "num_models": len(models_to_test),
    })

    model_results = []

    for model_name, model in models_to_test.items():
        with mlflow.start_run(run_name=f"model_{model_name}", nested=True):

            pipeline = create_pipeline(scaler_type='standard', model=model)
            pipeline.fit(X_train, y_train)

            # Collect every param for this run and send them in one batch.
            # Only scalar hyperparameters are kept (None and nested objects
            # are skipped), each prefixed with "model_".
            run_params = {"model": model_name, "scaler": "standard"}
            if hasattr(model, 'get_params'):
                run_params.update(
                    (f"model_{param_name}", param_value)
                    for param_name, param_value in model.get_params().items()
                    if isinstance(param_value, (int, float, str, bool))
                )
            mlflow.log_params(run_params)

            metrics, y_pred, y_proba = evaluate_model(
                pipeline, X_train, y_train, X_test, y_test
            )
            mlflow.log_metrics(metrics)

            # ROC curve saved locally, then attached to the run.
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            roc_path = f'roc_{model_name}.png'
            plt.figure(figsize=(8, 6))
            try:
                plt.plot(fpr, tpr, label=f"ROC curve (AUC = {metrics['roc_auc']:.3f})")
                plt.plot([0, 1], [0, 1], 'k--', label='Random')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate')
                plt.title(f'ROC Curve - {model_name}')
                plt.legend()
                plt.savefig(roc_path)
                mlflow.log_artifact(roc_path)
            finally:
                # Always release the figure, even if saving or logging fails.
                plt.close()

            mlflow.sklearn.log_model(pipeline, f"pipeline_{model_name}")

            model_results.append({
                'model': model_name,
                **metrics
            })

            print(f"{model_name}: Accuracy={metrics['test_accuracy']:.4f}, "
                  f"F1={metrics['f1_score']:.4f}, ROC-AUC={metrics['roc_auc']:.4f}")

    # Rank by test accuracy. A stable sort keeps tied models in the order they
    # were evaluated, so the reported "best" model is deterministic.
    results_df = pd.DataFrame(model_results)
    results_df = results_df.sort_values(
        'test_accuracy', ascending=False, kind='mergesort'
    )
    results_df.to_csv('model_comparison.csv', index=False)
    mlflow.log_artifact('model_comparison.csv')

    # Record the winner on the parent run.
    best_model = results_df.iloc[0]['model']
    best_accuracy = results_df.iloc[0]['test_accuracy']
    mlflow.log_param("best_model", best_model)
    mlflow.log_metric("best_accuracy", best_accuracy)

    print("\n" + "="*60)
    print("Ranking de Modelos por Accuracy:")
    print(results_df[['model', 'test_accuracy', 'f1_score', 'roc_auc']])
    print("="*60)
    print(f"\nMejor modelo: {best_model} con accuracy={best_accuracy:.4f}")

## 5. Visualización Comparativa de Todos los Modelos

In [None]:
# 2x2 dashboard built from results_df (the model-comparison table): three
# ranked horizontal bar charts plus one grouped bar chart.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One entry per ranking panel:
# (grid position, metric column, x-axis label, title, extra bar styling).
ranking_panels = [
    ((0, 0), 'test_accuracy', 'Accuracy', 'Test Accuracy por Modelo', {}),
    ((0, 1), 'f1_score', 'F1 Score', 'F1 Score por Modelo', {'color': 'orange'}),
    ((1, 0), 'roc_auc', 'ROC-AUC', 'ROC-AUC por Modelo', {'color': 'green'}),
]
for position, column, xlabel, title, bar_style in ranking_panels:
    panel = axes[position]
    # Ascending sort so the bars are drawn in increasing order of the metric.
    results_df_sorted = results_df.sort_values(column, ascending=True)
    panel.barh(results_df_sorted['model'], results_df_sorted[column], **bar_style)
    panel.set_xlabel(xlabel)
    panel.set_title(title)
    panel.grid(axis='x', alpha=0.3)

# Grouped bars: precision / recall / F1 side by side for every model, in the
# row order of results_df.
metrics_to_plot = ['precision', 'recall', 'f1_score']
x = np.arange(len(results_df))
width = 0.25
grouped_panel = axes[1, 1]
for i, metric in enumerate(metrics_to_plot):
    grouped_panel.bar(x + i*width, results_df[metric], width, label=metric)
grouped_panel.set_xlabel('Modelo')
grouped_panel.set_ylabel('Score')
grouped_panel.set_title('Precision, Recall y F1-Score')
# Centre each tick under the middle bar of its three-bar group.
grouped_panel.set_xticks(x + width)
grouped_panel.set_xticklabels(results_df['model'], rotation=45, ha='right')
grouped_panel.legend()
grouped_panel.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('comprehensive_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Visualización guardada como 'comprehensive_comparison.png'")

## 6. Análisis de Cross-Validation

In [None]:
# Single source of truth for the fold count: it drives the splitter, the
# logged parameter, the x-axis of the plot and the plot title.
n_folds = 10

with mlflow.start_run(run_name="cross_validation_analysis"):

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    pipeline = create_pipeline(scaler_type='standard', model=model)

    # Shuffled, stratified folds with a fixed seed so the scores are repeatable.
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

    # Compute the summary statistics once and reuse them below.
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()
    fold_numbers = range(1, len(cv_scores) + 1)

    mlflow.log_params({
        "model": "RandomForest",
        "cv_folds": n_folds,
        "cv_strategy": "StratifiedKFold",
    })

    # Summary metrics plus one metric per fold, sent as a single batch.
    cv_metrics = {
        "cv_mean": cv_mean,
        "cv_std": cv_std,
        "cv_min": cv_scores.min(),
        "cv_max": cv_scores.max(),
    }
    cv_metrics.update(
        (f"cv_fold_{fold}", score) for fold, score in zip(fold_numbers, cv_scores)
    )
    mlflow.log_metrics(cv_metrics)

    # Per-fold accuracy with the mean and a +/- 1 standard deviation band.
    plt.figure(figsize=(10, 6))
    plt.plot(fold_numbers, cv_scores, marker='o', linestyle='-', linewidth=2, markersize=8)
    plt.axhline(y=cv_mean, color='r', linestyle='--', label=f'Mean: {cv_mean:.4f}')
    plt.fill_between(fold_numbers,
                     cv_mean - cv_std,
                     cv_mean + cv_std,
                     alpha=0.2, color='red', label=f'±1 Std: {cv_std:.4f}')
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.title(f'{n_folds}-Fold Cross-Validation Scores')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('cv_analysis.png')
    mlflow.log_artifact('cv_analysis.png')
    plt.show()
    # Release the figure once it has been saved, logged and shown.
    plt.close()

    print("Cross-Validation Results:")
    print(f"Mean Accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")
    print(f"Min Accuracy: {cv_scores.min():.4f}")
    print(f"Max Accuracy: {cv_scores.max():.4f}")
    print("\nIndividual Fold Scores:")
    for fold, score in zip(fold_numbers, cv_scores):
        print(f"Fold {fold}: {score:.4f}")

## 7. Búsqueda y Comparación de Runs

In [None]:
# Look up the notebook's experiment and list the best runs recorded in it.
experiment_name = "sklearn-classification-pipelines"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise RuntimeError(
        f"MLflow experiment {experiment_name!r} was not found; run the setup "
        "and experiment cells first and check the tracking URI."
    )

runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])
print(f"Total runs: {len(runs)}")

# The ranking needs these columns; they are absent when the experiment has no
# runs yet or no run has logged the metrics.
summary_columns = ['run_id', 'tags.mlflow.runName', 'metrics.test_accuracy', 'metrics.f1_score']
missing_columns = [column for column in summary_columns if column not in runs.columns]

if missing_columns:
    print(f"\nNo ranking available; columns missing from the search results: {missing_columns}")
else:
    print("\nTop 5 runs por test_accuracy:")
    top_runs = runs.nlargest(5, 'metrics.test_accuracy')
    print(top_runs[summary_columns])

## Resumen del Módulo 1.2

### Conceptos Clave:

1. **Pipelines de scikit-learn**
   - Encadenar preprocesamiento y modelo
   - Logging completo del pipeline con MLflow
   - Reproducibilidad garantizada

2. **Nested Runs**
   - Organizar experimentos jerárquicamente
   - Parent run para comparaciones globales
   - Child runs para configuraciones individuales

3. **Comparación de Modelos**
   - Tracking sistemático de múltiples algoritmos
   - Métricas estandarizadas
   - Visualizaciones automáticas

4. **Cross-Validation con MLflow**
   - Logging de scores por fold
   - Análisis de variabilidad

### Siguiente Paso:
En el siguiente notebook trabajaremos con regresión y técnicas avanzadas.