# 16 - DuckDB + Scikit-learn + ONNX: ML Clásico con Export

## 🎯 Objetivos
- Procesamiento de datos con DuckDB
- Feature engineering con SQL
- Entrenamiento de modelos Scikit-learn
- Exportación a ONNX para producción
- Inferencia con ONNX Runtime
- Comparación de performance Python vs ONNX
- MLflow tracking completo

## 📚 Tecnologías
- **DuckDB**: SQL analytics y feature engineering
- **Scikit-learn**: Algoritmos de ML clásico
- **ONNX**: Formato de intercambio de modelos
- **ONNX Runtime**: Inferencia optimizada
- **MLflow**: Experiment tracking

## ⭐ Complejidad: Intermedio

## 1. Instalación y Setup

In [None]:
# Install dependencies (IPython shell escape: only valid inside a notebook/IPython session)
!pip install duckdb pandas numpy scikit-learn mlflow onnx onnxruntime skl2onnx matplotlib seaborn plotly -q

In [None]:
import duckdb
import mlflow
import mlflow.sklearn
import mlflow.onnx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path
import json
import time
import warnings
warnings.filterwarnings('ignore')

# Scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve

# ONNX
import onnx
import onnxruntime as rt
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Report the versions of the core libraries that were just imported
print(f"‚úÖ DuckDB version: {duckdb.__version__}")
print(f"‚úÖ MLflow version: {mlflow.__version__}")
print(f"‚úÖ ONNX version: {onnx.__version__}")
print(f"‚úÖ ONNX Runtime version: {rt.__version__}")

## 2. Configurar MLflow

In [None]:
# Point MLflow at a local "./mlruns" tracking location and select the
# experiment under which every run of this notebook is recorded.
experiment_name = "sklearn_onnx_export"
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment(experiment_name)

print("‚úÖ MLflow configurado")
print(f"üìä Experimento: {experiment_name}")

## 3. Generar Dataset con DuckDB

In [None]:
# Connect to an in-memory DuckDB database
con = duckdb.connect(':memory:')

# Generate synthetic bank-fraud data. The seed is fixed, so the data depends
# on the exact order of the np.random calls below — do not reorder them.
np.random.seed(42)

n_samples = 10000

# Normal transactions (80%); the remaining rows are fraudulent
normal_transactions = int(n_samples * 0.8)
fraud_transactions = n_samples - normal_transactions

# Features for normal transactions
normal_data = {
    'transaction_id': range(1, normal_transactions + 1),
    'amount': np.random.lognormal(4, 1.5, normal_transactions),
    'transaction_hour': np.random.randint(6, 23, normal_transactions),
    'day_of_week': np.random.randint(0, 7, normal_transactions),
    'merchant_category': np.random.choice(['retail', 'restaurant', 'gas', 'grocery', 'online'], normal_transactions),
    'distance_from_home': np.random.exponential(10, normal_transactions),
    'distance_from_last': np.random.exponential(5, normal_transactions),
    'ratio_to_median': np.random.normal(1.0, 0.3, normal_transactions),
    'used_chip': np.random.choice([0, 1], normal_transactions, p=[0.1, 0.9]),
    'used_pin': np.random.choice([0, 1], normal_transactions, p=[0.2, 0.8]),
    'online_order': np.random.choice([0, 1], normal_transactions, p=[0.7, 0.3]),
    'is_fraud': [0] * normal_transactions
}

# Features for fraudulent transactions (deliberately different patterns)
fraud_data = {
    'transaction_id': range(normal_transactions + 1, n_samples + 1),
    'amount': np.random.lognormal(5.5, 1.2, fraud_transactions),  # Higher amounts
    'transaction_hour': np.random.randint(0, 6, fraud_transactions),  # Unusual hours (0-5)
    'day_of_week': np.random.randint(0, 7, fraud_transactions),
    'merchant_category': np.random.choice(['retail', 'online', 'gas'], fraud_transactions, p=[0.2, 0.6, 0.2]),
    'distance_from_home': np.random.exponential(50, fraud_transactions),  # Far from home
    'distance_from_last': np.random.exponential(30, fraud_transactions),  # Far from the previous transaction
    'ratio_to_median': np.random.normal(2.5, 0.5, fraud_transactions),  # High ratio
    'used_chip': np.random.choice([0, 1], fraud_transactions, p=[0.8, 0.2]),  # Mostly without chip
    'used_pin': np.random.choice([0, 1], fraud_transactions, p=[0.9, 0.1]),  # Mostly without PIN
    'online_order': np.random.choice([0, 1], fraud_transactions, p=[0.2, 0.8]),  # Mostly online
    'is_fraud': [1] * fraud_transactions
}

# Combine both classes into a single DataFrame
df_normal = pd.DataFrame(normal_data)
df_fraud = pd.DataFrame(fraud_data)
df = pd.concat([df_normal, df_fraud], ignore_index=True)

# Shuffle the rows (fixed random_state) so the classes are interleaved
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"üìä Dataset generado: {len(df):,} transacciones")
print(f"üìä Fraudes: {df['is_fraud'].sum():,} ({df['is_fraud'].mean()*100:.1f}%)")
print(f"\nüìä Primeras filas:")
print(df.head())

## 4. Feature Engineering con DuckDB

In [None]:
# Exploratory analysis with DuckDB: per-class row count and averages.
# `FROM df` names the pandas DataFrame `df` built in the previous cell; no
# table is created explicitly in this notebook.
fraud_stats = con.execute("""
    SELECT 
        is_fraud,
        COUNT(*) as count,
        ROUND(AVG(amount), 2) as avg_amount,
        ROUND(AVG(distance_from_home), 2) as avg_distance_home,
        ROUND(AVG(transaction_hour), 2) as avg_hour,
        ROUND(AVG(used_chip), 2) as chip_usage,
        ROUND(AVG(online_order), 2) as online_rate
    FROM df
    GROUP BY is_fraud
""").df()

print("üìä Estad√≠sticas por clase:")
print(fraud_stats)

# Feature engineering in SQL: log transforms (LN(x + 1)), hour/weekday flags,
# an amount * ratio interaction and a "no chip and no PIN" flag. The original
# columns are kept alongside the derived ones.
# NOTE(review): the 6-22 business-hours window matches the hour ranges used to
# generate the two classes, so is_business_hours likely separates them almost
# perfectly on this synthetic data — confirm this is intended.
df_engineered = con.execute("""
    SELECT 
        transaction_id,
        amount,
        LN(amount + 1) as log_amount,
        transaction_hour,
        CASE 
            WHEN transaction_hour BETWEEN 6 AND 22 THEN 1
            ELSE 0
        END as is_business_hours,
        day_of_week,
        CASE 
            WHEN day_of_week IN (5, 6) THEN 1
            ELSE 0
        END as is_weekend,
        merchant_category,
        distance_from_home,
        LN(distance_from_home + 1) as log_distance_home,
        distance_from_last,
        LN(distance_from_last + 1) as log_distance_last,
        ratio_to_median,
        used_chip,
        used_pin,
        online_order,
        amount * ratio_to_median as weighted_amount,
        CASE 
            WHEN used_chip = 0 AND used_pin = 0 THEN 1
            ELSE 0
        END as no_security,
        is_fraud
    FROM df
""").df()

# The reported column count includes transaction_id and the is_fraud target
print(f"\n‚úÖ Feature engineering completado")
print(f"üìä Features creadas: {df_engineered.shape[1]}")
print(f"\nüìä Nuevas features:")
print(df_engineered.columns.tolist())

## 5. Preparar Datos para ML

In [None]:
# One-hot encode the merchant category into `merchant_*` indicator columns
df_encoded = pd.get_dummies(df_engineered, columns=['merchant_category'], prefix='merchant')

# Target is the fraud flag; the features are every column except the
# transaction id and the target itself
y = df_encoded['is_fraud']
X = df_encoded.drop(columns=['transaction_id', 'is_fraud'])

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features; the scaling statistics come from the training
# split only and are then applied to both splits.
# NOTE(review): this scaler lives outside the model objects trained later, so
# anything consuming the exported models must apply the same scaling first.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"üìä Train set: {X_train.shape}")
print(f"üìä Test set: {X_test.shape}")
print(f"üìä Features: {X_train.shape[1]}")
print("\nüìä Feature names:")
print(X.columns.tolist())

## 6. Entrenar Múltiples Modelos con MLflow

In [None]:
def train_and_export_model(model, model_name, X_train, X_test, y_train, y_test, feature_names):
    """
    Train a classifier, export it to ONNX and track everything with MLflow.

    Inside a single MLflow run this fits ``model``, logs its parameters and
    classification metrics, logs both the scikit-learn and the ONNX version
    of the model, times ONNX Runtime against scikit-learn on ``X_test`` and
    stores diagnostic plots as artifacts. The ``.onnx``, ``.png`` and
    ``.csv`` files are written to the current working directory before
    being logged.

    Args:
        model: Unfitted scikit-learn classifier; it is fitted in place.
        model_name: Label used for the run name, file names and plot titles.
        X_train: Training features; must expose a 2-D ``.shape``.
        X_test: Test features; must support ``.astype(np.float32)``.
        y_train: Training labels.
        y_test: Test labels; metrics use scikit-learn's default (binary)
            averaging and column 1 of ``predict_proba`` as the score.
        feature_names: Names aligned with the feature columns; only used
            when the model exposes ``feature_importances_``.

    Returns:
        dict with keys ``model_name``, ``accuracy``, ``precision``,
        ``recall``, ``f1_score``, ``auc_roc`` (0 when the model has no
        ``predict_proba``), ``sklearn_time``, ``onnx_time`` and ``speedup``
        (``inf`` if the ONNX run measured zero elapsed time).
    """

    with mlflow.start_run(run_name=f"{model_name}_onnx_export"):

        # Log run-level parameters and the model's hyperparameters
        mlflow.log_param("model_type", model_name)
        mlflow.log_param("n_features", X_train.shape[1])
        mlflow.log_param("n_train_samples", len(X_train))
        mlflow.log_param("n_test_samples", len(X_test))

        if hasattr(model, 'get_params'):
            for param, value in model.get_params().items():
                mlflow.log_param(f"model_{param}", value)

        # Train
        print(f"\n{'='*60}")
        print(f"Entrenando: {model_name}")
        print(f"{'='*60}")

        # perf_counter is monotonic and high-resolution, unlike time.time()
        start_time = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_time

        # Predictions (probability of class 1 when the model provides it)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

        # Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        mlflow.log_metric("train_time_seconds", train_time)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        auc = None
        if y_pred_proba is not None:
            auc = roc_auc_score(y_test, y_pred_proba)
            mlflow.log_metric("auc_roc", auc)

        # Save the scikit-learn model
        mlflow.sklearn.log_model(model, "sklearn_model")

        # Export to ONNX
        print(f"\nüîÑ Exportando a ONNX...")

        # Declare the input signature: float tensor, any batch size, fixed width
        initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]

        # Convert to ONNX
        onnx_model = convert_sklearn(model, initial_types=initial_type)

        # Save the ONNX file to the working directory
        onnx_filename = f"{model_name.lower()}_model.onnx"
        onnx.save_model(onnx_model, onnx_filename)

        # Log the ONNX model in MLflow (as a model and as a raw artifact)
        mlflow.onnx.log_model(onnx_model, "onnx_model")
        mlflow.log_artifact(onnx_filename)

        print(f"‚úÖ Modelo ONNX guardado: {onnx_filename}")

        # Test ONNX inference
        print(f"\nüß™ Testeando inferencia ONNX...")

        sess = rt.InferenceSession(onnx_filename)
        input_name = sess.get_inputs()[0].name
        label_name = sess.get_outputs()[0].name

        # Inference with ONNX Runtime.
        # NOTE(review): this is the session's first run, so the timing may
        # include one-off warm-up cost — confirm before trusting the speedup.
        start_time = time.perf_counter()
        onnx_pred = sess.run([label_name], {input_name: X_test.astype(np.float32)})[0]
        onnx_inference_time = time.perf_counter() - start_time

        # Inference with scikit-learn (for comparison)
        start_time = time.perf_counter()
        sklearn_pred = model.predict(X_test)
        sklearn_inference_time = time.perf_counter() - start_time

        # Compare results
        predictions_match = np.array_equal(onnx_pred, sklearn_pred)

        # Computed once and guarded: a zero elapsed time would otherwise
        # raise ZeroDivisionError and abort the run.
        if onnx_inference_time > 0:
            speedup = sklearn_inference_time / onnx_inference_time
        else:
            speedup = float("inf")

        mlflow.log_metric("onnx_inference_time", onnx_inference_time)
        mlflow.log_metric("sklearn_inference_time", sklearn_inference_time)
        mlflow.log_metric("onnx_speedup", speedup)
        mlflow.log_metric("predictions_match", int(predictions_match))

        # Visualizations
        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('Real')
        plt.xlabel('Predicci√≥n')
        plt.savefig(f'{model_name}_confusion_matrix.png', dpi=150, bbox_inches='tight')
        mlflow.log_artifact(f'{model_name}_confusion_matrix.png')
        plt.close()

        # ROC curve, when probabilities are available
        if y_pred_proba is not None:
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            plt.figure(figsize=(8, 6))
            plt.plot(fpr, tpr, linewidth=2, label=f'AUC = {auc:.4f}')
            plt.plot([0, 1], [0, 1], 'k--', linewidth=1)
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {model_name}')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.savefig(f'{model_name}_roc_curve.png', dpi=150, bbox_inches='tight')
            mlflow.log_artifact(f'{model_name}_roc_curve.png')
            plt.close()

        # Feature importance, when the model exposes it
        if hasattr(model, 'feature_importances_'):
            importances = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)

            plt.figure(figsize=(10, 6))
            plt.barh(importances['feature'][:15], importances['importance'][:15])
            plt.xlabel('Importance')
            plt.title(f'Top 15 Feature Importance - {model_name}')
            plt.gca().invert_yaxis()
            plt.tight_layout()
            plt.savefig(f'{model_name}_feature_importance.png', dpi=150, bbox_inches='tight')
            mlflow.log_artifact(f'{model_name}_feature_importance.png')
            plt.close()

            importances.to_csv(f'{model_name}_feature_importance.csv', index=False)
            mlflow.log_artifact(f'{model_name}_feature_importance.csv')

        # Results
        print(f"\nüìä M√©tricas:")
        print(f"   Accuracy: {accuracy:.4f}")
        print(f"   Precision: {precision:.4f}")
        print(f"   Recall: {recall:.4f}")
        print(f"   F1-Score: {f1:.4f}")
        # `is not None` so that a legitimate AUC of 0.0 is still reported
        if auc is not None:
            print(f"   AUC-ROC: {auc:.4f}")

        print(f"\n‚ö° Performance:")
        print(f"   Training time: {train_time:.4f}s")
        print(f"   Sklearn inference: {sklearn_inference_time:.4f}s")
        print(f"   ONNX inference: {onnx_inference_time:.4f}s")
        print(f"   Speedup: {speedup:.2f}x")
        print(f"   Predictions match: {predictions_match}")

        return {
            'model_name': model_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'auc_roc': auc if auc is not None else 0,
            'sklearn_time': sklearn_inference_time,
            'onnx_time': onnx_inference_time,
            'speedup': speedup
        }

# Candidate classifiers, keyed by the label used for run names and file names
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'DecisionTree': DecisionTreeClassifier(max_depth=10, random_state=42)
}

# Train, export and evaluate every candidate on the scaled splits, collecting
# one summary dict per model
results = []
for model_name, model in models.items():
    results.append(
        train_and_export_model(
            model=model,
            model_name=model_name,
            X_train=X_train_scaled,
            X_test=X_test_scaled,
            y_train=y_train,
            y_test=y_test,
            feature_names=X.columns.tolist(),
        )
    )

## 7. Comparación de Modelos

In [None]:
# Build a results table from the per-model summaries
results_df = pd.DataFrame(results)

print("üìä COMPARACI√ìN DE MODELOS")
print("=" * 80)
print(results_df.to_string(index=False))

# Plot the comparison: one bar chart per classification metric on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics = ['accuracy', 'precision', 'recall', 'f1_score']
for ax, metric in zip(axes.flat, metrics):
    results_df.plot(x='model_name', y=metric, kind='bar', ax=ax, legend=False, color='skyblue')
    ax.set_title(metric.replace("_", " ").title())
    ax.set_xlabel('')
    ax.set_ylabel('Score')
    ax.set_ylim([0, 1])
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('models_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Inference-time comparison: grouped bars per model, in milliseconds
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(results_df))
width = 0.35

for offset, column, label in (
    (-width / 2, 'sklearn_time', 'Sklearn'),
    (width / 2, 'onnx_time', 'ONNX'),
):
    ax.bar(x + offset, results_df[column] * 1000, width, label=label, alpha=0.8)

ax.set_xlabel('Modelo')
ax.set_ylabel('Tiempo de Inferencia (ms)')
ax.set_title('Comparaci√≥n de Tiempo de Inferencia: Sklearn vs ONNX')
ax.set_xticks(x)
ax.set_xticklabels(results_df['model_name'], rotation=45, ha='right')
ax.legend()
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('inference_time_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

# Name the winner for each headline metric
print("\nüèÜ Mejor modelo por m√©trica:")
print(f"   Accuracy: {results_df.loc[results_df['accuracy'].idxmax(), 'model_name']}")
print(f"   F1-Score: {results_df.loc[results_df['f1_score'].idxmax(), 'model_name']}")
print(f"   AUC-ROC: {results_df.loc[results_df['auc_roc'].idxmax(), 'model_name']}")
print(f"   Speedup ONNX: {results_df.loc[results_df['speedup'].idxmax(), 'model_name']} ({results_df['speedup'].max():.2f}x)")

## 8. Demo de Inferencia ONNX en Producción

In [None]:
# Pick the model with the highest F1 score and derive the name of the ONNX
# file that was written for it during training
best_row = results_df['f1_score'].idxmax()
best_model_name = results_df.loc[best_row, 'model_name']
onnx_file = f"{best_model_name.lower()}_model.onnx"

print(f"üèÜ Mejor modelo: {best_model_name}")
print(f"üì¶ Archivo ONNX: {onnx_file}")

# Load the ONNX model into an ONNX Runtime session
sess = rt.InferenceSession(onnx_file)

# Describe the model's inputs and outputs
first_input = sess.get_inputs()[0]
print("\nüìä Informaci√≥n del modelo ONNX:")
print(f"   Inputs: {[inp.name for inp in sess.get_inputs()]}")
print(f"   Outputs: {[out.name for out in sess.get_outputs()]}")
print(f"   Input shape: {first_input.shape}")
print(f"   Input type: {first_input.type}")

# Simulated production inference
def predict_fraud_onnx(transaction_data, session=None):
    """
    Predict fraud labels with an ONNX Runtime session.

    Args:
        transaction_data: Feature values for one transaction (1-D) or a
            batch (2-D). Any array-like NumPy can convert is accepted; it is
            cast to a C-contiguous float32 array before inference.
        session: Object exposing ``get_inputs()``, ``get_outputs()`` and
            ``run()``. Defaults to the module-level ``sess``.

    Returns:
        The first output of the session for the given rows.
    """
    if session is None:
        session = sess

    input_name = session.get_inputs()[0].name
    label_name = session.get_outputs()[0].name

    # Normalise the input: float32, contiguous, and at least 2-D so that a
    # single transaction becomes a batch of one row.
    batch = np.atleast_2d(np.ascontiguousarray(transaction_data, dtype=np.float32))

    # Inference
    return session.run([label_name], {input_name: batch})[0]

# Spot-check: compare ONNX predictions with the true labels for the first
# ten test transactions
print(f"\nüß™ Testing predicciones en producci√≥n...\n")

test_samples = X_test_scaled[:10]
real_labels = y_test.values[:10]

for i, (sample, real) in enumerate(zip(test_samples, real_labels), start=1):
    prediction = predict_fraud_onnx(sample)
    status = "‚úÖ" if prediction[0] == real else "‚ùå"

    print(f"Transacci√≥n {i}: Predicci√≥n={prediction[0]}, Real={real} {status}")

# Inference benchmark: repeat the same batch many times and report the
# average latency and throughput per batch
n_iterations = 1000
test_batch = X_test_scaled[:100]

# The heading reports what is actually measured: n_iterations batches of
# len(test_batch) rows each, not n_iterations single predictions.
print(f"\n‚ö° Benchmark de inferencia ({n_iterations} batches de {len(test_batch)} transacciones):")

# perf_counter is monotonic and high-resolution, which suits benchmarking
start = time.perf_counter()
for _ in range(n_iterations):
    predict_fraud_onnx(test_batch)
total_time = time.perf_counter() - start

print(f"   Total: {total_time:.4f}s")
print(f"   Promedio: {total_time/n_iterations*1000:.4f}ms por batch")
print(f"   Throughput: {n_iterations/total_time:.2f} batches/segundo")

## 9. Resumen y Best Practices

In [None]:
print("üí° BEST PRACTICES: SKLEARN + ONNX")
print("=" * 60)

print("\n1Ô∏è‚É£ FEATURE ENGINEERING con DuckDB:")
print("   ‚úÖ Usa SQL para transformaciones complejas")
print("   ‚úÖ Aprovecha window functions para features temporales")
print("   ‚úÖ Crea features agregadas eficientemente")
print("   ‚úÖ One-hot encoding para categor√≠as")

print("\n2Ô∏è‚É£ EXPORTACI√ìN ONNX:")
print("   ‚úÖ Valida que predicciones coincidan con sklearn")
print("   ‚úÖ Define tipos de entrada correctamente")
print("   ‚úÖ Prueba con datos reales antes de producci√≥n")
print("   ‚úÖ Versiona modelos ONNX junto con c√≥digo")

print("\n3Ô∏è‚É£ PERFORMANCE:")
print("   ‚úÖ ONNX t√≠picamente 2-10x m√°s r√°pido")
print("   ‚úÖ Usa batch inference cuando sea posible")
print("   ‚úÖ Considera ONNX Runtime GPU para vol√∫menes grandes")
print("   ‚úÖ Cachea sesiones de inferencia")

print("\n4Ô∏è‚É£ DEPLOYMENT:")
print("   ‚úÖ ONNX es portable entre lenguajes/plataformas")
print("   ‚úÖ No requiere Python/sklearn en producci√≥n")
print("   ‚úÖ Ideal para edge devices y microservicios")
print("   ‚úÖ Soporta C++, C#, Java, JavaScript, etc.")

print("\n5Ô∏è‚É£ MLFLOW TRACKING:")
print("   ‚úÖ Trackea modelo sklearn y ONNX")
print("   ‚úÖ Compara tiempos de inferencia")
print("   ‚úÖ Guarda artefactos (features, visualizaciones)")
print("   ‚úÖ Versiona todo el pipeline")

print("\n" + "=" * 60)

# Final statistics: gather the summary lines first, then emit them together
summary_lines = [
    "\nüìä RESUMEN FINAL:",
    f"   Dataset: {len(df):,} transacciones",
    f"   Features: {X_train.shape[1]}",
    f"   Modelos entrenados: {len(results)}",
    f"   Modelos ONNX exportados: {len(results)}",
    f"   Promedio speedup ONNX: {results_df['speedup'].mean():.2f}x",
    f"   Mejor F1-Score: {results_df['f1_score'].max():.4f} ({best_model_name})",
]
print("\n".join(summary_lines))

# Release the DuckDB connection opened at the start of the notebook
con.close()
print("\n‚úÖ Conexi√≥n DuckDB cerrada")
print("\nüíª Ver resultados: mlflow ui --port 5000")