# Spark + Scikit-learn + MLflow: Machine Learning Clásico Distribuido

## Objetivos
- Comparar Spark ML vs Scikit-learn
- Pipelines de ML híbridos
- Hyperparameter tuning distribuido
- Modelo stacking y ensembles

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import RandomForestClassifier as SparkRF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

import mlflow
import mlflow.sklearn
import mlflow.spark
import numpy as np
import pandas as pd

In [None]:
# Session and tracking setup: a local-mode Spark session ('local[*]') and an
# MLflow client pointed at the tracking server on localhost:5000. All runs
# in this notebook are recorded under a single experiment.
spark = (
    SparkSession.builder
    .appName('Sklearn-Spark-MLflow')
    .master('local[*]')
    .getOrCreate()
)
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('spark-sklearn-comparison')

In [None]:
# Build a synthetic 3-class classification dataset and mirror it into Spark.
from sklearn.datasets import make_classification

X, y = make_classification(
    n_samples=20000, n_features=30,
    n_informative=20, n_redundant=5,
    n_classes=3, random_state=42,
)

# One named column per feature, plus the target in a 'label' column; the
# same table is then handed to Spark so both frameworks see identical data.
feature_cols = [f'feature_{i}' for i in range(X.shape[1])]
df_pandas = pd.DataFrame(X, columns=feature_cols).assign(label=y)
df_spark = spark.createDataFrame(df_pandas)

# Sanity check: row count, feature count and per-class distribution.
print(f'Dataset: {df_spark.count()} registros, {len(feature_cols)} features')
df_spark.groupBy('label').count().show()

## 1. Modelo con Spark ML

In [None]:
# Prepare data for Spark ML: pack the individual feature columns into the
# single vector column ('features') that Spark ML estimators consume.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_assembled = assembler.transform(df_spark)

# 80/20 train/test split with a fixed seed.
train_spark, test_spark = df_assembled.randomSplit([0.8, 0.2], seed=42)

# Train a Spark random forest inside an MLflow run so that parameters,
# metrics and the fitted model are all recorded together.
with mlflow.start_run(run_name='spark-ml-random-forest') as run:
    params = {'numTrees': 100, 'maxDepth': 10}
    mlflow.log_params(params)
    mlflow.set_tag('framework', 'spark-ml')

    # The keys of `params` are exactly the estimator's keyword arguments,
    # so they can be forwarded directly.
    rf_spark = SparkRF(featuresCol='features', labelCol='label', **params)

    model_spark = rf_spark.fit(train_spark)
    predictions = model_spark.transform(test_spark)

    # One evaluator serves both metrics; the metric is selected per call
    # through a param-map override instead of mutating the evaluator.
    evaluator = MulticlassClassificationEvaluator(
        labelCol='label',
        predictionCol='prediction',
    )
    scores = {
        metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
        for metric in ('accuracy', 'f1')
    }
    accuracy, f1 = scores['accuracy'], scores['f1']

    mlflow.log_metrics(scores)
    mlflow.spark.log_model(model_spark, 'spark-rf-model')

    print(f'Spark ML - Accuracy: {accuracy:.4f}, F1: {f1:.4f}')
    # Keep the run id in a notebook-level variable for later reference.
    spark_run_id = run.info.run_id

## 2. Modelos con Scikit-learn

In [None]:
# Prepare data for scikit-learn and train several baseline classifiers,
# logging each one as its own MLflow run.
#
# The metric functions are imported once here, at the top of the cell,
# rather than inside the training loop: the import is loop-invariant, and
# binding the names up front means they exist even if the loop body never
# runs.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
# NOTE: this rebinds the name `StandardScaler`, shadowing the
# pyspark.ml.feature.StandardScaler imported at the top of the notebook.
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply it to both splits,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train multiple models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# Maps model name -> dict of test-set metrics; read by the comparison cell.
results = {}

for name, model in models.items():
    with mlflow.start_run(run_name=f'sklearn-{name.lower()}'):
        mlflow.set_tag('framework', 'sklearn')
        mlflow.log_param('model_type', name)

        # Train
        model.fit(X_train_scaled, y_train)

        # Predict on the held-out split
        y_pred = model.predict(X_test_scaled)

        # Metrics: F1, precision and recall use weighted averaging over the
        # three classes.
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_weighted': f1_score(y_test, y_pred, average='weighted'),
            'precision': precision_score(y_test, y_pred, average='weighted'),
            'recall': recall_score(y_test, y_pred, average='weighted')
        }

        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model, f'{name.lower()}-model')

        results[name] = metrics
        print(f'{name} - Accuracy: {metrics["accuracy"]:.4f}, F1: {metrics["f1_weighted"]:.4f}')

## 3. Hyperparameter Tuning con GridSearchCV

In [None]:
# Grid search over Random Forest hyperparameters, tracked as one MLflow run.
#
# f1_score is imported here so this cell does not depend on another cell
# having bound the name; without it, running this cell on its own fails
# with a NameError only after the whole grid search has finished.
from sklearn.metrics import f1_score

# 3 x 3 x 3 = 27 candidate configurations, each fitted cv=3 times.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10]
}

with mlflow.start_run(run_name='sklearn-rf-gridsearch'):
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=3,
        scoring='f1_weighted',
        n_jobs=-1,  # use all available cores for the candidate fits
        verbose=1
    )

    grid_search.fit(X_train_scaled, y_train)

    # Record the winning configuration and its cross-validated score
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric('best_cv_score', grid_search.best_score_)

    # Evaluate the best model on the held-out test split
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    test_f1 = f1_score(y_test, y_pred, average='weighted')

    mlflow.log_metric('test_f1', test_f1)
    mlflow.sklearn.log_model(best_model, 'best-rf-model')

    print(f'Mejores parámetros: {grid_search.best_params_}')
    print(f'CV Score: {grid_search.best_score_:.4f}')
    print(f'Test F1: {test_f1:.4f}')

## 4. Ensemble Stacking

In [None]:
# Stacking ensemble: two tree-based base models feed a logistic-regression
# meta-learner; the whole ensemble is tracked as one MLflow run.
from sklearn.ensemble import StackingClassifier
# Imported here so this cell does not depend on another cell having bound
# these names (otherwise a NameError surfaces only after the ensemble has
# been trained).
from sklearn.metrics import accuracy_score, f1_score

with mlflow.start_run(run_name='sklearn-stacking-ensemble'):
    # Base models
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ]

    # Meta-learner, trained on cross-validated (cv=3) predictions of the
    # base models
    stacking = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=1000),
        cv=3
    )

    # Train and predict on the held-out split
    stacking.fit(X_train_scaled, y_train)
    y_pred = stacking.predict(X_test_scaled)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')

    mlflow.log_params({'ensemble_type': 'stacking', 'n_base_models': len(estimators)})
    mlflow.log_metrics({'accuracy': acc, 'f1': f1})
    mlflow.sklearn.log_model(stacking, 'stacking-model')

    print(f'Stacking Ensemble - Accuracy: {acc:.4f}, F1: {f1:.4f}')
    print('\nClassification Report:')
    print(classification_report(y_test, y_pred))

## 5. Comparación de Resultados

In [None]:
# Side-by-side bar charts (accuracy and weighted F1) for the scikit-learn
# models collected in `results`, followed by a plain-text summary.
import matplotlib.pyplot as plt

model_names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in model_names]
f1_scores = [results[name]['f1_weighted'] for name in model_names]

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# (axis, values, y-label, title, bar colour) for each panel.
panels = [
    (ax[0], accuracies, 'Accuracy', 'Accuracy por Modelo', 'steelblue'),
    (ax[1], f1_scores, 'F1-Score', 'F1-Score por Modelo', 'coral'),
]

for axis, values, ylabel, title, color in panels:
    axis.bar(model_names, values, color=color)
    axis.set_title(title, fontweight='bold')
    axis.set_ylabel(ylabel)
    # Keep the zoomed-in 0.7-1.0 view while every score fits in it; if any
    # score is below 0.7, lower the bottom of the axis so that bar is not
    # clipped out of the plot entirely.
    lowest = min(values, default=0.7)
    lower = 0.7 if lowest >= 0.7 else max(0.0, lowest - 0.05)
    axis.set_ylim([lower, 1.0])
    axis.grid(axis='y', alpha=0.3)

plt.tight_layout()
# Save before show(): some backends clear the figure once it is displayed.
plt.savefig('model_comparison.png', dpi=150)
plt.show()

print('\n📊 Resumen de Resultados:')
print('='*60)
for name, metrics in results.items():
    print(f'{name:20s} - Acc: {metrics["accuracy"]:.4f}, F1: {metrics["f1_weighted"]:.4f}')

## Conclusiones

### Spark ML vs Scikit-learn

**Spark ML:**
- ✅ Escalabilidad para grandes datasets
- ✅ Procesamiento distribuido
- ❌ Menos algoritmos disponibles
- ❌ API menos intuitiva

**Scikit-learn:**
- ✅ Amplia variedad de algoritmos
- ✅ API simple y consistente
- ✅ Mejor para datasets pequeños/medianos
- ❌ No distribuido (limitado a memoria)

### Recomendaciones
- Datasets < 100GB: Scikit-learn
- Datasets > 100GB: Spark ML
- Prototipos rápidos: Scikit-learn
- Producci√≥n a escala: Spark ML

### Ejercicios
1. Implementar feature selection distribuido
2. Crear pipelines de preprocesamiento complejos
3. Implementar custom estimators
4. Comparar tiempos de ejecución en datasets grandes
5. Implementar AutoML con Hyperopt