# Spark + Clustering + PostgreSQL + MLflow

## Objetivos
- Algoritmos de clustering con Spark ML
- Almacenamiento de resultados en PostgreSQL
- Evaluación de modelos de clustering
- Tracking con MLflow

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import mlflow
import mlflow.spark
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import psycopg2

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Session, experiment-tracking and database setup used by the cells below.

# Local Spark session using every available core. 'spark.jars' must point at
# the PostgreSQL JDBC driver jar.
# NOTE(review): '/path/to/postgresql-jdbc.jar' looks like a placeholder --
# replace it with the real jar location before running.
spark = (
    SparkSession.builder
    .appName('Clustering-Postgres-MLflow')
    .master('local[*]')
    .config('spark.jars', '/path/to/postgresql-jdbc.jar')
    .getOrCreate()
)

# MLflow tracking server and the experiment that runs are recorded under.
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('spark-clustering-postgres')

# SQLAlchemy engine for the PostgreSQL database.
POSTGRES_URL = 'postgresql://spark_user:spark_password@localhost:5432/spark_ml_db'
engine = create_engine(POSTGRES_URL)

## 1. Generar Datos Sintéticos

In [None]:
from sklearn.datasets import make_blobs

# Size of the synthetic dataset: rows, feature columns and blob centers.
n_samples, n_features, n_clusters = 10000, 10, 5

# Generate the clustered points. y_true holds the generating blob label of
# each row; the fixed random_state keeps the data reproducible.
X, y_true = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    cluster_std=1.5,
    random_state=42,
)

# Name the columns feature_0 .. feature_{n_features-1} and append a
# sequential customer_id as the last column.
feature_cols = [f'feature_{i}' for i in range(n_features)]
df_pandas = pd.DataFrame(X, columns=feature_cols).assign(customer_id=range(len(X)))

# Hand the pandas frame over to Spark.
df_spark = spark.createDataFrame(df_pandas)

print(f'Dataset: {df_spark.count()} clientes, {n_features} features')
df_spark.show(5)

## 2. Preparar Features

In [None]:
# Pack the individual feature columns into a single vector column.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol='features_raw',
)
df_assembled = assembler.transform(df_spark)

# Standardize the raw vector (centered and scaled to unit standard deviation);
# the result is written to 'features', the column the clustering cells read.
scaler = StandardScaler(
    inputCol='features_raw',
    outputCol='features',
    withMean=True,
    withStd=True,
)
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

print('‚úì Features preparadas y escaladas')
preview = df_scaled.select('customer_id', 'features')
preview.show(5, truncate=False)

## 3. K-Means Clustering

In [None]:
# Train one K-Means model per candidate K, log each as its own MLflow run and
# keep the per-K scores (plus the fitted model) in `results`.
k_values = [3, 4, 5, 6, 7, 8]
kmeans_max_iter = 20  # single source for both the estimator and the logged param
results = []

# The models below write their assignments to the 'cluster' column, so the
# evaluator has to read that column explicitly: its default predictionCol is
# 'prediction', which does not exist in the transformed DataFrame.
evaluator = ClusteringEvaluator(
    predictionCol='cluster',
    featuresCol='features',
    metricName='silhouette',
)

for k in k_values:
    with mlflow.start_run(run_name=f'kmeans-k{k}'):
        # Log params
        mlflow.log_params({'algorithm': 'KMeans', 'k': k, 'max_iter': kmeans_max_iter})

        # Train
        kmeans = KMeans(
            featuresCol='features',
            predictionCol='cluster',
            k=k,
            maxIter=kmeans_max_iter,
            seed=42,
        )
        model = kmeans.fit(df_scaled)

        # Predict: adds the 'cluster' column to the scaled data
        predictions = model.transform(df_scaled)

        # Evaluate
        silhouette = evaluator.evaluate(predictions)

        # Cost (Within Set Sum of Squared Errors) reported by the training summary
        wssse = model.summary.trainingCost

        # Cluster sizes, keyed by cluster id
        cluster_counts = predictions.groupBy('cluster').count().orderBy('cluster').collect()
        cluster_sizes = {row['cluster']: row['count'] for row in cluster_counts}

        # Log metrics
        mlflow.log_metrics({
            'silhouette_score': silhouette,
            'wssse': wssse,
            'n_iterations': model.summary.numIter
        })

        # Log cluster sizes, one metric per cluster
        for cluster_id, size in cluster_sizes.items():
            mlflow.log_metric(f'cluster_{cluster_id}_size', size)

        # Save model as an artifact of this run
        mlflow.spark.log_model(model, f'kmeans-model-k{k}')

        results.append({
            'k': k,
            'silhouette': silhouette,
            'wssse': wssse,
            'model': model
        })

        print(f'K={k}: Silhouette={silhouette:.4f}, WSSSE={wssse:.2f}')

# Pick the K with the highest silhouette score
best_result = max(results, key=lambda x: x['silhouette'])
print(f'\n‚úì Mejor K: {best_result["k"]} (Silhouette: {best_result["silhouette"]:.4f})')

## 4. Visualización: Elbow Method

In [None]:
# Two side-by-side panels: WSSSE (elbow) on the left, silhouette on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
elbow_ax, silhouette_ax = axes

# Per-K series taken from the K-Means results.
k_vals = [entry['k'] for entry in results]
wssse_vals = [entry['wssse'] for entry in results]
silhouette_vals = [entry['silhouette'] for entry in results]

# Text styling shared by both panels.
label_font = {'fontsize': 12}
title_font = {'fontweight': 'bold', 'fontsize': 14}
line_style = {'linewidth': 2, 'markersize': 8}

# Left panel: training cost against K.
elbow_ax.plot(k_vals, wssse_vals, marker='o', color='steelblue', **line_style)
elbow_ax.set_xlabel('N√∫mero de Clusters (K)', **label_font)
elbow_ax.set_ylabel('WSSSE', **label_font)
elbow_ax.set_title('Elbow Method - WSSSE', **title_font)
elbow_ax.grid(True, alpha=0.3)

# Right panel: silhouette against K, with a dashed line at the best score.
silhouette_ax.plot(k_vals, silhouette_vals, marker='s', color='coral', **line_style)
silhouette_ax.axhline(
    y=max(silhouette_vals),
    color='green',
    linestyle='--',
    alpha=0.5,
    label='M√°ximo',
)
silhouette_ax.set_xlabel('N√∫mero de Clusters (K)', **label_font)
silhouette_ax.set_ylabel('Silhouette Score', **label_font)
silhouette_ax.set_title('Silhouette Score vs K', **title_font)
silhouette_ax.legend()
silhouette_ax.grid(True, alpha=0.3)

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('clustering_evaluation.png', dpi=150)
plt.show()

## 5. Gaussian Mixture Model

In [None]:
# Fit a Gaussian Mixture Model with the K selected by the K-Means sweep and
# log it as its own MLflow run.
gmm_max_iter = 100  # single source for both the estimator and the logged param

# Dedicated evaluator that reads the 'cluster' column written by the model
# below. ClusteringEvaluator defaults to predictionCol='prediction', which
# this DataFrame does not have, so the column is named explicitly here.
gmm_evaluator = ClusteringEvaluator(
    predictionCol='cluster',
    featuresCol='features',
    metricName='silhouette',
)

with mlflow.start_run(run_name='gaussian-mixture-model'):
    k = best_result['k']

    mlflow.log_params({'algorithm': 'GMM', 'k': k, 'max_iter': gmm_max_iter})

    # Train
    gmm = GaussianMixture(
        featuresCol='features',
        predictionCol='cluster',
        k=k,
        maxIter=gmm_max_iter,
        seed=42,
    )
    gmm_model = gmm.fit(df_scaled)

    # Predict
    gmm_predictions = gmm_model.transform(df_scaled)

    # Evaluate
    gmm_silhouette = gmm_evaluator.evaluate(gmm_predictions)

    # Log likelihood reported by the training summary
    log_likelihood = gmm_model.summary.logLikelihood

    mlflow.log_metrics({
        'silhouette_score': gmm_silhouette,
        'log_likelihood': log_likelihood
    })

    mlflow.spark.log_model(gmm_model, 'gmm-model')

    print(f'GMM - Silhouette: {gmm_silhouette:.4f}')
    print(f'GMM - Log Likelihood: {log_likelihood:.2f}')

## 6. Guardar Resultados en PostgreSQL

In [None]:
# Score the data with the best K-Means model and keep the id, the assigned
# cluster and the original (unscaled) feature columns.
best_model = best_result['model']
final_predictions = best_model.transform(df_scaled)
df_to_save = final_predictions.select('customer_id', 'cluster', *feature_cols)

# Bring the result to the driver as a pandas DataFrame.
df_results = df_to_save.toPandas()

# Write the per-customer assignments, replacing any existing table. Failures
# are reported and the notebook keeps going.
customer_table_options = dict(
    if_exists='replace',
    index=False,
    method='multi',
    chunksize=1000,
)
try:
    df_results.to_sql('customer_clusters', engine, **customer_table_options)
    print(f'‚úì {len(df_results)} registros guardados en PostgreSQL (tabla: customer_clusters)')
except Exception as e:
    print(f'‚ö† Error al guardar en PostgreSQL: {e}')
    print('Aseg√∫rate de que PostgreSQL est√© corriendo y las credenciales sean correctas')

# Per-cluster statistics: customer count plus mean/std of every feature.
stats_spec = {'customer_id': 'count'}
stats_spec.update({col: ['mean', 'std'] for col in feature_cols})
cluster_stats = df_results.groupby('cluster').agg(stats_spec).reset_index()

# Flatten the two-level column labels into '<column>_<stat>'; the strip
# removes the trailing underscore left on 'cluster', whose second level is empty.
cluster_stats.columns = [
    '_'.join(label_parts).strip('_')
    for label_parts in cluster_stats.columns.values
]

try:
    cluster_stats.to_sql('cluster_statistics', engine, if_exists='replace', index=False)
    print(f'‚úì Estad√≠sticas de {len(cluster_stats)} clusters guardadas')
except Exception as e:
    print(f'‚ö† Error: {e}')

## 7. Leer desde PostgreSQL

In [None]:
# Read the saved assignments back through Spark's JDBC data source.
jdbc_options = {
    'url': 'jdbc:postgresql://localhost:5432/spark_ml_db',
    'dbtable': 'customer_clusters',
    'user': 'spark_user',
    'password': 'spark_password',
    'driver': 'org.postgresql.Driver',
}

try:
    df_from_postgres = spark.read.format('jdbc').options(**jdbc_options).load()

    print('‚úì Datos le√≠dos desde PostgreSQL:')
    df_from_postgres.show(10)

    # Number of rows per cluster, ordered by cluster id.
    print('\nüìä Distribuci√≥n de Clusters:')
    cluster_distribution = df_from_postgres.groupBy('cluster').count().orderBy('cluster')
    cluster_distribution.show()

except Exception as e:
    # Any failure is reported together with the JDBC-driver hint.
    print(f'‚ö† Error al leer desde PostgreSQL: {e}')
    print('Necesitas descargar postgresql-jdbc.jar y configurar la ruta')

## 8. Análisis de Clusters

In [None]:
# Profile each cluster: its size, its share of all customers, and the three
# features whose cluster mean deviates most from the global mean.
print('\nüìä AN√ÅLISIS DE CLUSTERS\n' + '='*60)

# These do not depend on the cluster, so they are computed once up front
# instead of once per loop iteration.
global_means = df_results[feature_cols].mean()
total_customers = len(df_results)

for cluster_id in range(best_result['k']):
    cluster_data = df_results[df_results['cluster'] == cluster_id]
    cluster_size = len(cluster_data)

    print(f'\nCluster {cluster_id}:')
    print(f'  Tama√±o: {cluster_size} clientes ({cluster_size/total_customers*100:.1f}%)')

    # Rank features by absolute distance between cluster mean and global mean
    feature_means = cluster_data[feature_cols].mean()
    deviations = (feature_means - global_means).abs().sort_values(ascending=False)

    print('  Top 3 features distintivas:')
    # Only the feature names are needed here; the printed values are the means.
    for i, feat in enumerate(deviations.head(3).index, 1):
        print(f'    {i}. {feat}: {feature_means[feat]:.2f} (global: {global_means[feat]:.2f})')

## 9. Visualización PCA de Clusters

In [None]:
from sklearn.decomposition import PCA

# Project the feature columns onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_results[feature_cols])

# Share of variance captured by each component, as percentages for the labels.
pc1_pct = pca.explained_variance_ratio_[0]*100
pc2_pct = pca.explained_variance_ratio_[1]*100

# Scatter of the 2-D projection, colored by assigned cluster.
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=df_results['cluster'],
    cmap='viridis',
    alpha=0.6,
    s=30,
)
plt.colorbar(scatter, label='Cluster')
plt.xlabel(f'PC1 ({pc1_pct:.1f}%)', fontsize=12)
plt.ylabel(f'PC2 ({pc2_pct:.1f}%)', fontsize=12)
plt.title('Visualizaci√≥n de Clusters (PCA)', fontweight='bold', fontsize=14)
plt.grid(True, alpha=0.3)

# Save the figure to disk, then display it.
plt.tight_layout()
plt.savefig('clusters_pca.png', dpi=150)
plt.show()

# Total variance retained by the two components.
print(f'‚úì Varianza explicada: {sum(pca.explained_variance_ratio_)*100:.1f}%')

## 10. Queries SQL de Ejemplo

In [None]:
# Example SQL against the customer_clusters table, keyed by a short
# description of what each query returns.
queries = {
    # Row count per cluster.
    'Clientes por cluster': """
        SELECT cluster, COUNT(*) as num_customers
        FROM customer_clusters
        GROUP BY cluster
        ORDER BY cluster;
    """,

    # The single cluster with the most rows.
    'Clientes del cluster m√°s grande': """
        SELECT cluster, COUNT(*) as num_customers
        FROM customer_clusters
        GROUP BY cluster
        ORDER BY num_customers DESC
        LIMIT 1;
    """,

    # First five customers (by customer_id) of every cluster.
    'Muestra de clientes de cada cluster': """
        SELECT *
        FROM (
            SELECT *, ROW_NUMBER() OVER (PARTITION BY cluster ORDER BY customer_id) as rn
            FROM customer_clusters
        ) t
        WHERE rn <= 5
        ORDER BY cluster, customer_id;
    """
}

# Print every query under its description, followed by a blank line.
print('\nüìù Queries SQL de Ejemplo:\n')
for title, sql_text in queries.items():
    print(f'{title}:', sql_text, '', sep='\n')

# Run the first query through pandas; failures are reported, not raised.
try:
    result = pd.read_sql(queries['Clientes por cluster'], engine)
    print('\nResultado:')
    print(result)
except Exception as e:
    print(f'‚ö† Error: {e}')

## Conclusiones

### Aprendizajes Clave

**Clustering con Spark:**
- ✅ K-Means escalable para grandes datasets
- ✅ GMM para clusters probabilísticos
- ✅ Evaluación con Silhouette Score
- ✅ Elbow method para selección de K

**Integración PostgreSQL:**
- ✅ Persistencia de resultados
- ✅ Queries SQL para análisis
- ✅ JDBC para lectura/escritura
- ✅ Escalabilidad con particionamiento

**MLflow Tracking:**
- ✅ Comparación de modelos
- ✅ Versionado de experimentos
- ✅ Reproducibilidad

### Casos de Uso
- Segmentación de clientes
- Detección de anomalías
- Análisis de comportamiento
- Recomendaciones personalizadas

### Ejercicios
1. Implementar DBSCAN distribuido
2. Clustering jerárquico con BisectingKMeans
3. Feature engineering para mejorar clusters
4. Crear dashboard en Grafana con datos de PostgreSQL
5. Implementar clustering incremental para streaming data

### Próximos Pasos
1. Implementar pipeline de actualización automática
2. Añadir monitoreo de drift de clusters
3. Crear API para asignación de clusters en tiempo real
4. Integrar con sistemas de CRM
5. Implementar A/B testing de diferentes estrategias de clustering