# Spark + TensorFlow + MLflow

Este notebook demuestra:
- Integración de TensorFlow con Spark
- Procesamiento distribuido de datos
- Entrenamiento de modelos con TensorFlow 2.x
- Tracking con MLflow

In [None]:
import tensorflow as tf
from pyspark.sql import SparkSession
import mlflow
import mlflow.tensorflow
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Report the TensorFlow version and whether at least one GPU is visible.
# `tf.test.is_gpu_available()` is deprecated; listing the physical GPU devices
# is the supported replacement and does not initialise the devices.
print(f'TensorFlow version: {tf.__version__}')
print(f"GPU available: {bool(tf.config.list_physical_devices('GPU'))}")

In [None]:
# Configuration: start (or reuse) a local Spark session that uses every
# available core, then point MLflow at the tracking server and experiment.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('TensorFlow-Spark')
    .getOrCreate()
)

# The tracking URI must be set before the experiment is selected, because the
# experiment is looked up on that server.
mlflow.set_tracking_uri('http://localhost:5000')
mlflow.set_experiment('spark-tensorflow')

In [None]:
# Generate a synthetic regression dataset (10,000 samples, 20 features).
from sklearn.datasets import make_regression

X, y = make_regression(
    n_samples=10_000,
    n_features=20,
    noise=10,
    random_state=42,
)

# Only the feature matrix X is loaded into Spark; the target y is not part of
# the Spark DataFrame.
df = spark.createDataFrame(pd.DataFrame(data=X))
df.show(n=5)

In [None]:
# TensorFlow model
def create_model(input_dim):
    """Build and compile a fully connected regression network.

    The network stacks Dense layers of 128, 64 and 32 ReLU units, with a
    Dropout of 0.3 after each of the first two, and ends in a single linear
    output unit.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        mean squared error loss and mean absolute error as a metric.
    """
    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input object rather than
        # the legacy `input_dim=` kwarg on the first Dense layer, which newer
        # Keras versions warn about.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Instantiate the network with one input per feature column of X, then print
# its layer-by-layer summary.
model = create_model(input_dim=X.shape[1])
model.summary()

In [None]:
# Train with MLflow tracking
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Single source of truth for the training hyperparameters: the same values are
# logged to MLflow and passed to `model.fit`, so they cannot drift apart.
BATCH_SIZE = 32
MAX_EPOCHS = 50
EARLY_STOPPING_PATIENCE = 5


def _log_training_history(history):
    """Log every per-epoch Keras metric in `history` to the active MLflow run."""
    for metric_name, values in history.history.items():
        for epoch, value in enumerate(values):
            mlflow.log_metric(metric_name, float(value), step=epoch)
    # Early stopping can end training before MAX_EPOCHS, so record how many
    # epochs actually ran.
    mlflow.log_metric('epochs_trained', len(history.epoch))


with mlflow.start_run(run_name='tensorflow-regression') as run:
    # Log params. 'layers' and 'dropout' describe the architecture built by
    # create_model and have to be kept in sync with it by hand.
    mlflow.log_params({
        'layers': '128-64-32',
        'dropout': 0.3,
        'optimizer': 'adam',
        'batch_size': BATCH_SIZE,
        'epochs': MAX_EPOCHS,
    })

    # Callbacks: stop once val_loss has not improved for the configured number
    # of epochs and roll back to the best weights seen.
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=EARLY_STOPPING_PATIENCE,
        restore_best_weights=True,
    )

    # Train; 20% of the training split is held out for validation.
    history = model.fit(
        X_train, y_train,
        validation_split=0.2,
        epochs=MAX_EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stop],
        verbose=1,
    )
    _log_training_history(history)

    # Evaluate on the held-out test split (returns loss, then the 'mae' metric).
    test_loss, test_mae = model.evaluate(X_test, y_test)
    mlflow.log_metrics({'test_loss': test_loss, 'test_mae': test_mae})

    # Log model
    mlflow.tensorflow.log_model(model, 'model')

    print(f'Test MAE: {test_mae:.4f}')
    print(f'Run ID: {run.info.run_id}')

## Conclusiones

- TensorFlow 2.x se integra fácilmente con Spark
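- MLflow registra los parámetros, las métricas y el modelo de Keras; con `mlflow.tensorflow.autolog()` este registro puede hacerse de forma automática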
- Ideal para modelos de deep learning en producción

### Ejercicios
1. Implementar un modelo CNN para imágenes
2. Usar TensorFlow Datasets con Spark
3. Implementar transfer learning con modelos pre-entrenados
4. Crear un pipeline de preprocesamiento con tf.data