# MLflow + Spark Integration
Train a Spark ML model and track its parameters, metrics, and model artifact as an MLflow experiment run.

In [None]:
import os
import mlflow
from spark_config import get_spark_session

# Point the MLflow client at the tracking server: MLFLOW_TRACKING_URI wins
# when it is set, otherwise fall back to http://mlflow:5000.
tracking_uri = os.getenv('MLFLOW_TRACKING_URI', 'http://mlflow:5000')
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("spark-ml-demo")

# Spark session used by the cells below
spark = get_spark_session(app_name="MLflowSparkDemo")

In [None]:
# Create sample dataset
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import numpy as np

# Build a reproducible toy regression set: three random-normal feature columns
# and a target that is a fixed linear combination of them plus scaled noise.
np.random.seed(42)
n_samples = 1000

# The seeded global stream is consumed in this order:
# feature1 -> feature2 -> feature3 -> noise.
pdf = pd.DataFrame({
    name: np.random.randn(n_samples)
    for name in ('feature1', 'feature2', 'feature3')
})
signal = 2 * pdf['feature1'] + 3 * pdf['feature2'] - pdf['feature3']
noise = np.random.randn(n_samples) * 0.5
pdf['target'] = signal + noise

# Hand the pandas frame to Spark and preview the first five rows
df = spark.createDataFrame(pdf)
df.show(5)

In [None]:
# Combine the three feature columns into one vector column named 'features'
feature_cols = ['feature1', 'feature2', 'feature3']
vector_col = 'features'
assembler = VectorAssembler(inputCols=feature_cols, outputCol=vector_col)
df_assembled = assembler.transform(df)

# Seeded 80/20 train/test split
split_weights = [0.8, 0.2]
train_df, test_df = df_assembled.randomSplit(split_weights, seed=42)
print(f"Training set: {train_df.count()}, Test set: {test_df.count()}")

In [None]:
# Fit a linear regression inside an MLflow run, recording its hyperparameters,
# test-set scores, fitted weights and the model itself.
with mlflow.start_run(run_name="linear_regression_spark"):
    # Hyperparameters: logged first, then passed to the estimator unchanged
    params = {
        "maxIter": 100,
        "regParam": 0.1,
        "elasticNetParam": 0.0
    }
    mlflow.log_params(params)

    # The dict keys match LinearRegression's keyword names, so unpack them
    lr = LinearRegression(featuresCol='features', labelCol='target', **params)
    model = lr.fit(train_df)

    # Score the held-out split; a single evaluator serves both metrics by
    # overriding metricName per call
    predictions = model.transform(test_df)
    evaluator = RegressionEvaluator(labelCol='target', predictionCol='prediction')
    scores = {
        metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
        for metric in ('rmse', 'r2')
    }
    rmse = scores['rmse']
    r2 = scores['r2']
    mlflow.log_metrics(scores)

    # Record the fitted weights next to the hyperparameters
    fitted_coefficients = model.coefficients.toArray().tolist()
    mlflow.log_param("coefficients", str(fitted_coefficients))
    mlflow.log_param("intercept", model.intercept)

    # Store the fitted model under the run's "model" artifact path
    mlflow.spark.log_model(model, "model")

    print(f"RMSE: {rmse:.4f}")
    print(f"R2: {r2:.4f}")
    print(f"Coefficients: {model.coefficients}")
    print(f"Intercept: {model.intercept}")

In [None]:
# Load the model logged by the newest finished run and score the test split.
# Only FINISHED runs are considered: a failed or still-running run may not
# have a "model" artifact to load. Newest-first ordering is requested
# explicitly rather than relying on the default.
runs = mlflow.search_runs(
    experiment_names=["spark-ml-demo"],
    filter_string="attributes.status = 'FINISHED'",
    order_by=["attributes.start_time DESC"],
)
if runs.empty:
    # Fail with an actionable message instead of an IndexError from iloc[0]
    raise RuntimeError(
        "No finished runs found in experiment 'spark-ml-demo'; "
        "run the training cell before loading a model."
    )
latest_run_id = runs.iloc[0]['run_id']

loaded_model = mlflow.spark.load_model(f"runs:/{latest_run_id}/model")
new_predictions = loaded_model.transform(test_df)
new_predictions.select('features', 'target', 'prediction').show(5)

In [None]:
# Shut down the Spark session created in the first cell
spark.stop()