In [0]:
%pip install tensorflow

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf

from pyspark.sql.functions import udf, pandas_udf
from pyspark.sql.types import ArrayType, FloatType, DoubleType
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from scipy.sparse import issparse # To check if it's a sparse vector

In [0]:
# Root of the project's data volume; every other path below is derived from it.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
# Directory under the volume root that holds the prepared (Silver-tier) data.
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# Load the prepared training set from its Silver-tier Delta table.
# `spark` is not defined in this notebook — presumably the session injected by the
# notebook runtime.
silver_path = f'{VOLUME_SILVER_DIR}/training'
training_df = spark.read.format("delta").load(silver_path)

In [0]:
# Print the column names and types of the loaded Silver table; the feature
# assembly cell below selects its input columns by name from this schema.
training_df.printSchema()

In [0]:
def build_ffnn_model(input_dim):
    """Build and compile a small dense binary classifier.

    The network is Dense(256) -> Dropout(0.2) -> Dense(128) -> Dense(64), all
    ReLU, followed by a single sigmoid unit. It is compiled with the Adam
    optimizer, binary cross-entropy loss and an accuracy metric.

    NOTE(review): the Horovod training cell in this notebook builds its own
    network with a linear output and MSE loss and does not call this function.
    Confirm whether this classifier is still needed.

    Args:
        input_dim: number of input features per example.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    # First layer also declares the input width.
    model.add(layers.Dense(256, activation='relu', input_shape=(input_dim,)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    # Single sigmoid unit: the output is a probability for the positive class.
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [0]:
# Model inputs: the hashed columns first, then the numeric columns that were
# left unhashed. The order here is the order of the slots in the assembled vector.
hashed_feature_cols = [
    "hash_storenbr_family",
    "hash_city_state_type_cluster",
    "hash_state_isHoliday",
    "hash_storeNbr",
]
numeric_feature_cols = ["onpromotion", "transactions"]
feature_cols = hashed_feature_cols + numeric_feature_cols

# Packs every input column into one vector column named "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only what training needs: the date (used for the chronological split),
# the target (`sales`, renamed to `label`) and the assembled feature vector.
label_col = col("sales").alias("label")
assembled_df = assembler.transform(training_df).select("date", label_col, "features")

# Preview a few assembled rows.
assembled_df.select("features", "label").show(n=3, truncate=False)

In [0]:
from pyspark.sql.functions import lit, col, expr, date_add, to_date

# Chronological split: rows dated before CUTOFF_DATE are used for training,
# rows on or after it are held out for evaluation.
CUTOFF_DATE = "2016-01-01"
TRAIN_PATH = f"{VOLUME_SILVER_DIR}/ffnn_sales_training_data"
TEST_PATH = f"{VOLUME_SILVER_DIR}/ffnn_sales_test_data"

train_spark_df = assembled_df.filter(col("date") < lit(CUTOFF_DATE))
test_spark_df = assembled_df.filter(col("date") >= lit(CUTOFF_DATE))

print(f"Training records: {train_spark_df.count()}")
print(f"Test records: {test_spark_df.count()}")

# Persist both splits as Delta tables so the training cell can load them from
# storage. mode("overwrite") makes this cell re-runnable: without it the second
# run fails because the target path already exists. overwriteSchema lets the
# stored schema follow changes to the feature list.
(
    train_spark_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(TRAIN_PATH)
)
(
    test_spark_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(TEST_PATH)
)

In [0]:
import tensorflow as tf
import horovod.spark
import horovod.tensorflow.keras as hvd
from petastorm.spark import SparkDatasetConverter, make_spark_converter
import mlflow.tensorflow
# No longer used by this cell (training is launched through Horovod below);
# the import is kept in case other cells rely on the name.
from spark_tensorflow_distributor import MirroredStrategyRunner

# Petastorm materialises each DataFrame as Parquet under this directory and the
# workers read it back from there.
# NOTE(review): assumes the /dbfs FUSE mount is reachable from the driver and
# from every worker — adjust for your workspace.
PETASTORM_CACHE_DIR = "file:///dbfs/tmp/petastorm/cache"

# Launch settings for the distributed run. NUM_WORKERS is the number of Horovod
# processes; the cluster must have at least that many free task slots.
NUM_WORKERS = 4
EPOCHS = 10
BATCH_SIZE = 64


def _converter_from_delta(path):
    """Cache the Delta table at `path` for Petastorm and return its converter.

    Looks the Spark session up at call time instead of referencing the
    notebook's `spark` global, so functions that call this helper can still be
    serialised and shipped to workers. It only works in a process that has an
    active SparkSession (normally the driver).
    """
    from pyspark.sql import SparkSession

    session = SparkSession.getActiveSession()
    if session is None:
        raise RuntimeError(
            "No active SparkSession in this process; build the Petastorm "
            "converters on the driver and pass them to train_ffnn_horovod()."
        )
    session.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, PETASTORM_CACHE_DIR)
    return make_spark_converter(session.read.format("delta").load(path))


# --- Horovod Training Function (This runs on every worker) ---
def train_ffnn_horovod(input_dim, train_path, test_path, epochs, batch_size,
                       train_converter=None, val_converter=None):
    """Train the regression FFNN inside one Horovod process.

    Builds the network, wraps Adam in Horovod's distributed optimizer, streams
    this process's shard of the data through Petastorm, and fits the model.
    Rank 0 then logs the final validation MAE and the model to MLflow.

    Args:
        input_dim: width of the feature vector fed to the first Dense layer.
        train_path: Delta path of the training split. Only read when
            `train_converter` is not supplied.
        test_path: Delta path of the held-out split. Only read when
            `val_converter` is not supplied.
        epochs: number of training epochs.
        batch_size: batch size used by each Horovod process.
        train_converter: Petastorm converter for the training split, built on
            the driver with `make_spark_converter`. Workers cannot create one
            themselves, so distributed runs must pass it in.
        val_converter: Petastorm converter for the held-out split.

    Raises:
        RuntimeError: if a converter is missing and this process has no active
            SparkSession to build it from the corresponding path.
    """
    # Initialize Horovod
    hvd.init()

    # Pin this process to a single GPU (when GPUs are present).
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    # Fallback for direct, driver-side calls that only supply paths.
    if train_converter is None:
        train_converter = _converter_from_delta(train_path)
    if val_converter is None:
        val_converter = _converter_from_delta(test_path)

    # --- 1. Model Definition (Regression) ---
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(512, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='linear') # Regression Output
    ])

    # --- 2. Distributed Optimizer and Compilation ---
    # Scale learning rate by the number of workers
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001 * hvd.size())
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    # --- 3. Petastorm Data Loading (Distributed) ---
    # Petastorm does the sharding and batching: each process reads only its own
    # shard (cur_shard / shard_count) and receives ready-made batches.
    # NOTE(review): the explicit row-level shuffle(100) of the earlier pipeline
    # is not reproduced here — confirm Petastorm's own shuffling is sufficient,
    # or add a shuffle buffer.
    shard_args = dict(batch_size=batch_size, cur_shard=hvd.rank(), shard_count=hvd.size())

    def to_features_and_label(batch):
        # Batches arrive as namedtuples with one field per column. The reshape
        # pins the static feature width that the first Dense layer expects.
        return tf.reshape(batch.features, [-1, input_dim]), batch.label

    with train_converter.make_tf_dataset(**shard_args) as train_dataset, \
         val_converter.make_tf_dataset(**shard_args) as val_dataset:

        train_hvd_dataset = train_dataset.map(to_features_and_label)
        val_hvd_dataset = val_dataset.map(to_features_and_label)

        # --- 4. Training ---
        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
        ]

        # Explicit step counts keep every process on the same number of steps
        # per epoch. len(converter) is the row count of the cached dataset;
        # max(1, ...) avoids a zero-step epoch on a small split.
        rows_per_global_step = batch_size * hvd.size()
        train_steps = max(1, len(train_converter) // rows_per_global_step)
        val_steps = max(1, len(val_converter) // rows_per_global_step)

        history = model.fit(
            train_hvd_dataset,
            steps_per_epoch=train_steps,
            epochs=epochs,
            validation_data=val_hvd_dataset,
            validation_steps=val_steps,
            callbacks=callbacks,
            verbose=1 if hvd.rank() == 0 else 0 # Only rank 0 prints output
        )

    # --- 5. MLflow Logging and Saving (Only on rank 0) ---
    if hvd.rank() == 0:
        with mlflow.start_run() as run:
            # Log metrics
            mlflow.log_param("input_dim", input_dim)
            mlflow.log_metric("final_val_mae", history.history['val_mae'][-1])

            # Save the trained model
            mlflow.tensorflow.log_model(model, "model", registered_model_name="Sales_FFNN_Model")
            print(f"MLflow Run ID: {run.info.run_id}")


# --- 6. Launch the Distributed Training ---
# The feature width is taken from the first stored training row.
first_training_row = (
    spark.read.format("delta").load(TRAIN_PATH).select("features").first()
)
if first_training_row is None:
    raise ValueError(f"No training rows found at {TRAIN_PATH}")
INPUT_DIMENSION = len(first_training_row["features"])

# Build the Petastorm caches on the driver, where the Spark session lives.
train_converter = _converter_from_delta(TRAIN_PATH)
val_converter = _converter_from_delta(TEST_PATH)

try:
    # The training function uses the hvd.* API, so it must be started by a
    # Horovod launcher for hvd.rank() / hvd.size() to describe the whole job.
    horovod.spark.run(
        train_ffnn_horovod,
        kwargs=dict(
            input_dim=INPUT_DIMENSION,
            train_path=TRAIN_PATH,
            test_path=TEST_PATH,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            train_converter=train_converter,
            val_converter=val_converter,
        ),
        num_proc=NUM_WORKERS,
    )
finally:
    # Remove the cached Parquet copies whether or not training succeeded.
    train_converter.delete()
    val_converter.delete()