#### Model Comparison and Feature Engineering

In [0]:
# Import libraries. pandas holds the data for the scikit-learn models; the Spark ML section loads its data as a Spark DataFrame instead.
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from mlflow.models.signature import infer_signature

In [0]:
# Select the catalog and schema for this session so that later cells can refer
# to tables by bare name (e.g. spark.table("gold_events")).
spark.sql("USE CATALOG ecom_catalog")
spark.sql("USE SCHEMA ecom_schema")


DataFrame[]

In [0]:
# Load the gold table into pandas and keep only fully populated rows.
df_gold = spark.table("gold_events").toPandas()
df_gold_clean = df_gold.dropna()

# Features (views, carts) and target (purchases) for the regression task.
feature_columns = ['views', 'carts']
X = df_gold_clean[feature_columns]
y = df_gold_clean['purchases']

# Reproducible 80/20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cast the feature frames to float.
# NOTE(review): presumably so the MLflow signature inferred later is based on
# float rather than integer columns - confirm.
X_train, X_test = X_train.astype(float), X_test.astype(float)

display(X_train.head())

views,carts
11448.0,603.0
268449.0,35236.0
11136.0,744.0
36374.0,148.0
2178.0,31.0


#### Use scikit-learn (sklearn)
#### - Purpose: train multiple models quickly and find the best one by comparing them with MLflow.
#### - Runs on a single machine; works best with small to medium datasets (that fit in RAM).
#### - Great for: quick experiments, model comparison, classical ML (regression, trees, random forests).

In [0]:
# Train 3 models on the same split, evaluate each on the held-out test set,
# and track every run in MLflow so the results can be compared side by side.

# The tree-based estimators get a fixed random_state so that a fresh
# "Restart & Run All" reproduces the same fitted models and metrics.
models = {
    "linear_regression": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "random_forest": RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
}

for name, model in models.items():
    with mlflow.start_run(run_name=f"Model_{name}"):
        # Log hyperparameters: the model family plus the estimator's full
        # configuration (max_depth, n_estimators, random_state, ...), so each
        # run records the settings that produced its metrics.
        mlflow.log_param("model_type", name)
        mlflow.log_params(model.get_params())

        # Train the model
        model.fit(X_train, y_train)

        # Predict once on the test set and reuse the predictions for every
        # metric. r2_score(y_test, y_pred) is the same quantity that
        # model.score(X_test, y_test) reports for a regressor.
        y_pred = model.predict(X_test)
        score = r2_score(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("r2_score", score)
        mlflow.log_metric("mse", mean_squared_error(y_test, y_pred))
        mlflow.log_metric("mae", mean_absolute_error(y_test, y_pred))

        # Signature (input/output schema) and a one-row input example, both
        # derived from the training features and stored with the model.
        signature = infer_signature(X_train, model.predict(X_train))
        input_example = X_train.iloc[:1]

        # Log model with signature + example
        mlflow.sklearn.log_model(
            model,
            "model",
            signature=signature,
            input_example=input_example
        )

        print(f" {name:20} | R2 Score: {score:.4f}")


 linear_regression    | R2 Score: 0.9954
 decision_tree        | R2 Score: 0.9475
 random_forest        | R2 Score: 0.8967


### Spark ML
#### - Runs distributed across a cluster; designed for big data and distributed processing.
#### - Uses DataFrames and Pipelines
#### - Great for: large-scale preprocessing, training models on large datasets, production pipelines in Databricks

#### Build a Spark ML pipeline

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression as SparkLR

# Read the gold table as a Spark DataFrame, drop rows containing nulls, and
# carve out an 80/20 train/test partition with a fixed seed.
spark_df = spark.table("gold_events").na.drop()
train_df, test_df = spark_df.randomSplit([0.8, 0.2], seed=42)

# Stage 1: pack the raw feature columns into the single vector column that the
# regression stage reads its features from.
assembler = VectorAssembler(inputCols=["views", "carts"], outputCol="features")

# Stage 2: regularised linear regression predicting purchases.
lr_spark = SparkLR(
    featuresCol="features",
    labelCol="purchases",
    maxIter=10,
    regParam=0.3,
)

# Chain both stages so they are fitted and applied together.
pipeline = Pipeline(stages=[assembler, lr_spark])

print("Training Spark ML Pipeline...")
spark_model = pipeline.fit(train_df)

# Score the held-out rows; transform() adds a "prediction" column.
predictions = spark_model.transform(test_df)

# R2 on the test partition.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="purchases",
    predictionCol="prediction",
)
r2_spark = evaluator.evaluate(predictions)
print(f"Spark Linear Regression R2: {r2_spark:.4f}")

# Show a small sample of actual vs. predicted purchases.
preview_columns = ["category_code", "views", "carts", "purchases", "prediction"]
display(predictions.select(*preview_columns).limit(10))

Training Spark ML Pipeline...
Spark Linear Regression R2: 0.9862


category_code,views,carts,purchases,prediction
electronics.tablet,8332,544,301,360.26051138710926
computers.peripherals.keyboard,21615,1599,1006,1026.881325182106
computers.peripherals.monitor,34169,2882,1658,1839.056542204622
electronics.audio.subwoofer,87516,4456,4082,2819.752124268129
computers.components.cooler,7900,562,342,371.906485863043
computers.components.power_supply,7981,644,415,424.1130895245028
computers.components.cpu,23803,1044,441,672.4002950208244
computers.components.memory,17747,1414,887,910.6220207763668
electronics.video.tv,386516,56813,32496,36051.41542169521
electronics.audio.acoustic,23794,302,216,199.6948380867792


#### Pipelines: A Pipeline (Assembler -> Model) is safer than running the steps separately because it guarantees that exactly the same feature transformations are applied to both the training data and the test data, preventing train/test skew and leakage.

#### MLflow for scikit‑learn → logs model parameters and metrics, saves the trained model, and can register it in the Model Registry
#### MLflow for Spark ML → logs parameters and metrics, and saves the entire pipeline (VectorAssembler + model + transformations)