#### Data Preparation

In [0]:
df = spark.table("dl_ecommerce_idc.gold.category_daily_metrics")
df=df.toPandas()

In [0]:
df.head()

Unnamed: 0,category_code,event_date,views,carts,purchases,price
0,appliances.kitchen.steam_cooker,2019-10-14,170,7,4,130270.3
1,computers.peripherals.camera,2019-10-26,69,1,1,17180.38
2,computers.components.memory,2019-11-08,539,46,15,142884.42
3,kids.dolls,2019-10-19,498,6,8,73767.56
4,sport.bicycle,2019-11-17,2275,179,121,3572707.04


In [0]:
from sklearn.model_selection import train_test_split
X = df[["views", "carts"]]
y = df["purchases"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Train Multiple Models

In [0]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

models = {
    "linear": LinearRegression(),
    "decision_tree": DecisionTreeRegressor(max_depth=5),
    "random_forest": RandomForestRegressor(n_estimators=100)
}

#### Set MLflow experiment and log the parameters

In [0]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

mlflow.set_experiment("/Users/dheerajlakkakula1419@gmail.com/Databricks_IDC/Day13_ModelComparison")

for name, model in models.items():
    with mlflow.start_run(run_name=f"{name}_model"):
        # Log params
        mlflow.log_param("model_type", name)
        mlflow.log_param("features", "views,carts")
        mlflow.log_param("test_size", 0.2)
        # Train
        model.fit(X_train, y_train)

        # Predict
        preds = model.predict(X_test)

        # Metrics
        r2 = r2_score(y_test, preds)
        rmse = np.sqrt(mean_squared_error(y_test, preds))

        # Log metrics
        mlflow.log_metric("r2_score", r2)
        mlflow.log_metric("rmse", rmse)

        # Log trained model
        mlflow.sklearn.log_model(model,"model")

        print(f"{name} → R²: {r2:.4f}, RMSE: {rmse:.4f}")



linear → R²: 0.9669, RMSE: 151.6012




decision_tree → R²: 0.4661, RMSE: 609.3042




random_forest → R²: 0.9209, RMSE: 234.5804


#### Spark ML Pipeline

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

df = spark.table("dl_ecommerce_idc.gold.category_daily_metrics")
# Train/test split
train, test = df.randomSplit([0.8, 0.2], seed=42)

# assemling all the features 
assembler = VectorAssembler(inputCols=["views", "carts"], outputCol="features")
#  Models to compare (Spark ML)
models = {
    "LinearRegressionP": LinearRegression(featuresCol="features", labelCol="purchases"),
    "DecisionTreeP": DecisionTreeRegressor(featuresCol="features", labelCol="purchases", maxDepth=5),
    "RandomForestP": RandomForestRegressor(featuresCol="features", labelCol="purchases", numTrees=100, maxDepth=5)
}

In [0]:
import mlflow
import mlflow.spark
from pyspark.ml.evaluation import RegressionEvaluator

rmse_eval = RegressionEvaluator(labelCol="purchases",predictionCol="prediction",metricName="rmse")
r2_eval = RegressionEvaluator(labelCol="purchases",predictionCol="prediction",metricName="r2")

mlflow.set_experiment("/Users/dheerajlakkakula1419@gmail.com/Databricks_IDC/Day13_ModelComparison")


In [0]:
for name, algo in models.items():
    with mlflow.start_run(run_name=name):

        # Build pipeline = feature step + model
        pipeline = Pipeline(stages=[assembler, algo])
        # Train
        pipeline_model = pipeline.fit(train)
        # Predict
        preds = pipeline_model.transform(test)

        # Evaluate
        rmse = rmse_eval.evaluate(preds)
        r2 = r2_eval.evaluate(preds)

        # Log parameters
        mlflow.log_param("model_type", name)
        mlflow.log_param("features", "views,carts")

        # Log metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        # Log full pipeline model
        mlflow.spark.log_model(pipeline_model,"model", dfs_tmpdir="/Volumes/workspace/ecommerce_idc/ecommerce_data_idc")
        print(f"{name} → RMSE: {rmse:.2f}, R²: {r2:.4f}")




LinearRegressionP → RMSE: 131.98, R²: 0.9268




DecisionTreeP → RMSE: 447.77, R²: 0.1578




RandomForestP → RMSE: 479.03, R²: 0.2497


In [0]:
import mlflow
runs = mlflow.search_runs()
runs[["run_id", "params.model_type", "metrics.rmse", "metrics.r2"]]


Unnamed: 0,run_id,params.model_type,metrics.rmse,metrics.r2
0,9aa2bbe4c98f4b07b906d4b9f3ce9a23,RandomForestP,479.033299,0.249723
1,05905c9ab6834fbebc2c254debebaccc,DecisionTreeP,447.774314,0.157809
2,beb141cf1b704fbfa1a8f8f1e376d1a2,LinearRegressionP,131.977075,0.926837


In [0]:
best_run = runs.sort_values("metrics.rmse").iloc[0]

best_model = best_run["params.model_type"]
best_rmse = best_run["metrics.rmse"]
best_r2 = best_run["metrics.r2"]
best_run_id = best_run["run_id"]

print("Best Model Selected")
print("Model:", best_model)
print("RMSE:", round(best_rmse, 2))
print("R²:", round(best_r2, 4))
print("Run ID:", best_run_id)


Best Model Selected
Model: LinearRegressionP
RMSE: 131.98
R²: 0.9268
Run ID: beb141cf1b704fbfa1a8f8f1e376d1a2
