#### MLflow helps keep track of different model runs by recording their parameters and monitoring their performance.
It also stores models in an organized way, making them easy to reuse, compare, or deploy.

In [0]:
import numpy as np

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [0]:
# Select the working catalog and schema for this session so that the
# following cells can refer to tables by their bare names.
use_catalog_stmt = "USE CATALOG ecom_catalog"
use_schema_stmt = "USE SCHEMA ecom_schema"

spark.sql(use_catalog_stmt)
spark.sql(use_schema_stmt)


DataFrame[]

In [0]:
%sql  show tables
-- Lists the tables visible in the currently selected schema.

database,tableName,isTemporary
ecom_schema,bronze_events,False
ecom_schema,gold_events,False
ecom_schema,gold_ml_features,False
ecom_schema,gold_product_performance,False
ecom_schema,silver_events,False
ecom_schema,silver_events_partioned,False


In [0]:
%sql
-- Preview two rows of gold_events to inspect its columns before modelling.
select * from gold_events limit 2

category_id,category_code,views,carts,purchases,revenue,cart_to_purchase_rate
2088750570935419494,construction.tools.painting,28,0,0,,0.0
2106075725441269865,,16077,291,181,7139.08,62.19931271477663


#### Load and prepare data

In [0]:
# Pull the gold_events table into pandas and build the modelling inputs.
df_gold = spark.table("gold_events").toPandas()

# Replace every missing value with 0 before selecting features and target
df_gold_clean = df_gold.fillna(0)

feature_columns = ["views", "carts"]
target_column = "purchases"
X = df_gold_clean[feature_columns]
Y = df_gold_clean[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

display(X_train.head())

views,carts
5428,38
9109,150
280,4
39835,2442
15780,185


#### MLflow experiment

In [0]:
# Linear Regression baseline, trained and tracked as a single MLflow run.
# Uses X_train / X_test / Y_train / Y_test from the data-preparation cell;
# r2_score, mean_squared_error, infer_signature and np come from the
# notebook's import cell.

# MLflow experiment
with mlflow.start_run(run_name="linear_regression_v1"):
    # Log parameters
    mlflow.log_param("model_type", "LinearRegression")
    # Must match the test_size passed to train_test_split when the data was split.
    mlflow.log_param("test_size", 0.2)

    # Train
    model = LinearRegression()
    model.fit(X_train, Y_train)

    # Evaluate: predict once on the hold-out set and derive every metric from
    # the same predictions instead of re-predicting for each metric.
    Y_pred_test = model.predict(X_test)

    # Equivalent to model.score(X_test, Y_test) for a regressor (R²).
    score = r2_score(Y_test, Y_pred_test)
    mlflow.log_metric("r2_score", score)

    mse_value = mean_squared_error(Y_test, Y_pred_test)
    mlflow.log_metric("mse", mse_value)
    mlflow.log_metric("rmse", np.sqrt(mse_value))

    # Signature (input/output schema) inferred from the training data, plus a
    # one-row input example; both are stored alongside the model.
    signature = infer_signature(X_train, model.predict(X_train))
    input_example = X_train.iloc[:1]

    # Log model with signature + example
    mlflow.sklearn.log_model(
        model,
        "model",
        signature=signature,
        input_example=input_example
    )

# Printed after the run has been closed by the `with` block.
print(f"R² Score: {score:.4f}")



R² Score: 0.9810
