In [0]:
%pip install typing_extensions mlflow

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

client = MlflowClient()

In [0]:
all_experiments = client.search_experiments()

print(all_experiments)

[<Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/4230816160263178', creation_time=1754017391097, experiment_id='4230816160263178', last_update_time=1754017393582, lifecycle_stage='active', name='/Shared/MLflow-CE/extras/2_mlflow_project', tags={'mlflow.experiment.sourceName': '/Shared/MLflow-CE/extras/2_mlflow_project',
 'mlflow.experimentType': 'NOTEBOOK',
 'mlflow.ownerEmail': 'dimitar_pg13@hotmail.com',
 'mlflow.ownerId': '3716595958488557'}>, <Experiment: artifact_location='dbfs:/databricks/mlflow-tracking/4230816160263163', creation_time=1753938387036, experiment_id='4230816160263163', last_update_time=1753984401622, lifecycle_stage='active', name='/Shared/MLflow-CE/extras/1_get_started_fun_lab_lr', tags={'mlflow.experiment.sourceName': '/Shared/MLflow-CE/extras/1_get_started_fun_lab_lr',
 'mlflow.experimentType': 'NOTEBOOK',
 'mlflow.ownerEmail': 'dimitar_pg13@hotmail.com',
 'mlflow.ownerId': '3716595958488557'}>, <Experiment: artifact_location='dbfs:/databricks/

In [0]:
default_experiment = [
    {"name": experiment.name, "lifecycle_stage": experiment.lifecycle_stage}
    for experiment in all_experiments
    if experiment.name == "/Shared/MLflow-CE/extras/0_get_started_fun_lab"
][0]

pprint(default_experiment)

{'lifecycle_stage': 'active',
 'name': '/Shared/MLflow-CE/extras/0_get_started_fun_lab'}


Provide an experiment description that will appear on the UI

In [0]:
experiment_description = (
    "This is grocery-forecasting experiment"
    "The experiment contains the produce models for apples"
)


Provide searchable tags that define the characteristics of Runs that will be part of this experiemnt.


In [0]:
experiment_tags = {
    "project_name": "grocery-forecasting",
    "store_dept": "produce",
    "team": "stores-ml",
    "project_quarter": "Q3-2023",
    "mlflow.note.content": experiment_description,
}


Create an experiment , providing a unique name


In [0]:
produce_apples_experiment = client.create_experiment(
    name="/Users/dimitar_pg13@hotmail.com/Apple_Models", tags=experiment_tags
)

In [0]:
apples_experiment = client.search_experiments(
    filter_string="tags.`project_name` = 'grocery-forecasting'"
)

print(vars(apples_experiment[0]))

{'_experiment_id': '2387429575046603', '_name': '/Users/dimitar_pg13@hotmail.com/Apple_Models', '_artifact_location': 'dbfs:/databricks/mlflow-tracking/2387429575046603', '_lifecycle_stage': 'active', '_tags': {'mlflow.experiment.sourceName': '/Users/dimitar_pg13@hotmail.com/Apple_Models', 'mlflow.ownerId': '3716595958488557', 'mlflow.ownerEmail': 'dimitar_pg13@hotmail.com', 'mlflow.experimentType': 'MLFLOW_EXPERIMENT', 'mlflow.experimentKind': 'custom_model_development', 'mlflow.note.content': 'This is grocery-forecasting experimentThe experiment contains the produce models for apples', 'project_name': 'grocery-forecasting', 'project_quarter': 'Q3-2023', 'store_dept': 'produce', 'team': 'stores-ml'}, '_creation_time': 1754098322707, '_last_update_time': 1754098322707}


### Synthetic Data Creation
In order to produce some meaningful data (and a model) for us to log to MLflow, we'll use a apple-related dataset. 



#### Defining a dataset generator
For our examples to work, we're going to need something that can actually fit, but not something that fits too well. We're going to be training multiple iterations in order to show the effect of modifying our model's hyperparameters, so there needs to be some amount of unexplained variance in the feature set. However, we need some degree of correlation between our target variable (demand, in the case of our apples sales data that we want to predict) and the feature set.

We can introduce this correlation by crafting a relationship between our features and our target. The random elements of some of the factors will handle the unexplained variance portion.

In [0]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000, n_rows: int = 5000
):
    """
    Generates a synthetic dataset for predicting apple sales demand with seasonality
    and inflation.

    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend flag, holiday flag,
    promotional flag, price_per_kg, and the previous day's demand. The target variable,
    'demand', is generated based on a combination of these features with some added noise.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_seasonality(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility
    np.random.seed(9999)

    # Create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (years)
    df["inflation_multiplier"] = (
        1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03
    )

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df[
        "inflation_multiplier"
    ]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(
        method="bfill", inplace=True
    )  # fill the first row

    # Drop temporary columns
    df.drop(columns=["inflation_multiplier", "harvest_effect", "month"], inplace=True)

    return df

Generate data and save the result

In [0]:
data = generate_apple_sales_data_with_promo_adjustment(base_demand=1_000, n_rows=1_000)

data[-20:]

Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
980,2025-07-14 03:38:33.456903,34.130183,1.454065,0,0,1.449177,0,999.30629,1356.418398
981,2025-07-15 03:38:33.456902,32.353643,9.462859,0,0,2.856503,0,842.129427,999.30629
982,2025-07-16 03:38:33.456901,18.816833,0.39147,0,0,1.326429,0,990.616709,842.129427
983,2025-07-17 03:38:33.456900,34.533012,2.120477,0,0,0.970131,0,1068.802075,990.616709
984,2025-07-18 03:38:33.456899,23.057202,2.365705,0,0,1.049931,0,1019.486305,1068.802075
985,2025-07-19 03:38:33.456898,34.810165,3.089005,1,0,2.035149,0,1329.564672,1019.486305
986,2025-07-20 03:38:33.456898,29.208905,3.673292,1,0,2.518098,0,1413.143402,1329.564672
987,2025-07-21 03:38:33.456897,16.428676,4.077782,0,0,1.268979,0,1093.207186,1413.143402
988,2025-07-22 03:38:33.456896,32.067512,2.734454,0,0,0.762317,0,1069.939894,1093.207186
989,2025-07-23 03:38:33.456895,31.938203,13.883486,0,0,1.153301,0,994.40954,1069.939894


### Using MLflow Tracking to keep track of training
Now that we have our data set and have seen a little bit of how runs are recorded, let's dive in to using MLflow to tracking a training iteration.

To start with, we will need to import our required modules.

In [0]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Since we are running inside databricks we do not need to specify full uri. However, if we run this notebook locally we will need to set the following values

```python
os.environ["DATABRICKS_HOST"] = "https://dbc-1234567890123456.cloud.databricks.com" # set to your server URI
os.environ["DATABRICKS_TOKEN"] = "dapixxxxxxxxxxxxx"

mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/your-experiment")
```
For details refer to https://docs.databricks.com/aws/en/mlflow/tracking#where-mlflow-runs-are-logged

In [0]:
mlflow.set_tracking_uri("databricks")

import os
os.environ["DATABRICKS_TOKEN"] = "<your-access-token>"


We will define a few more constants that we're going to be using when logging our training events to MLflow in the form of runs. We'll start by defining an Experiment that will be used to log runs to. There is a parent-child relationship of Experiments to Runs. The utility of this relationship will be used once we start iterating over some ideas and need to compare the results of our tests.

In [0]:
# Sets the current active experiment to the "Apple_Models" experiment and
# returns the Experiment metadata
apple_experiment = mlflow.set_experiment("/Users/dimitar_pg13@hotmail.com/Apple_Models")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "apples_rf_test"

# Define an artifact path that the model will be saved to.
artifact_path = "rf_apples"

With these variables defined, we can start training a model.

Firstly, let's look at what we're going to be running. Following the code display, we'll look at an annotated version of the code.

In [0]:
# Split the data into features and target and drop irrelevant date field and target field
X = data.drop(columns=["date", "demand"])
y = data["demand"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestRegressor
rf = RandomForestRegressor(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use
    mlflow.sklearn.log_model(sk_model=rf, input_example=X_val, name=artifact_path)

🔗 View Logged Model at: https://dbc-b2d30165-76df.cloud.databricks.com/ml/experiments/2387429575046603/models/m-2b6882959fc442a3a26fa33cdb5f3111?o=4476931374519718
