In [1]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor

In [3]:
client = MlflowClient(tracking_uri="http://127.0.0.1:5000")

In [4]:
# Search experiments without providing query terms behaves effectively as a 'list' action

all_experiments = client.search_experiments()

print(all_experiments)


[<Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1704948514681, experiment_id='0', last_update_time=1704948514681, lifecycle_stage='active', name='Default', tags={}>]


In [5]:
# Extract the experiment name and lifecycle_stage

default_experiment = [
    {"name": experiment.name, "lifecycle_stage": experiment.lifecycle_stage}
    for experiment in all_experiments
    if experiment.name == "Default"
][0]

pprint(default_experiment)


{'lifecycle_stage': 'active', 'name': 'Default'}


In [8]:
experiment_description = (
    "This is the grocery forecasting project. "
    "This experiment contains the produce models for apples"
)

experiment_tags = {
    "project_name": "grocery_forecasting",
    "store_dept": "produce",
    "team": "store-ml",
    "project_quarter": "Q1-2024",
    "mlflow.note.content": experiment_description
}

produce_apples_experiment = client.create_experiment(
    name="Apple_models",
    tags=experiment_tags,
)

In [19]:
apples_experiment = client.search_experiments(
    filter_string="tags.project_quarter = 'Q1-2024'"
)

pprint(apples_experiment[0])

<Experiment: artifact_location='mlflow-artifacts:/600722188044428228', creation_time=1704948866809, experiment_id='600722188044428228', last_update_time=1704948866809, lifecycle_stage='active', name='Apple_models', tags={'mlflow.note.content': 'This is the grocery forecasting project. This '
                        'experiment contains the produce models for apples',
 'project_name': 'grocery_forecasting',
 'project_quarter': 'Q1-2024',
 'store_dept': 'produce',
 'team': 'store-ml'}>


In [20]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


def generate_apple_sales_data_with_promo_adjustment(base_demand: int = 1000, n_rows: int = 5000):
    """
    Generates a synthetic dataset for predicting apple sales demand with seasonality and inflation.

    This function creates a pandas DataFrame with features relevant to apple sales.
    The features include date, average_temperature, rainfall, weekend flag, holiday flag,
    promotional flag, price_per_kg, and the previous day's demand. The target variable,
    'demand', is generated based on a combination of these features with some added noise.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_seasonality(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility
    np.random.seed(9999)

    # Create date range
    dates = [datetime.now() - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (years)
    df["inflation_multiplier"] = 1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df[
        "inflation_multiplier"
    ]  # adding random noise

    # Add previous day's demand
    df["previous_days_demand"] = df["demand"].shift(1)
    df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row

    # Drop temporary columns
    df.drop(columns=["inflation_multiplier", "harvest_effect", "month"], inplace=True)

    return df



In [21]:
data = generate_apple_sales_data_with_promo_adjustment(
    base_demand=1000, n_rows=1000
)

data[-20:]

  df["previous_days_demand"].fillna(method="bfill", inplace=True)  # fill the first row


Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
980,2023-12-23 10:34:51.617439,34.130183,1.454065,1,0,1.449177,0,1289.802447,1001.085782
981,2023-12-24 10:34:51.617438,32.353643,9.462859,1,0,2.856503,0,1136.951553,1289.802447
982,2023-12-25 10:34:51.617437,18.816833,0.39147,0,0,1.326429,0,963.352029,1136.951553
983,2023-12-26 10:34:51.617435,34.533012,2.120477,0,0,0.970131,0,1039.385504,963.352029
984,2023-12-27 10:34:51.617434,23.057202,2.365705,0,0,1.049931,0,991.427049,1039.385504
985,2023-12-28 10:34:51.617433,34.810165,3.089005,0,0,2.035149,0,974.971149,991.427049
986,2023-12-29 10:34:51.617431,29.208905,3.673292,0,0,2.518098,0,1056.249547,974.971149
987,2023-12-30 10:34:51.617430,16.428676,4.077782,1,0,1.268979,0,1381.118915,1056.249547
988,2023-12-31 10:34:51.617429,32.067512,2.734454,1,0,0.762317,0,1358.492007,1381.118915
989,2024-01-01 10:34:51.617427,31.938203,13.883486,0,0,1.153301,0,994.40954,1358.492007


In [22]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mlflow.set_tracking_uri(
    "http://127.0.0.1:5000"
)

apple_experiment = mlflow.set_experiment(
    "Apple_models"
)

run_name = "apples_rf_test"

artifact_path = "rf_apples"

In [23]:
X = data.drop(columns=["date", "demand"])
y = data["demand"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "random_state": 888,
    "bootstrap": True,
    "oob_score": False
}

rf = RandomForestRegressor(**params)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)


metrics = {
    "mae": mae,
    "mse": mse,
    "rmse": rmse,
    "r2": r2
}

with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(sk_model=rf, input_example=X_test, artifact_path=artifact_path)

  input_schema = _infer_schema(input_example)


In [24]:
import mlflow
logged_model = 'runs:/701fd4b8f6a94ae49b7ac2d388d0ea73/rf_apples'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(X_test)

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

array([1003.06051606, 1159.23744409, 1123.87850715,  933.60443752,
        981.25466328,  891.51446268, 1205.59183733,  899.0970115 ,
        904.07670726,  972.26649728,  915.05556051,  965.23961993,
        914.60674174, 1183.3265168 , 1213.5100646 , 1195.98471348,
       1424.73295891, 1294.00135963,  927.94555513,  888.0956347 ,
       1192.70612541, 1090.48970766, 1418.23414031, 1003.74168417,
       1317.51857866,  973.67255519,  976.55393516,  904.48213165,
       1111.89702514,  905.11652058,  919.53748683,  917.06105779,
        986.30750966,  983.2569026 , 1142.8153535 ,  903.87318167,
       1201.01447681,  968.59496694, 1102.85011558,  977.2739449 ,
        926.32374533,  894.65334352,  917.99336955, 1206.8438304 ,
        988.84410276,  970.15186779, 1537.9792124 ,  902.19580338,
        968.50814657,  912.94541841, 1091.84514314,  994.75581995,
       1169.58706336, 1084.78866513,  973.03533708,  902.83878319,
       1530.66280132, 1161.10439112, 1106.07852475, 1168.27802

In [25]:
# create another run

run_name = "apples_rf_test_2"

artifact_path = "rf_apples"

X = data.drop(columns=["date", "demand"])

y = data["demand"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

params = {
    "n_estimators": 200,
    "max_depth": 8,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "random_state": 888,
    "bootstrap": True,
    "oob_score": False
}

rf = RandomForestRegressor(**params)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

r2 = r2_score(y_test, y_pred)

metrics = {
    "mae": mae,
    "mse": mse,
    "rmse": rmse,
    "r2": r2
}

with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_params(params)
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(sk_model=rf, input_example=X_test, artifact_path=artifact_path)

  input_schema = _infer_schema(input_example)
