In [None]:
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional

import datarobot as dr
from dotenv import load_dotenv

# The notebook should be executed from the project root directory.
# `_correct_path` acts as a run-once flag: re-running this cell in the same
# kernel must not climb another directory level.
if "_correct_path" not in locals():
    os.chdir("..")
    sys.path.append(".")
    print(f"changed dir to {Path('.').resolve()}")
    _correct_path = True
# Load variables from a .env file into the environment before the client is built.
load_dotenv()
client = dr.Client()

In [None]:
from datarobotx.idp.use_cases import get_or_create_use_case

from infra.settings_main import use_case_args

# An explicitly configured use case takes precedence; otherwise resolve one
# by name through get_or_create_use_case.
use_case_id = os.environ.get("DATAROBOT_DEFAULT_USE_CASE")
if use_case_id is None:
    use_case_id = get_or_create_use_case(
        endpoint=client.endpoint,
        token=client.token,
        name=use_case_args.resource_name,
        description=use_case_args.description,
    )

# Data Ingest and Preparation

In [None]:
import pandas as pd

from infra.settings_datasets import training_dataset

# Default ingest: read the training CSV referenced by the dataset settings.
# Swap in your own ingest and/or preparation logic here as needed.
df = pd.read_csv(filepath_or_buffer=training_dataset.file_path)

In [None]:
from datarobotx.idp.datasets import get_or_create_dataset_from_df

print("Uploading training data to AI Catalog...")
# Register the prepared frame under the configured dataset name and attach it
# to the use case resolved earlier in the notebook.
training_dataset_id = get_or_create_dataset_from_df(
    name=training_dataset.resource_name,
    data_frame=df,
    use_cases=use_case_id,
    endpoint=client.endpoint,
    token=client.token,
)

# Model Training

In [None]:
from forecastic.schema import FeatureSettingConfig
from infra.common.schema import (
    AdvancedOptionsArgs,
    AnalyzeAndModelArgs,
    AutopilotRunArgs,
    CalendarArgs,
    DatetimePartitioningArgs,
)
from infra.settings_main import project_name

# Name under which the recommended model is registered later in the notebook.
registered_model_name = f"Forecastic Registered Model [{project_name}]"

# Calendar covering the date span passed to the calendar helper.
calendar_args = CalendarArgs(
    name=f"Calendar [{project_name}]",
    country_code="US",
    start_date="2012-01-01",
    end_date="2022-01-01",
)

# Full Autopilot configuration: modeling options, time-series partitioning and
# the features flagged as known in advance.
autopilotrun_args = AutopilotRunArgs(
    name=f"Forecast Assistant Project [{project_name}]",
    advanced_options_config=AdvancedOptionsArgs(seed=42),
    analyze_and_model_config=AnalyzeAndModelArgs(
        target="Sales",
        metric="RMSE",
        mode=dr.enums.AUTOPILOT_MODE.QUICK,
        worker_count=-1,
    ),
    datetime_partitioning_config=DatetimePartitioningArgs(
        use_time_series=True,
        datetime_partition_column="Date",
        multiseries_id_columns=["Store"],
        feature_derivation_window_start=-35,
        feature_derivation_window_end=0,
        forecast_window_start=1,
        forecast_window_end=30,
    ),
    feature_settings_config=[
        FeatureSettingConfig(feature_name=known_feature, known_in_advance=True)
        for known_feature in ("Store_Size", "Marketing", "TouristEvent")
    ],
)

In [None]:
from datarobotx.idp.autopilot import get_or_create_autopilot_run
from datarobotx.idp.calendars import get_or_create_calendar_dataset_from_country_code
from datarobotx.idp.registered_model_versions import (
    get_or_create_registered_leaderboard_model_version,
)

# Resolve the calendar dataset described by `calendar_args`.
calendar_id = get_or_create_calendar_dataset_from_country_code(
    endpoint=client.endpoint, token=client.token, **calendar_args.model_dump()
)

print("Running Autopilot...")
project_id = get_or_create_autopilot_run(
    endpoint=client.endpoint,
    token=client.token,
    calendar_id=calendar_id,
    dataset_id=training_dataset_id,
    use_case=use_case_id,
    **autopilotrun_args.model_dump(),
)

# Fail with a readable message instead of an AttributeError on `.model_id`
# when the project has no recommended model.
recommendation = dr.ModelRecommendation.get(project_id)
if recommendation is None:
    raise ValueError(f"No recommended model found for project {project_id}")
model_id = recommendation.model_id

# This message is printed before the registration call, so it uses present tense.
print("Registering recommended model...")
registered_model_version_id = get_or_create_registered_leaderboard_model_version(
    endpoint=client.endpoint,
    token=client.token,
    model_id=model_id,
    registered_model_name=registered_model_name,
    compute_all_ts_intervals=True,
)

# Generate modeling artifacts needed for the app

In [None]:
from forecastic.schema import WhatIfFeature


def get_what_if_features(
    project_id: str,
    model_id: str,
    feature_settings_config: Optional[List[FeatureSettingConfig]] = None,
) -> List[WhatIfFeature]:
    """Return the features to expose in the app for what-if analysis.

    A feature is returned only when it is flagged known in advance, is among
    the features used by the model, and is typed "Numeric" or "Categorical"
    in the project dataset. Categorical features additionally carry their
    distinct non-missing values as selectable options.

    Parameters
    ----------
    project_id : str
        ID of the project whose dataset supplies feature types and values.
    model_id : str
        ID of the model whose used features restrict the result.
    feature_settings_config : Optional[List[FeatureSettingConfig]]
        Candidate feature settings; ``None`` or an empty list yields ``[]``.

    Returns
    -------
    List[WhatIfFeature]
        One entry per qualifying feature, in the order given.

    Raises
    ------
    ValueError
        If the project has no dataset.
    """

    if not feature_settings_config:
        return []

    project = dr.Project.get(project_id)  # type: ignore[attr-defined]
    model = dr.Model.get(project=project_id, model_id=model_id)  # type: ignore[attr-defined]
    dataset = project.get_dataset()
    if dataset is None:
        raise ValueError("Dataset not found")
    model_features = set(model.get_features_used())
    feature_types = dataset.get_all_features()

    numerics = {i.name for i in feature_types if i.feature_type == "Numeric"}
    categoricals = {i.name for i in feature_types if i.feature_type == "Categorical"}
    allowed_features = numerics | categoricals

    # The dataset is only downloaded once a categorical feature actually needs
    # its values; numeric-only configurations skip the download entirely.
    dataframe = None

    whatif_features = []
    for feature in feature_settings_config:
        feature_name = feature.feature_name
        if not (
            feature.known_in_advance
            and feature_name in model_features
            and feature_name in allowed_features
        ):
            continue

        append_feature = feature.model_dump(mode="json")
        if feature_name in categoricals:
            if dataframe is None:
                dataframe = dataset.get_as_dataframe()
            # Drop missing values so NaN is never offered as an option, and
            # convert numpy scalars to native Python values.
            append_feature["values"] = (
                dataframe[feature_name].dropna().unique().tolist()
            )

        whatif_features.append(WhatIfFeature(**append_feature))
    return whatif_features

In [None]:
def get_most_important_features(
    project_id: str,
    model_id: str,
    minimum_importance: float = 0.03,
    max_wait: int = 600,
) -> List[Dict[str, Any]]:
    """Return the model's features whose normalized impact exceeds a threshold.

    Parameters
    ----------
    project_id : str
        ID of the project that owns the model.
    model_id : str
        ID of the model to compute or fetch feature impact for.
    minimum_importance : float
        Exclusive lower bound on ``impactNormalized``; features at or below
        this value are dropped.
    max_wait : int
        The maximum time to wait for the feature impact to be calculated

    Returns
    -------
    List[Dict[str, Any]]
        Dicts with ``featureName`` and ``impactNormalized`` keys, in the order
        the feature impact records were returned.
    """

    model = dr.Model.get(model_id=model_id, project=project_id)  # type: ignore[attr-defined]
    impact_records = model.get_or_request_feature_impact(max_wait=max_wait)

    selected: List[Dict[str, Any]] = []
    for record in impact_records:
        if record["impactNormalized"] > minimum_importance:
            selected.append(
                {
                    "featureName": record["featureName"],
                    "impactNormalized": record["impactNormalized"],
                }
            )
    return selected

In [None]:
def get_timestep_settings(
    project_id: str,
    datetime_partition_column: str,
) -> Dict[str, Any]:
    """Fetch the time-step settings detected for a time series project.

    Requests the multiseries properties of the datetime partition column and
    keeps the first detected entry, minus its ``multiseriesIdColumns`` key.

    Parameters
    ----------
    project_id : str
        ID of the time series project.
    datetime_partition_column : str
        Name of the datetime partition feature to query.

    Returns
    -------
    Dict[str, Any]
        Time unit and step
    """
    endpoint = (
        f"projects/{project_id}/features/{datetime_partition_column}"
        "/multiseriesProperties"
    )
    payload = client.get(endpoint).json()
    first_detected: Dict[str, Any] = payload["detectedMultiseriesIdColumns"][0]
    # pop() without a default behaves like `del`: removes the key in place and
    # raises KeyError if it is absent.
    first_detected.pop("multiseriesIdColumns")
    return first_detected

In [None]:
print("Running feature impact...")
# The 0.05 threshold here was flagged for cleanup by the original author.
important_features = get_most_important_features(
    model_id=model_id,
    project_id=project_id,
    minimum_importance=0.05,
)

# Export settings for provisioning the app and other dependent resources

In [None]:
import textwrap

from forecastic.schema import CategoryFilter, StaticAppSettings

# Static presentation settings for the app: page copy, filterable columns and
# the prompt used to generate the headline.
static_app_settings = StaticAppSettings(
    page_title="Multistore Sales Forecast Interpreter",
    page_description="This application forecasts the sale revenue of a national retailer. The forecast can be focused by region, market, or store.",
    graph_y_axis="Sales ($)",
    lower_bound_forecast_at_0=True,
    # Each filterable column is displayed under its own column name.
    filterable_categories=[
        CategoryFilter(column_name=column, display_name=column)
        for column in ("Store", "Region", "Market")
    ],
    headline_prompt=textwrap.dedent(
        """\
        You are a data analyst and your job is to explain to non-technical executive business leaders what the data suggests.
        Executive leadership will provide a sales forecast and you will interpret it and summarize the outlook, highlighting key insights.
        Your response should be only 1 sentence long, not very wordy. It should be like a news headline. Do not put quotation marks around it.
        Your response, while insightful, should speak to the general direction of the forecast.
"""
    ),
)

In [None]:
import yaml

from forecastic.schema import AppSettings
from infra.settings_main import model_training_output_file

print("Capturing settings required to deploy the frontend...")
# `search` may return more than the exact name, so the name is compared
# explicitly. A None default replaces the bare StopIteration with a
# descriptive error when nothing matches.
registered_model = next(
    (
        rm
        for rm in dr.RegisteredModel.list(search=registered_model_name)
        if rm.name == registered_model_name
    ),
    None,
)
if registered_model is None:
    raise LookupError(f"Registered model {registered_model_name!r} not found")

# Gather everything the frontend needs into one settings object.
app_settings = AppSettings.from_registered_model_version(
    target=autopilotrun_args.analyze_and_model_config.target,
    registered_model_id=registered_model.id,
    registered_model_version_id=registered_model_version_id,
    what_if_features=get_what_if_features(
        project_id=project_id,
        model_id=model_id,
        feature_settings_config=autopilotrun_args.feature_settings_config,
    ),
    important_features=important_features,
    prediction_interval=80,
    static_app_settings=static_app_settings,
)

# Persist the settings as YAML at the path configured in infra.settings_main.
with open(model_training_output_file, "w") as f:
    yaml.dump(app_settings.model_dump(), f)