In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import logging

from lmkgroup_ds_utils.db.connector import DB

In [None]:
class args:
    # Run settings grouped on one object so cells can read them as
    # `args.<name>`.
    company = 'RT'              # key used to select an entry of company_configs
    db_pw = ''                  # forwarded to DB(password=...)
    local = True                # forwarded to DB(local=...)
    write_to_db = False         # not read anywhere in the visible notebook
    save_output_locally = True


# Unpack the settings that later cells refer to by bare name.
company, local = args.company, args.local
save_output_locally, db_pw = args.save_output_locally, args.db_pw

In [None]:
from paths import CONFIG_DIR, SQL_DIR
from utils import read_yaml
# Load the database settings and pull out the parameters of the "read"
# connection.
db_configs = read_yaml(file_name="db", directory=CONFIG_DIR)
read_db_config = db_configs["read"]
read_db_name = read_db_config["db_name"]
read_db_env = read_db_config["env"]
read_db_settings_path = read_db_config["db_settings_path"]

# Load the per-company settings and keep only the selected company's entry.
company_configs = read_yaml(file_name="company_configs", directory=CONFIG_DIR)
company_config = company_configs[company]

In [None]:
# Open the read connection with the parameters loaded from the "db" config.
# NOTE(review): no logging configuration is visible in this notebook, so this
# INFO message may not be displayed - confirm the logging setup.
logging.info("Connecting to db...")
read_db_kwargs = dict(
    local=local,
    db_name=read_db_name,
    db_settings_path=read_db_settings_path,
    password=db_pw,
    env=read_db_env,
)
read_db = DB(**read_db_kwargs)

In [None]:
from orders_forecasting.pipeline import get_data_from_db
# Fetch the three input frames used by the rest of the notebook: the order
# history plus the total and per-dish estimations.
order_frames = get_data_from_db(read_db=read_db, company_config=company_config)
df_order_history, df_estimations_total, df_estimations_dishes = order_frames

In [None]:
# Column plotted in the next cell.
# NOTE: `target` is reassigned to "num_total_orders" further down (feature-list
# cell), so the estimation-model section sees that later value, not this one.
target = "num_dishes_orders"

In [None]:
import plotly.express as px
from paths import PROJECT_DIR

# One line per year of the weekly target, written out as a standalone HTML
# file in the project directory.
yearly_plot_path = f"{PROJECT_DIR}/year_order_{target}_{company}.html"
fig = px.line(df_order_history, x="week", y=target, color='year')
fig.write_html(yearly_plot_path)

# Baseline model

In [None]:
from orders_forecasting.helpers import get_year_week_from_date
# NOTE(review): `fetch_data_from_sql` is not imported anywhere in the visible
# notebook - confirm which module provides it and add the import, otherwise
# this cell raises NameError on a fresh kernel.
df_manual_forecast = fetch_data_from_sql(
    read_db=read_db,
    sql_name="manual_forecast",
    directory=SQL_DIR,
    company_id=company_config["company_id"],
)

# Order the runs chronologically, then tag every row with the calendar date of
# its run and the year-week value derived from that date.
df_manual_forecast = df_manual_forecast.sort_values(by="run_timestamp")
run_dates = df_manual_forecast["run_timestamp"].dt.date
df_manual_forecast["run_date"] = run_dates
df_manual_forecast["forecast_yyyyww"] = run_dates.apply(get_year_week_from_date)
df_manual_forecast.head()

In [None]:
from orders_forecasting.helpers import get_iso_week_numbers
import pandas as pd
# Build the week calendar between the two boundary dates.
calendar_start = pd.to_datetime("2021-01-01")
calendar_end = pd.to_datetime("2023-11-01")
df_iso_calendar = get_iso_week_numbers(start_date=calendar_start, end_date=calendar_end)

from orders_forecasting.helpers import create_date_from_year_week

# Attach a "forecast_date" column to the calendar (day_of_week=3) and keep it
# as the series of candidate forecast dates.
df_forecast_date = create_date_from_year_week(
    df=df_iso_calendar, date_column_name="forecast_date", day_of_week=3
)
forecast_days = df_forecast_date["forecast_date"]

In [None]:
from orders_forecasting.data import get_forecast_start, create_future_df
import pandas as pd
from datetime import datetime
from orders_forecasting.metrics import get_metrics
# Single forecast date used for the one-off baseline comparison below.
forecast_date = datetime(year=2023, month=1, day=10)
start_year, start_week = get_forecast_start(
    start_date=pd.to_datetime(forecast_date),
    cut_off_day=company_config["cut_off_day"],
)

df = df_order_history  # alias of the history frame, not a copy

# Validation rows: forecast start week onwards. Training rows: weeks whose
# first date is on or before the forecast date.
order_yyyyww = df["year"] * 100 + df["week"]
df_val = df[order_yyyyww >= (start_year * 100 + start_week)]
df_train = df[df["first_date_of_week"] <= forecast_date]

# Frame of the 11 weeks to forecast, starting at the forecast start week.
df_future = create_future_df(start_week=start_week, start_year=start_year, horizon=11)

In [None]:
# ISO year and ISO week number of the forecast date.
forecast_year, forecast_week = forecast_date.isocalendar()[:2]

In [None]:
from orders_forecasting.baseline import get_rolling_avg_projection
# Rolling-average baseline over the last 4 weeks of the training rows. The
# helper returns the prediction frame, the name of the projection column and a
# ratio (presumably the scaling factor it applied - confirm in baseline.py).
rolling_projection = get_rolling_avg_projection(
    df_past=df_train,
    df_future=df_future,
    target_col="num_total_orders",
    num_rolling_weeks=4,
)
df_pred, colname, ratio = rolling_projection

In [None]:
from orders_forecasting.baseline import get_week_projection
# Add one week-projection column to df_pred for every (years back, method)
# combination, in the same order as before: 1y diff, 1y ratio, 2y diff,
# 2y ratio. Each call receives the frame returned by the previous one.
projection_variants = [(1, "diff"), (1, "ratio"), (2, "diff"), (2, "ratio")]
projection_colnames = []
for num_past_year, method in projection_variants:
    df_pred, projection_colname = get_week_projection(
        df_past=df_train,
        df_future=df_pred,
        target_col="num_total_orders",
        num_past_year=num_past_year,
        method=method,
    )
    projection_colnames.append(projection_colname)

# Keep the individual names that the original cell exposed.
proj_colname1, proj_colname2, proj_colname3, proj_colname4 = projection_colnames

df_pred


In [None]:
# Manual forecasts produced in the forecast week. df_manual_forecast is sorted
# by run_timestamp, so keep="first" keeps the earliest run per target week.
forecast_yyyyww = forecast_year * 100 + forecast_week
df_manual_to_plot = (
    df_manual_forecast[df_manual_forecast["forecast_yyyyww"] == forecast_yyyyww]
    .drop_duplicates(subset=["year", "week"], keep="first")
    .assign(yyyyww=lambda frame: frame["year"] * 100 + frame["week"])
    .sort_values(by="yyyyww")
)

df_pred["yyyyww"] = df_pred["year"] * 100 + df_pred["week"]

# Actuals of the forecast start year. `.copy()` makes the new column land on
# an independent frame instead of a slice of `df` (SettingWithCopyWarning),
# and the key is computed from the filtered rows themselves.
df_actual = df[df["year"] == start_year].copy()
df_actual["yyyyww"] = df_actual["year"] * 100 + df_actual["week"]

# Last five training rows, shown next to the projections for context.
df_train_to_plot = df_train.tail(5).copy()
df_train_to_plot["yyyyww"] = df_train_to_plot["year"] * 100 + df_train_to_plot["week"]
df_train_to_plot.head()

In [None]:
# Rows of the year before the forecast start year, plotted for comparison.
df_prev_year = df.loc[df["year"] == start_year - 1]

In [None]:
import plotly.graph_objects as go

# numpy is imported here but not used by the visible cells.
import numpy as np

# Collect every line of the comparison chart as (x, y, legend name), all
# plotted against the week number: actuals, previous year, the baseline
# projections, the manual forecast and the tail of the training data.
projection_series = []
for colname in [
    'projection_ra_ratio_4_weeks',
    'projection_week_diff_1_year',
    'projection_week_ratio_1_year',
    'projection_week_diff_2_year',
]:
    projection_series.append((df_pred["week"], df_pred[colname], colname))

line_series = [
    (df_actual["week"], df_actual["num_total_orders"], 'actual'),
    (df_prev_year["week"], df_prev_year["num_total_orders"], 'previous year'),
    *projection_series,
    (df_manual_to_plot["week"], df_manual_to_plot["total_orders"], "manual forecast"),
    (df_train_to_plot["week"], df_train_to_plot["num_total_orders"], "train data"),
]

# Add the traces in the order listed above.
fig = go.Figure()
for x_values, y_values, trace_name in line_series:
    fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', name=trace_name))

fig.show()

In [None]:
from orders_forecasting.metrics import get_metrics
# Score the 4-week rolling-average projection against the actual order counts.
projection_metric_args = dict(
    df_pred=df_pred,
    df_actual=df,
    pred_col="projection_ra_ratio_4_weeks",
    actual_col="num_total_orders",
)
metrics_proj = get_metrics(**projection_metric_args)


In [None]:
from orders_forecasting.metrics import get_metrics
# Score the manual forecast against the same actuals, for comparison with the
# projection metrics.
manual_metric_args = dict(
    df_pred=df_manual_to_plot,
    df_actual=df,
    pred_col="total_orders",
    actual_col="num_total_orders",
)
metrics_manual = get_metrics(**manual_metric_args)


In [None]:
from orders_forecasting.helpers import get_iso_week_numbers
import pandas as pd
from orders_forecasting.metrics import get_metrics
# Backtest of the rolling-average baseline: one forecast per date of the 2023
# calendar, each scored against the actual order counts.
df_iso_calendar = get_iso_week_numbers(
    start_date=pd.to_datetime("2023-01-01"),
    end_date=pd.to_datetime("2023-11-01")
)

from orders_forecasting.helpers import create_date_from_year_week
df_forecast_date = create_date_from_year_week(
    df=df_iso_calendar,
    date_column_name="forecast_date",
    day_of_week=3
)

# Fix: iterate over the dates built in this cell. Previously the loop reused
# `forecast_days` from the earlier 2021-2023 calendar, so the calendar rebuilt
# above was never used.
forecast_days = df_forecast_date["forecast_date"]

# Loop-invariant alias of the history frame, hoisted out of the loop.
df = df_order_history

metrics_list = []
for forecast_date in forecast_days:
    start_year, start_week = get_forecast_start(
        start_date=pd.to_datetime(forecast_date),
        cut_off_day=company_config["cut_off_day"]
    )
    # Same split as in the single-date cell: validation from the forecast
    # start week onwards, training up to the forecast date.
    df_val = df[df["year"] * 100 + df["week"] >= (start_year * 100 + start_week)]
    df_train = df[df["first_date_of_week"] <= forecast_date]
    df_future = create_future_df(
        start_week=start_week,
        start_year=start_year,
        horizon=11
    )

    df_pred, colname, ratio = get_rolling_avg_projection(
        df_past=df_train,
        df_future=df_future,
        target_col="num_total_orders",
        num_rolling_weeks=4
    )
    metrics = get_metrics(
        df_pred=df_pred,
        df_actual=df,
        pred_col="projection_ra_ratio_4_weeks",
        actual_col="num_total_orders",
    )
    # Keep only the metrics frame; it is plotted per forecast date next.
    metrics_list.append(metrics["df_metrics"])


In [None]:
# One absolute-percentage-error line per backtested forecast date, labelled
# with that date.
fig = go.Figure()
for forecast_date, metrics in zip(forecast_days, metrics_list):
    error_trace = go.Scatter(
        x=metrics["week"],
        y=metrics["abs_error_pct"],
        mode='lines',
        name=str(forecast_date),
    )
    fig.add_trace(error_trace)

In [None]:
from datetime import datetime
# Forecast date for the lag-feature model experiment.
forecast_date = datetime(
    year=2023,
    month=11,
    day=1
)

from orders_forecasting.helpers import get_year_week_from_date
yyyyww_forecast = get_year_week_from_date(a_date=forecast_date)

# The weeks that are actually known at the forecast date. `.copy()` makes
# df_past an independent frame, so the column assignments on it (here and in
# the feature cell further down) do not write into a slice of
# df_order_history (SettingWithCopyWarning).
history_yyyyww = df_order_history["year"] * 100 + df_order_history["week"]
df_past = df_order_history[history_yyyyww <= yyyyww_forecast].copy()
df_past["type"] = "train"
last_known_yyyyww = (df_past["year"] * 100 + df_past["week"]).max()

In [None]:
from orders_forecasting.data import get_all_missing_future_weeks
from orders_forecasting.helpers import create_date_from_year_week

# Weeks that still have to be forecast, given the last known week.
df_future = get_all_missing_future_weeks(
    forecast_date=forecast_date,
    last_known_yyyyww=last_known_yyyyww,
    forecast_horizon=11,
    cut_off_day=company_config["cut_off_day"],
)
df_future["company_id"] = company_config["company_id"]
df_future = create_date_from_year_week(df_future, date_column_name="first_date_of_week")

# Give the future rows the same columns, in the same order, as the history;
# columns unknown for the future are filled with None.
for col in df_past.columns:
    if col in df_future:
        continue
    df_future[col] = None
df_future = df_future[df_past.columns]

# Calendar feature, derived from the first date of each future week.
df_future["month"] = df_future["first_date_of_week"].dt.month

In [None]:
from orders_forecasting.data import add_lag_features
from orders_forecasting.data import add_moving_avg_features
from orders_forecasting.data import get_holiday_df

# Calendar feature on the known weeks.
df_past["month"] = df_past["first_date_of_week"].dt.month

# Lag features first, then moving averages on the resulting frame. From here
# on `df` is this feature frame (earlier it aliased df_order_history).
df = add_moving_avg_features(
    df=add_lag_features(
        df=df_past,
        lag_list=[2, 4, 8, 12, 52],
        origin_col="num_total_orders",
    ),
    window_list=[2, 4, 8, 12, 52],
    origin_col="num_total_orders",
)

# Seasonality per week number: mean of the target minus its 52-week moving
# average, taken over all rows sharing that week number.
detrended = df["num_total_orders"] - df["moving_avg_52"]
df["detrended"] = detrended
df["seasonality"] = df.groupby("week")["detrended"].transform("mean")

In [None]:
# Holiday calendar for Norway spanning the years present in `df`, without the
# "num_holidays" column.
first_year, last_year = df["year"].min(), df["year"].max()
df_holiday_calendar = get_holiday_df(
    country="Norway", year_min=first_year, year_max=last_year
).drop(columns="num_holidays")

In [None]:
# Attach the holiday columns by (year, week); weeks missing from the calendar
# get 0 instead of NaN.
df = df.merge(df_holiday_calendar, how="left", on=["year", "week"])
holiday_cols = df_holiday_calendar.columns
df.loc[:, holiday_cols] = df[holiday_cols].fillna(0)

In [None]:
# Mark every row of the feature frame as training data; the evaluation cell
# further down selects forecast rows with `type != "train"`.
df["type"] = "train"

In [None]:
# Horizons used as model inputs. The 8-week lag and moving average are
# computed above but are not part of this list.
feature_horizons = (2, 4, 12, 52)
lag_features = (
    [f"lag_{weeks}" for weeks in feature_horizons]
    + [f"moving_avg_{weeks}" for weeks in feature_horizons]
    + ["seasonality"]
)
calendar_features = ["month"]
# All holiday-calendar columns are used, including its "year" and "week"
# merge keys.
holiday_features = list(df_holiday_calendar.columns)

train_features = holiday_features + lag_features + calendar_features
# NOTE: this replaces the "num_dishes_orders" target set near the top.
target = "num_total_orders"

In [None]:
# NOTE(review): `automl` is only created in the FLAML cell much further down,
# so this cell fails on a fresh kernel run from top to bottom - confirm the
# intended cell order.
automl.best_loss

In [None]:
import pandas as pd
# Feature importances of the fitted model, most important first, as a bar
# chart.
# NOTE(review): depends on `automl`, which is only created further down.
df_feature_importance = pd.DataFrame(
    {
        "features": train_features,
        "feature_importance": automl.feature_importances_,
    }
).sort_values(by="feature_importance", ascending=False)
df_feature_importance.plot.bar(x="features", y="feature_importance")

In [None]:
# Inspect the columns of the future frame; the next cell uses them to select
# and trim the columns of df_new.
df_future.columns

In [None]:
from orders_forecasting.data import add_lag_features
from orders_forecasting.data import add_moving_avg_features
# Recursive forecast: future weeks are appended one at a time, the features
# are recomputed on the whole frame, the model predicts the newly added row,
# and the prediction is written into the target column before the next week
# is appended.
# NOTE(review): `automl` is only created in the FLAML cell further down.

# One seasonality value per week number, taken from the training frame.
df_seasonality = df[["week", "seasonality"]].drop_duplicates()
# Start from the known rows, restricted to the future frame's columns.
df_new = df[df_future.columns]
for i in list(df_future.index):
    # Single future row as a one-row frame, appended below the known rows.
    df_tmp = pd.DataFrame(df_future.loc[i, :]).transpose()
    df_new = pd.concat([df_new, df_tmp], ignore_index=True)

    # Recompute lag and moving-average features, now including the new row.
    df_new = add_lag_features(
        df=df_new,
        lag_list=[2, 4, 8, 12, 52],
        origin_col="num_total_orders"
    )

    df_new = add_moving_avg_features(
        df=df_new,
        window_list=[2, 4, 8, 12, 52],
        origin_col="num_total_orders"
    )

    # Re-attach the per-week seasonality and the holiday columns.
    df_new = df_new.merge(
        df_seasonality,
        on="week",
        how="left"
    )

    df_new = df_new.merge(
        df_holiday_calendar,
        on=["year", "week"],
        how="left"
    )

    # Fill missing holiday values with 0. The [2:] skips the first two
    # calendar columns (presumably the year/week merge keys - confirm).
    df_new[df_holiday_calendar.columns[2:]] = df_new[df_holiday_calendar.columns[2:]].fillna(0)


    # Predict the row that was just appended (the last one).
    X_test = df_new.tail(1)[train_features]
    y_pred = automl.predict(X_test)

    # Store the prediction as that row's target value, then drop the derived
    # feature columns so the next iteration starts from the base columns.
    df_new.loc[df_new.index[-1], target] = y_pred[0]
    df_new = df_new[df_future.columns]


In [None]:
# Forecast rows (everything not marked "train"), with the predicted target
# renamed to "pred" and joined to the actuals of the same (year, week). The
# inner join keeps only weeks that have an actual value.
is_forecast_row = df_new["type"] != "train"
df_test = (
    df_new.loc[is_forecast_row, ["year", "week", "type", "num_total_orders"]]
    .rename(columns={"num_total_orders": "pred"})
    .merge(
        df_order_history[["year", "week", target]],
        on=["year", "week"],
        how="inner",
    )
)

# Absolute error as a fraction of the actual value.
df_test["abs_error_pct"] = (df_test["pred"] - df_test[target]).abs() / df_test[target]

In [None]:
# Display the per-week predictions, actuals and absolute percentage errors.
df_test

In [None]:
# Mean absolute percentage error over the forecast weeks that have actuals.
df_test["abs_error_pct"].mean()

In [None]:
import plotly.graph_objects as go

# Plot the last 20 rows of the training frame against the week number.
recent_train = df_train.tail(20)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=recent_train["week"],
        y=recent_train["num_total_orders"],
        mode='lines',
        name='train',
    )
)
fig.show()

In [None]:
import plotly.graph_objects as go

# Predicted versus actual order counts for the forecast weeks. The figure is
# not shown here; the next cell writes it to disk.
fig = go.Figure()
for value_col, trace_name in (("pred", 'pred'), ("num_total_orders", 'actual')):
    fig.add_trace(
        go.Scatter(
            x=df_test["week"],
            y=df_test[value_col],
            mode='lines',
            name=trace_name,
        )
    )

In [None]:
# Save the prediction-vs-actual figure built in the previous cell as
# pred.html in the current working directory.
fig.write_html("./pred.html")

# Estimation model

In [None]:
import pandas as pd
# Prediction date passed to process_and_create_dataset below. Swap the two
# assignments to pin it to a fixed date.
# NOTE(review): with None, the date comparisons in the FLAML cell further down
# cannot work - confirm which value those cells expect.
# prediction_date = pd.to_datetime("2023-10-01")
prediction_date = None

In [None]:
from orders_forecasting.pipeline import process_and_create_dataset
# Estimation augmentation is switched on for company "RT" only.
is_augment_estimation = company == "RT"

# Build the train / test / holdout splits for the estimation model.
# NOTE: `target` is whatever was assigned last ("num_total_orders" in the
# feature-list cell above), not the value set near the top of the notebook.
dataset_splits = process_and_create_dataset(
    df_order_history=df_order_history,
    df_estimations_total=df_estimations_total,
    df_estimations_dishes=df_estimations_dishes,
    is_augment_estimation=is_augment_estimation,
    company_config=company_config,
    prediction_date=prediction_date,
    target_col=target,
    outlier_yyyywws=company_config["outlier_weeks"],
)
df_train, df_test, df_holdout = dataset_splits

In [None]:
# Display the holdout split returned by process_and_create_dataset.
df_holdout

In [None]:
# Display the test split returned by process_and_create_dataset.
df_test

## FLAML model - probably not going to be used

In [None]:
# Split by estimation date: rows estimated on the prediction date form the
# test set, rows estimated earlier and with a cut-off before it form the
# training set.
# NOTE(review): in the visible notebook `df` is the lag-feature frame of the
# baseline section and `prediction_date` is None at this point, so this cell
# appears to rely on state from cells that are not shown - confirm before
# running. It also overwrites the df_train / df_test splits created by
# process_and_create_dataset.
df_test = df[df["estimation_date"] == prediction_date]
df = df.sort_values(by=["estimation_date", "year", "week"])
df_train = df[df["estimation_date"] < prediction_date]
df_train = df_train[df_train["cut_off_date"]< prediction_date]

# Model inputs and target for the FLAML fit below.
X_train = df_train[train_features]
y_train = df_train[target]

In [None]:
from flaml import AutoML

# AutoML search: regression scored on MAPE, evaluated on a 30% holdout split,
# limited to three gradient-boosting estimators.
automl = AutoML()
automl_settings = dict(
    time_budget=120,  # in seconds
    metric='mape',
    task='regression',
    eval_method="holdout",
    split_ratio=0.3,
    estimator_list=['lgbm', 'xgboost', 'xgb_limitdepth'],
)

# Fit on the labelled training data.
automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

# In-sample predictions, followed by the best estimator found.
print(automl.predict(X_train))
print(automl.model.estimator)

In [None]:
# Predict the test rows with the fitted FLAML model; the actual values are
# kept alongside for the error computation.
X_test = df_test[train_features]
y_flaml = automl.predict(X_test)
y_test = df_test[target]

In [None]:
# Add the prediction, its signed error and its relative error to the test
# frame. `assign` returns a new frame, so the columns are not written into a
# slice of `df` (df_test comes from a boolean filter, which triggered
# SettingWithCopyWarning with item assignment). Later keyword arguments can
# use the columns created by earlier ones.
df_test = df_test.assign(
    pred=y_flaml,
    error=lambda frame: frame["pred"] - frame[target],
    error_pct=lambda frame: frame["error"] / frame[target],
)
df_test.sort_values(by=["year", "week"])

In [None]:
# Summary statistics of the absolute relative error on the test rows.
df_test["error_pct"].abs().describe()

In [None]:
# Feature importances of the fitted FLAML model, most important first.
# Fix: the chained `.plot.bar(...)` used to be part of the assignment, which
# bound `df_feature_importance` to the matplotlib Axes instead of the
# DataFrame. The table and the plot are now separate statements.
df_feature_importance = pd.DataFrame(
    {
        "features": train_features,
        "feature_importance": automl.feature_importances_
    }
).sort_values(by="feature_importance", ascending=False)

df_feature_importance.plot.bar(
    x="features",
    y="feature_importance"
)

In [None]:
# Display the feature list used by the FLAML model above.
train_features

## PyCaret

In [None]:
# Inspect which columns the training frame offers before picking features.
df_train.columns

In [None]:
# Training settings from the "train" config: per-company feature lists, the
# candidate model list and how many models to keep.
train_params = read_yaml(directory=CONFIG_DIR, file_name="train")

In [None]:
from orders_forecasting.model import _convert_col_to_numeric
from pycaret.regression import setup
# Features configured for this company, plus the target, are the only columns
# handed to PyCaret.
train_features = train_params["train_features"][company]
cols_to_include = train_features + [target]

# Convert those columns to numeric in both splits before modelling.
df_train, df_holdout = [
    _convert_col_to_numeric(df=split, relevant_columns=cols_to_include)
    for split in (df_train, df_holdout)
]
df_train_relevant_cols_only = df_train[cols_to_include]
df_holdout_relevant_cols_only = df_holdout[cols_to_include]

# Regression experiment with the holdout split supplied as explicit test data.
experiment_reg = setup(
    data=df_train_relevant_cols_only,
    test_data=df_holdout_relevant_cols_only,
    target=target,
)

# Compare the configured candidate models and keep the best `n_select`.
best_models = experiment_reg.compare_models(
    include=train_params["model_list"],
    n_select=train_params["n_select"],
)

In [None]:
# Pull the experiment's most recent results table (presumably the
# compare_models score grid - confirm) and display it.
df_train_info = experiment_reg.pull()
df_train_info

### Feature importance

In [None]:
# Feature importances of the best model, most important first.
# Fix: the chained `.plot.bar(...)` used to be part of the assignment, which
# bound `df_feature_importance` to the matplotlib Axes instead of the
# DataFrame. The table and the plot are now separate statements.
# NOTE(review): assumes the selected model exposes `feature_importances_`.
df_feature_importance = pd.DataFrame(
    {
        "features": train_features,
        "feature_importance": best_models[0].feature_importances_
    }
).sort_values(by="feature_importance", ascending=False)

df_feature_importance.plot.bar(
    x="features",
    y="feature_importance"
)

In [None]:
# Feature importances of the second-best model, most important first.
# Fix: the chained `.plot.bar(...)` used to be part of the assignment, which
# bound `df_feature_importance` to the matplotlib Axes instead of the
# DataFrame. The table and the plot are now separate statements.
# NOTE(review): assumes n_select >= 2 and that the selected model exposes
# `feature_importances_`.
df_feature_importance = pd.DataFrame(
    {
        "features": train_features,
        "feature_importance": best_models[1].feature_importances_
    }
).sort_values(by="feature_importance", ascending=False)

df_feature_importance.plot.bar(
    x="features",
    y="feature_importance"
)

## Holdout metrics

In [None]:
from orders_forecasting.metrics import get_metrics
# Blend the selected models into one estimator and predict with it.
# predict_model is called without explicit data, so it scores the test data
# given to setup (the holdout split).
top_models_blend = experiment_reg.blend_models(estimator_list=best_models)
df_holdout_pred = experiment_reg.predict_model(top_models_blend)

# Metrics of the blended prediction against the holdout actuals.
holdout_prediction = df_holdout_pred[["year", "week", "prediction_label"]]
hold_out_metrics = get_metrics(
    df_pred=holdout_prediction,
    df_actual=df_holdout,
    pred_col="prediction_label",
    actual_col=target,
)

In [None]:
# Same metrics for the baseline column "retention_projection_total" on the
# holdout weeks, for comparison with the blended model.
# NOTE: `baseline_metrics` is assigned again at the end of the notebook, this
# time for the test split.
baseline_col = "retention_projection_total"
baseline_metrics = get_metrics(
    df_pred=df_holdout_pred[["year", "week", baseline_col]],
    df_actual=df_holdout[["year", "week", target]],
    pred_col=baseline_col,
    actual_col=target,
)

In [None]:
# Display the baseline metrics on the holdout split.
baseline_metrics

In [None]:
# Display the blended model's metrics on the holdout split.
hold_out_metrics

In [None]:
from orders_forecasting.model import make_predictions, _convert_col_to_numeric
# Predict the test split with the blended model and store the result next to
# the inputs, then order the rows chronologically.
test_predictions = make_predictions(
    trained_model=top_models_blend,
    df_test=df_test[train_features],
    feature_list=train_params["train_features"][company],
)
df_test["pred"] = test_predictions
df_test = df_test.sort_values(by=["year", "week"])

In [None]:
from orders_forecasting.visualisation import plot_train_test_pred
from paths import PROJECT_DIR
# Train / holdout / test overview; one holdout row per (year, week, target).
holdout_actuals = df_holdout[["year", "week", target]].drop_duplicates()
fig = plot_train_test_pred(
    df_train=df_train,
    df_val=holdout_actuals,
    df_test=df_test,
    target=target,
)

# The file name carries the first test week (yyyyww) and the number of
# distinct test weeks.
test_yyyyww = df_test["year"] * 100 + df_test["week"]
test_start_wk = test_yyyyww.min()
n_week = test_yyyyww.nunique()
html_file_name = f"train_test_pred_{test_start_wk}_{n_week}_wks.html"
html_file_dir = PROJECT_DIR
fig.write_html(f"{html_file_dir}/{html_file_name}")

In [None]:
from orders_forecasting.metrics import get_metrics

# Metrics of the model prediction on the test split. The actuals won't be
# available in prod, but are available when training.
week_keys = ["year", "week"]
test_metrics = get_metrics(
    df_pred=df_test[week_keys + ["pred"]],
    df_actual=df_test[week_keys + [target]],
    pred_col="pred",
    actual_col=target,
)

In [None]:
# Baseline metrics on the test split: the "retention_projection_total" column
# scored against the same actuals. This overwrites the holdout baseline
# metrics computed earlier.
baseline_col = "retention_projection_total"
baseline_metrics = get_metrics(
    df_pred=df_test[["year", "week", baseline_col]],
    df_actual=df_test[["year", "week", target]],
    pred_col=baseline_col,
    actual_col=target,
)

In [None]:
# Display the metrics frame stored under "df_metrics" for the test split.
test_metrics["df_metrics"]