# Test Greykite

### Imports

In [114]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px

from greykite.algo.changepoint.adalasso.changepoint_detector import ChangepointDetector
from greykite.algo.forecast.silverkite.constants.silverkite_holiday import SilverkiteHoliday
from greykite.algo.forecast.silverkite.constants.silverkite_seasonality import SilverkiteSeasonalityEnum
from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import cols_interact
from greykite.common import constants as cst
from greykite.common.constants import GrowthColEnum
from greykite.common.features.timeseries_features import build_time_features_df
from greykite.common.features.timeseries_features import convert_date_to_continuous_time
from greykite.detection.detector.config import ADConfig
from greykite.detection.detector.data import DetectorData
from greykite.detection.detector.greykite import GreykiteDetector
from greykite.framework.benchmark.data_loader_ts import DataLoaderTS
from greykite.framework.input.univariate_time_series import UnivariateTimeSeries
from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.forecaster import Forecaster
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.utils.result_summary import summarize_grid_search_results

# Kept after the imports, as in the original cell: silences every warning
# raised from this point on.
warnings.filterwarnings('ignore')

In [115]:
# Load the raw sales export (the `sin_2025` file).
retail_csv = pd.read_csv('./data_productos_ventas_3_sin_2025.csv')

# `read_csv` already returns a DataFrame, so no extra `pd.DataFrame(...)` wrap
# is needed. `assign` returns a new frame with the sale date parsed to
# datetimes, leaving the raw `retail_csv` object untouched.
df = retail_csv.assign(fecha_venta=pd.to_datetime(retail_csv['fecha_venta']))
df

Unnamed: 0,fecha_venta,sucursal,codigo_articulo,producto,precio_venta,cantidad
0,2022-03-15,BARQUISIMETO,1973,MARGARINA 500GR MAVESA.,8.86,136
1,2022-03-15,CAGUA,1973,MARGARINA 500GR MAVESA.,8.86,160
2,2022-03-16,BARQUISIMETO,386,HUEVOS A GRANEL,19.60,109
3,2022-03-16,GUARENAS,386,HUEVOS A GRANEL,19.60,331
4,2022-03-16,IPSFA,386,HUEVOS A GRANEL,19.60,694
...,...,...,...,...,...,...
32441,2024-12-31,GUARENAS,386,HUEVOS A GRANEL,270.04,346
32442,2024-12-31,LA CASCADA,386,HUEVOS A GRANEL,270.04,266
32443,2024-12-31,MAÑONGO - NAGUANAGUA,386,HUEVOS A GRANEL,259.63,56
32444,2024-12-31,SANTA CECILIA - VALENCIA,386,HUEVOS A GRANEL,270.04,17


In [116]:
# Keep only article 386 and rename the date/quantity columns to `ts`/`y`,
# the names the later cells pass as time and value columns.
# The rename is chained (no `inplace=True`) so the filtered result of `query`
# is never mutated in place.
df_product_filtered = (
    df
    .query('`codigo_articulo` == 386')
    .rename(columns={'fecha_venta': 'ts', 'cantidad': 'y'})
)

In [117]:
# Daily aggregation ('D'): sum `y` over all rows that share a calendar day.
# `freq` is also read by later cells when loading the series and building
# the metadata.
freq = 'D'
df = (
    df_product_filtered
    .groupby(pd.Grouper(key='ts', freq=freq))['y']
    .sum()
    .reset_index()
)

df

Unnamed: 0,ts,y
0,2022-03-15,1909
1,2022-03-16,1860
2,2022-03-17,1940
3,2022-03-18,2098
4,2022-03-19,2387
...,...,...
1018,2024-12-27,4246
1019,2024-12-28,4844
1020,2024-12-29,4893
1021,2024-12-30,6767


In [118]:
# Histogram of the aggregated `y` values in `df`.
# `px` (plotly.express) is imported in the notebook's import cell.
fig = px.histogram(df, x='y', title='Histograma de y')
fig.update_layout(xaxis_title='y', yaxis_title='Frecuencia')

fig.show()

In [119]:
# Not a DataFrame, although it looks like one: `load_data` is given `df`
# plus the time/value column names and the frequency.
ts = UnivariateTimeSeries()
ts.load_data(df=df, time_col="ts", value_col="y", freq=freq)
# Optional `load_data` arguments left disabled by the author:
#   anomaly_info=anomaly_info
#   regressor_cols=["sale_price"]

# Column names and frequency, passed to the forecast configs in later cells.
metadata = MetadataParam(
    time_col="ts",   # name of the time column
    value_col="y",   # name of the value column
    freq=freq,       # "H" for hourly, "D" for daily, "W" for weekly, etc.
)

In [120]:
# Default-constructed Greykite anomaly detector.
# NOTE(review): no later cell visible in this notebook reads
# `anomaly_detector`; the configured `detector` below is the one that is fitted.
anomaly_detector = GreykiteDetector()

ad_config = ADConfig()  # default anomaly detection config

forecast_config = ForecastConfig(
    model_template=ModelTemplateEnum.AUTO.name,
    forecast_horizon=7,   # forecasts 7 steps ahead
    coverage=None,        # confidence interval will be tuned by the AD model
    metadata_param=metadata,
)

detector = GreykiteDetector(
    forecast_config=forecast_config,
    ad_config=ad_config,
    reward=None,
)

In [121]:
# Wrap the aggregated series in a DetectorData container and fit the
# configured detector on it.
train_data = DetectorData(df=df)
detector.fit(data=train_data)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [122]:
# Print the fitted rows whose `is_anomaly_predicted` flag is False,
# i.e. the points the detector did not mark as anomalies.
fitted_df = detector.fitted_df
print(fitted_df[fitted_df["is_anomaly_predicted"] == False])  # noqa: E712 (element-wise comparison)

# Plot the training-phase fit. The title names the series actually used here
# (daily quantities for article 386) instead of the tutorial's dataset.
fig = detector.plot(
    phase="train",
    title="Greykite Detector - item 386 daily sales - fit phase")
plotly.io.show(fig)

             ts  actual     forecast  forecast_lower  forecast_upper  \
5    2022-03-20    2195  2161.634015     2095.184680     2228.083350   
21   2022-04-05    1901  1911.212136     1844.762801     1977.661471   
24   2022-04-08    2007  2003.845189     1937.395855     2070.294524   
32   2022-04-16    1926  1923.720120     1857.270785     1990.169455   
38   2022-04-22    2088  2096.575687     2030.126352     2163.025022   
...         ...     ...          ...             ...             ...   
976  2024-11-15    4283  4347.947761     4281.498427     4414.397096   
994  2024-12-03    4056  4059.002769     3992.553435     4125.452104   
1000 2024-12-09    3850  3805.984841     3739.535506     3872.434175   
1001 2024-12-10    3813  3787.703612     3721.254277     3854.152947   
1008 2024-12-17    4482  4461.364263     4394.914928     4527.813598   

      is_anomaly_predicted   z_score is_anomaly  
5                    False  0.031487       None  
21                   False -0.00963

In [123]:
# NOTE(review): every line of this cell is commented out, so it executes
# nothing. It is a disabled experiment that built an adjustment frame from the
# detector's z-scores (using the `actual` column as "y") and plotted it.
# Consider deleting the cell if the experiment is abandoned.
# # Filter anomalies
# import plotly.express


# df_fited = detector.fitted_df.rename(columns={"actual": "y"})

# # df_fited = df_fited[np.abs(df_fited['z_score']) > 2.5]
# # np.abs(df_fited['z_score'])
# anomaly_df = pd.DataFrame({
#     cst.START_TIME_COL: list(df_fited['ts']),
#     cst.END_TIME_COL: list(df_fited['ts']),
#     cst.ADJUSTMENT_DELTA_COL: [int(num) for num in list((df_fited['z_score'] * df['y'].mean())/ df_fited['z_score'].std())],
#     "y": df_fited['y'] # mask as NA
# })
# # Plot the detected anomalies and the actual value
# # anomaly_df.plot(
# #     x=cst.START_TIME_COL,
# #     y=["y", cst.ADJUSTMENT_DELTA_COL],
# #     title="Anomalías detectadas por Greykite Detector"
# # )
# fig = plotly.express.line(anomaly_df, x=cst.START_TIME_COL, y=["y", cst.ADJUSTMENT_DELTA_COL], title='Gráfico con Plotly Express')
# fig.show()


In [124]:
# NOTE(review): the detector's `forecast` column is renamed to "y" here, so
# every later use of `df_fited` works on the fitted values rather than on the
# observed `actual` column. Confirm this is intended.
df_fited = detector.fitted_df.rename(columns={"forecast": "y"})

# Histogram of that renamed column.
fig = px.histogram(df_fited, x='y', title='Histograma de y')
fig.update_layout(xaxis_title='y', yaxis_title='Frecuencia')

fig.show()

In [None]:
forecaster = Forecaster()

# Explicit component settings for the Silverkite run; each keyword argument
# carries the option dictionary for one model component.
model_components_param_silverkite = ModelComponentsParam(
    seasonality=dict(
        auto_seasonality=False,
        yearly_seasonality="auto",
        quarterly_seasonality="auto",
        monthly_seasonality="auto",
        weekly_seasonality="auto",
        daily_seasonality="auto",
    ),
    growth=dict(
        growth_term=GrowthColEnum.linear.name,
    ),
    events=dict(
        auto_holiday=False,
        holidays_to_model_separately="auto",
        holiday_lookup_countries="auto",
        holiday_pre_num_days=2,
        holiday_post_num_days=2,
        holiday_pre_post_num_dict=None,
        daily_event_df_dict=None,
    ),
    changepoints=dict(
        auto_growth=False,
        changepoints_dict=dict(
            method="auto",
            yearly_seasonality_order=15,
            resample_freq="3D",
            regularization_strength=0.6,
            actual_changepoint_min_distance="30D",
            potential_changepoint_distance="15D",
            no_changepoint_distance_from_end="90D",
        ),
        seasonality_changepoints_dict=None,
    ),
    autoregression=dict(
        autoreg_dict="auto",
        simulation_num=10,  # simulation is not triggered with ``autoreg_dict="auto"``
    ),
    regressors=dict(
        regressor_cols=[],
    ),
    lagged_regressors=dict(
        lagged_regressor_dict=None,
    ),
    uncertainty=dict(
        uncertainty_dict=None,
    ),
    custom=dict(
        fit_algorithm_dict=dict(
            fit_algorithm="ridge",
            fit_algorithm_params=None,
        ),
        feature_sets_enabled="auto",  # "auto" based on data freq and size
        max_daily_seas_interaction_order=5,
        max_weekly_seas_interaction_order=2,
        extra_pred_cols=[],
        drop_pred_cols=None,
        explicit_pred_cols=None,
        min_admissible_value=None,
        max_admissible_value=None,
        normalize_method="zero_to_one",
    ),
)


# Run the hand-configured Silverkite template on `df_fited` with 0.95 coverage.
# `forecast_horizon` and `evaluation_period_param` are not passed, so they stay
# at whatever ForecastConfig uses when they are omitted.
silverkite_config = ForecastConfig(
    model_template=ModelTemplateEnum.SILVERKITE.name,
    coverage=0.95,
    metadata_param=metadata,
    model_components_param=model_components_param_silverkite,
)
result = forecaster.run_forecast_config(df=df_fited, config=silverkite_config)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [126]:
# Second run on the same `df_fited`, using the "AUTO" model template and no
# other ForecastConfig field.
# NOTE(review): unlike the Silverkite run, `metadata_param` is not passed here,
# so column names and frequency come from the library defaults. Confirm that
# is what you want.
auto_config = ForecastConfig(model_template=ModelTemplateEnum.AUTO.name)
result_auto = forecaster.run_forecast_config(df=df_fited, config=auto_config)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [127]:
# Plot the forecast of the hand-configured Silverkite run (`result`).
result.forecast.plot()

In [128]:
# Plot the forecast of the AUTO-template run (`result_auto`) for comparison.
result_auto.forecast.plot()

In [129]:
# `df_test` frame of the Silverkite forecast result.
frecast_wo_2025_df = result.forecast.df_test

# Sum of the lower prediction bound over rows dated 2025-01-01 or later.
# `.loc[mask, column]` picks rows and column in one step instead of chained
# `[...][...]` indexing.
forecast_2025_mask = frecast_wo_2025_df['ts'] >= '2025-01-01'
frecast_wo_2025_df.loc[forecast_2025_mask, 'forecast_lower'].sum()

46908.77912402321

In [130]:
# Summary statistics of the frame taken from `result.forecast.df_test`.
frecast_wo_2025_df.describe()

Unnamed: 0,actual,forecast,forecast_lower,forecast_upper
count,0.0,10.0,10.0,10.0
mean,,5592.471211,4690.877912,6494.064509
std,,832.996628,883.379365,785.164682
min,,3673.17349,2680.068339,4666.278641
25%,,5292.135491,4358.879577,6209.050911
50%,,5708.609204,4812.292079,6643.13829
75%,,5956.367246,5140.074542,6772.65995
max,,6800.076268,5958.396443,7641.756092


In [131]:
# Backtest test-set evaluation of the Silverkite run, shown one metric per row.
backtest_metrics = pd.DataFrame(result.backtest.test_evaluation, index=["Value"])
print(backtest_metrics.T)

                                                           Value
CORR                                                    0.985604
R2                                                      0.966142
MSE                                                 47434.565306
RMSE                                                  217.794778
MAE                                                   164.558433
MedAE                                                 113.964677
MAPE                                                    3.467894
MedAPE                                                  2.855223
sMAPE                                                   1.714355
Q80                                                    64.296395
Q95                                                    55.304984
Q99                                                    52.907275
OutsideTolerance1p                                           0.8
OutsideTolerance2p                                           0.6
OutsideTolerance3p       

In [132]:
# Load the full export and rebuild the article-386 series at the same `freq`.
# The steps are chained without `inplace=True`, so no filtered intermediate
# frame is mutated in place.
retail_csv = pd.read_csv('./data_productos_ventas_3.csv')
df_real = (
    retail_csv
    .assign(fecha_venta=pd.to_datetime(retail_csv['fecha_venta']))
    .query('`codigo_articulo` == 386')
    .rename(columns={'fecha_venta': 'ts', 'cantidad': 'y'})
    .groupby(pd.Grouper(key='ts', freq=freq))
    .agg({'y': 'sum'})
    .reset_index()
)

# Load the series into a UnivariateTimeSeries and plot it.
ts2025 = UnivariateTimeSeries()
ts2025.load_data(df=df_real, time_col="ts", value_col="y", freq=freq)

ts2025.plot()

In [133]:
# Summary statistics of the aggregated `y` values dated 2025-01-01 or later.
real_2025_mask = df_real['ts'] >= '2025-01-01'
df_real.loc[real_2025_mask, 'y'].describe()

count      73.000000
mean     3985.821918
std      1510.575436
min         0.000000
25%      3023.000000
50%      3699.000000
75%      4514.000000
max      9241.000000
Name: y, dtype: float64