In [2]:
pip install statsmodels

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error, r2_score

In [23]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = 'Fact_Customer_Orders'
 
sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [24]:
df.columns

Index(['ORDER_ID', 'PRODUCT_ID', 'ORDER_QTY', 'ORDER_DATE'], dtype='object')

In [59]:
data=data.groupby('PRODUCT_ID','ORDER_DATE').agg({'ORDER_QTY':'sum'}).reset_index()

ValueError: No axis named ORDER_DATE for object type DataFrame

In [25]:
data = df[['PRODUCT_ID','ORDER_QTY','ORDER_DATE']]

In [26]:
#Convert order_date to datetime format
data['ORDER_DATE'] = pd.to_datetime(data['ORDER_DATE'])

In [58]:
data.columns

Index(['PRODUCT_ID', 'ORDER_QTY', 'ORDER_DATE'], dtype='object')

In [57]:
#Aggregate data by date and sum of quantities per day
data_daily= data.groupby('PRODUCT_ID','ORDER_DATE').agg({'ORDER_QTY':'sum'}).reset_index()

ValueError: No axis named ORDER_DATE for object type DataFrame

In [30]:
#set order_date as index required for time series analysis
data_daily.set_index('ORDER_DATE',inplace=True)

In [44]:
data_daily

Unnamed: 0_level_0,ORDER_QTY
ORDER_DATE,Unnamed: 1_level_1
2019-01-02,11600
2019-01-03,12150
2019-01-04,10100
2019-01-05,104650
2019-01-06,109250
...,...
2024-08-26,66850
2024-08-27,63650
2024-08-28,73200
2024-08-30,400


In [45]:
data_daily.sort_index(inplace=True)

In [46]:
data_daily

Unnamed: 0_level_0,ORDER_QTY
ORDER_DATE,Unnamed: 1_level_1
2019-01-02,11600
2019-01-03,12150
2019-01-04,10100
2019-01-05,104650
2019-01-06,109250
...,...
2024-08-26,66850
2024-08-27,63650
2024-08-28,73200
2024-08-30,400


In [56]:
# Group data by 'product_id'
grouped_data = data.groupby('PRODUCT_ID')

In [48]:
# Initialize an empty DataFrame for forecast results
test_results = pd.DataFrame(columns=['PRODUCT_ID','ORDER_DATE', 'test_forecast_order_qty','rse', 'rmse','r2_score'])

In [49]:
# Initialize an empty DataFrame for forecast results
forecast_results = pd.DataFrame(columns=['PRODUCT_ID','ORDER_DATE', 'forecast_order_qty'])

In [None]:
for PRODUCT_ID, group in grouped_data:
    # Sort data by order date
    group.sort_index(inplace=True)
    
    # Split data into train and test sets (80% train, 20% test)
    train_size = int(0.8 * len(group))
    train_data, test_data = group.iloc[:train_size], group.iloc[train_size:]
    
    # Create SARIMAX model
    model = SARIMAX(train_data['ORDER_QTY'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    model_fit = model.fit(disp=False)
    
    # Forecast the next period
    test_model = model_fit.get_forecast(steps=len(test_data))
    test_df = test_model.predicted_mean
    
    # Calculate RMSE for order quantity
    #rmse_order_qty = np.sqrt(mean_squared_error(test_data['ORDER_QTY'], forecast_order_qty))
    
    # Evaluate the model
    rse = np.sqrt(((test_df - test_data) ** 2).sum().sum() / (test_data.shape[0] * test_data.shape[1] - len(model_fit.params)))
    rmse = np.sqrt(mean_squared_error(test_data['ORDER_QTY'], test_df))
    r2 = r2_score(test_data['ORDER_QTY'], test_df)

    
    date_list = test_df.index.to_list()
    
    new_row = {
        'PRODUCT_ID': PRODUCT_ID,
        'ORDER_DATE': date_list,
        'test_forecast_order_qty': test_df,
        'rse': rse,
        'rmse': rmse,
        'r2_score': r2
    }
    
    df_new = pd.DataFrame(new_row)
    test_results=pd.concat([test_results,df_new],ignore_index=True)
    
    # Forecast future values
    forecast = model_fit.get_forecast(steps=25)#, steps=n_forecast)
    forecast_df = forecast.predicted_mean
    
    date_list2= forecast_df.index.to_list()
    
    new_row2 = {
        'PRODUCT_ID': PRODUCT_ID,
        'ORDER_DATE': date_list2,
        'forecast_order_qty': forecast_df,
    }
    
    df_new2 = pd.DataFrame(new_row2)
    
    forecast_results=pd.concat([forecast_results,df_new2],ignore_index=True)