In [1]:
!pip install statsmodels

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()
 
table_name = '"FACT_CUSTOMER_ORDERS"'
 
sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [4]:
data = df[['PRODUCT_ID','ORDER_DATE','ORDER_QTY']]

In [5]:
data

Unnamed: 0,PRODUCT_ID,ORDER_DATE,ORDER_QTY
0,Product_001,2019-01-02,850
1,Product_004,2019-01-02,700
2,Product_004,2019-01-02,900
3,Product_004,2019-01-02,550
4,Product_004,2019-01-02,300
...,...,...,...
225558,Product_098,2024-08-31,200
225559,Product_100,2024-08-31,800
225560,Product_100,2024-08-31,550
225561,Product_100,2024-08-31,850


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225563 entries, 0 to 225562
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PRODUCT_ID  225563 non-null  object
 1   ORDER_DATE  225563 non-null  object
 2   ORDER_QTY   225563 non-null  int16 
dtypes: int16(1), object(2)
memory usage: 3.9+ MB


In [7]:
data['ORDER_DATE'] = pd.to_datetime(data['ORDER_DATE'])

In [17]:
data = data.set_index('ORDER_DATE')

In [18]:
data

Unnamed: 0_level_0,PRODUCT_ID,ORDER_QTY
ORDER_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,Product_001,850
2019-01-02,Product_004,700
2019-01-02,Product_004,900
2019-01-02,Product_004,550
2019-01-02,Product_004,300
...,...,...
2024-08-31,Product_098,200
2024-08-31,Product_100,800
2024-08-31,Product_100,550
2024-08-31,Product_100,850


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 225563 entries, 2019-01-02 to 2024-08-31
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PRODUCT_ID  225563 non-null  object
 1   ORDER_QTY   225563 non-null  int16 
dtypes: int16(1), object(1)
memory usage: 3.9+ MB


In [20]:
data.sort_index(inplace=True)

In [21]:
# Group data by 'product_id'
grouped_data = data.groupby('PRODUCT_ID')

In [22]:
# Initialize an empty DataFrame for forecast results
test_results = pd.DataFrame(columns=['product_id','order_date', 'test_forecast_orders','rse', 'rmse','r2_score'])

In [23]:
# Initialize an empty DataFrame for forecast results
forecast_results = pd.DataFrame(columns=['product_id','order_date', 'forecast_order_qty'])

In [30]:
for product_id, group in grouped_data:
    # Sort data by order date
    group.sort_index(inplace=True)

    
    # Split data into train and test sets (80% train, 20% test)
    train_size = int(0.8 * len(group))
    train_data, test_data = group.iloc[:train_size], group.iloc[train_size:]
    
    # Create SARIMAX model
    model = SARIMAX(train_data['ORDER_QTY'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    model_fit = model.fit(disp=False)
    
    # Forecast the next period
    test_model = model_fit.get_forecast(steps=len(test_data))
    test_df = test_model.predicted_mean

    # Evaluate the model
    rse = np.sqrt(((test_df - test_data) ** 2).sum().sum() / (test_data.shape[0] * test_data.shape[1] - len(model_fit.params)))
    rmse = np.sqrt(mean_squared_error(test_data['ORDER_QTY'], test_df))
    r2 = r2_score(test_data['ORDER_QTY'], test_df)

    # Convert index to date format
    test_df.index = pd.to_datetime(test_df.index)
    date_list = test_df.index.strftime('%Y-%m-%d').to_list()
    
    new_row = {
        'product_id': product_id,
        'order_date': date_list,
        'test_forecast_orders': test_df,
        'rse': rse,
        'rmse': rmse,
        'r2_score': r2
    }
    
    df_new = pd.DataFrame(new_row)
    test_results = pd.concat([test_results, df_new], ignore_index=True)
    
    # Forecast future values
    forecast = model_fit.get_forecast(steps=25)
    forecast_df = forecast.predicted_mean
    
    # Convert index to date format
    forecast_df.index = pd.to_datetime(forecast_df.index)
    date_list2 = forecast_df.index.strftime('%Y-%m-%d').to_list()
    
    new_row2 = {
        'product_id': product_id,
        'order_date': date_list2,
        'forecast_order_qty': forecast_df,
    }
    
    df_new2 = pd.DataFrame(new_row2)
    forecast_results = pd.concat([forecast_results, df_new2], ignore_index=True)


KeyboardInterrupt: 

In [25]:
test_results

Unnamed: 0,product_id,order_date,test_forecast_orders,rse,rmse,r2_score
0,Product_001,1970-01-01,537.353131,0.0,290.185585,-0.000545
1,Product_001,1970-01-01,549.804686,0.0,290.185585,-0.000545
2,Product_001,1970-01-01,538.368357,0.0,290.185585,-0.000545
3,Product_001,1970-01-01,553.529134,0.0,290.185585,-0.000545
4,Product_001,1970-01-01,525.990260,0.0,290.185585,-0.000545
...,...,...,...,...,...,...
45125,Product_100,1970-01-01,476.050933,0.0,295.283780,-0.021838
45126,Product_100,1970-01-01,522.781958,0.0,295.283780,-0.021838
45127,Product_100,1970-01-01,475.125436,0.0,295.283780,-0.021838
45128,Product_100,1970-01-01,498.165467,0.0,295.283780,-0.021838


In [26]:
test_df.index

DatetimeIndex(['1970-01-01 00:00:00.000003112',
               '1970-01-01 00:00:00.000003113',
               '1970-01-01 00:00:00.000003114',
               '1970-01-01 00:00:00.000003115',
               '1970-01-01 00:00:00.000003116',
               '1970-01-01 00:00:00.000003117',
               '1970-01-01 00:00:00.000003118',
               '1970-01-01 00:00:00.000003119',
               '1970-01-01 00:00:00.000003120',
               '1970-01-01 00:00:00.000003121',
               ...
               '1970-01-01 00:00:00.000003881',
               '1970-01-01 00:00:00.000003882',
               '1970-01-01 00:00:00.000003883',
               '1970-01-01 00:00:00.000003884',
               '1970-01-01 00:00:00.000003885',
               '1970-01-01 00:00:00.000003886',
               '1970-01-01 00:00:00.000003887',
               '1970-01-01 00:00:00.000003888',
               '1970-01-01 00:00:00.000003889',
               '1970-01-01 00:00:00.000003890'],
              dtype=

In [27]:
data

Unnamed: 0_level_0,PRODUCT_ID,ORDER_QTY
ORDER_DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,Product_001,850
2019-01-02,Product_004,700
2019-01-02,Product_004,900
2019-01-02,Product_004,550
2019-01-02,Product_004,300
...,...,...
2024-08-31,Product_098,200
2024-08-31,Product_100,800
2024-08-31,Product_100,550
2024-08-31,Product_100,850


In [28]:
data.index

DatetimeIndex(['2019-01-02', '2019-01-02', '2019-01-02', '2019-01-02',
               '2019-01-02', '2019-01-02', '2019-01-02', '2019-01-02',
               '2019-01-02', '2019-01-02',
               ...
               '2024-08-31', '2024-08-31', '2024-08-31', '2024-08-31',
               '2024-08-31', '2024-08-31', '2024-08-31', '2024-08-31',
               '2024-08-31', '2024-08-31'],
              dtype='datetime64[ns]', name='ORDER_DATE', length=225563, freq=None)

In [29]:
grouped_data.index

AttributeError: 'DataFrameGroupBy' object has no attribute 'index'

In [32]:
for product_id, group in grouped_data:
    # Sort data by order date
    group.sort_index(inplace=True)

    print({group.index})

TypeError: unhashable type: 'DatetimeIndex'

In [None]:
for product_id, group in grouped_data:
    # Sort data by order date
    group.sort_index(inplace=True)

    train_size = int(0.8 * len(group))
    train_data, test_data = group.iloc[:train_size], group.iloc[train_size:]

    #print(f'Train_data:{train_data.index}')
    #print(f'Test_data:{test_data.index}')

    model = SARIMAX(train_data['ORDER_QTY'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    model_fit = model.fit(disp=False)
    
    # Forecast the next period
    test_model = model_fit.get_forecast(steps=len(test_data))
    test_df = test_model.predicted_mean

    date_list = test_df.index.to_list()
   
    print(f'Datelist:{date_list}')


Datelist:[13530, 13531, 13532, 13533, 13534, 13535, 13536, 13537, 13538, 13539, 13540, 13541, 13542, 13543, 13544, 13545, 13546, 13547, 13548, 13549, 13550, 13551, 13552, 13553, 13554, 13555, 13556, 13557, 13558, 13559, 13560, 13561, 13562, 13563, 13564, 13565, 13566, 13567, 13568, 13569, 13570, 13571, 13572, 13573, 13574, 13575, 13576, 13577, 13578, 13579, 13580, 13581, 13582, 13583, 13584, 13585, 13586, 13587, 13588, 13589, 13590, 13591, 13592, 13593, 13594, 13595, 13596, 13597, 13598, 13599, 13600, 13601, 13602, 13603, 13604, 13605, 13606, 13607, 13608, 13609, 13610, 13611, 13612, 13613, 13614, 13615, 13616, 13617, 13618, 13619, 13620, 13621, 13622, 13623, 13624, 13625, 13626, 13627, 13628, 13629, 13630, 13631, 13632, 13633, 13634, 13635, 13636, 13637, 13638, 13639, 13640, 13641, 13642, 13643, 13644, 13645, 13646, 13647, 13648, 13649, 13650, 13651, 13652, 13653, 13654, 13655, 13656, 13657, 13658, 13659, 13660, 13661, 13662, 13663, 13664, 13665, 13666, 13667, 13668, 13669, 13670, 136