# TSA for the n_transactions per day

In [33]:
# Prophet model with additional regressor the working_hours of each store
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error
sns.set_style('whitegrid')


In [31]:

# Load your preprocessed dataset
data = pd.read_parquet('n_forecast_preprocessed.parquet', engine='pyarrow', dtype_backend='numpy_nullable')


# Convert the 'sales_date' column to a datetime type
data['sales_date'] = pd.to_datetime(data['sales_date'])

# Group data by 'store_hashed' and 'sales_date' to get daily transaction counts per store
daily_sales = data.groupby(['store_hashed', 'sales_date'])['n_transactions'].sum().reset_index()

# Rename columns for Prophet's input format
daily_sales.rename(columns={'sales_date': 'ds', 'n_transactions': 'y'}, inplace=True)

# Initialize Prophet models for each store
models = {}
for store in daily_sales['store_hashed'].unique():
    store_data = daily_sales[daily_sales['store_hashed'] == store]
    model = Prophet(yearly_seasonality=False, weekly_seasonality=True, daily_seasonality=False)
    model.fit(store_data)
    models[store] = model

# Create a dataframe for future dates for forecasting (50 days ahead)
future_dates = pd.DataFrame({'ds': pd.date_range(start=daily_sales['ds'].max(), periods=51, freq='D')[1:]})


10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:46 - cmdstanpy - INFO - Chain [1] done processing
10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:46 - cmdstanpy - INFO - Chain [1] done processing
10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:46 - cmdstanpy - INFO - Chain [1] done processing
10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:46 - cmdstanpy - INFO - Chain [1] done processing
10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:46 - cmdstanpy - INFO - Chain [1] done processing
10:22:46 - cmdstanpy - INFO - Chain [1] start processing
10:22:47 - cmdstanpy - INFO - Chain [1] done processing
10:22:47 - cmdstanpy - INFO - Chain [1] start processing
10:22:47 - cmdstanpy - INFO - Chain [1] done processing
10:22:47 - cmdstanpy - INFO - Chain [1] start processing
10:22:47 - cmdstanpy - INFO - Chain [1] done processing
10:22:47 - cmdstanpy - INFO - Chain [1] start processing
10:22:48 - cmdstanpy - INFO - Chain [1]

In [None]:
# Create a dataframe for future dates for forecasting (50 days ahead)
future_dates = pd.DataFrame({'ds': pd.date_range(start=daily_sales['ds'].max(), periods=51, freq='D')[1:]})

In [35]:
# Initialize lists to store MAPE and RMSE results
mape_scores = []
rmse_scores = []

# Forecast for each store and calculate metrics
forecast_data = pd.DataFrame(columns=['store_hashed', 'ds', 'yhat'])

# Forecast for each store and calculate metrics
for store, model in models.items():
    future = model.predict(future_dates)
    actual = daily_sales[daily_sales['store_hashed'] == store].set_index('ds')['y']
    predicted = future.set_index('ds')['yhat']
    
    # Calculate MAPE
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    mape_scores.append((store, mape))
    
    # # Calculate RMSE
    # common_dates = actual.index.intersection(predicted.index)
    # rmse = np.sqrt(mean_squared_error(actual[common_dates], predicted[common_dates]))
    # rmse_scores.append((store, rmse))


In [40]:
# Print MAPE and RMSE scores
for store, mape in mape_scores:
    print(f'Store {store} MAPE: {mape:.2f}%')


Store 01eee509ee2f68dc6014898c309e86bf MAPE: <NA>%
Store 021bbc7ee20b71134d53e20206bd6feb MAPE: <NA>%
Store 022898bbc7110244fd24b3e410597047 MAPE: <NA>%
Store 0233f3bb964cf325a30f8b1c2ed2da93 MAPE: <NA>%
Store 033daef61ea8721921fbbeebb6f87313 MAPE: <NA>%
Store 03793ef7d06ffd63d34ade9d091f1ced MAPE: <NA>%
Store 03db60c2331018b18c4166c1787072fe MAPE: <NA>%
Store 03e4d3f831100d4355663f3d425d716b MAPE: <NA>%
Store 0424d20160a6a558e5bf86a7bc9b67f0 MAPE: <NA>%
Store 048e2f1447691907b18b2a37e7ed2322 MAPE: <NA>%
Store 05311655a15b75fab86956663e1819cd MAPE: <NA>%
Store 05a70454516ecd9194c293b0e415777f MAPE: <NA>%
Store 0663a4ddceacb40b095eda264a85f15c MAPE: <NA>%
Store 069654d5ce089c13f642d19f09a3d1c0 MAPE: <NA>%
Store 06a15eb1c3836723b53e4abca8d9b879 MAPE: <NA>%
Store 07042ac7d03d3b9911a00da43ce0079a MAPE: <NA>%
Store 08040837089cdf46631a10aca5258e16 MAPE: <NA>%
Store 08fe2621d8e716b02ec0da35256a998d MAPE: <NA>%
Store 0912d0f15f1394268c66639e39b26215 MAPE: <NA>%
Store 0966289037ad9846c5e994be2

In [None]:

# Plot the time series for each store
plt.figure(figsize=(12, 6))
for store, store_data in forecast_data.groupby('store_hashed'):
    plt.plot(store_data['ds'], store_data['yhat'], label=f'Store {store}')
plt.xlabel('Date')
plt.ylabel('Forecasted Transactions')
plt.title('Forecasted Transactions per Store')
plt.legend()
plt.grid(True)

# Show the plot
plt.show()

# Print MAPE and RMSE scores
for store, mape in mape_scores:
    print(f'Store {store} MAPE: {mape:.2f}%')

for store, rmse in rmse_scores:
    print(f'Store {store} RMSE: {rmse:.2f}')
