In [36]:
#ignore the warnings of pandas 
import warnings
warnings.filterwarnings("ignore")

In [37]:
import pandas as pd
# Load the raw training rows.
# NOTE(review): the columns used later (city, shop, brand, container, date,
# price, quantity) are inferred from the following cells - confirm against
# train.csv.
df_data = pd.read_csv("./data/train.csv")
# Drop rows in which every column is missing. The explicit .copy() gives `df`
# its own data: later cells add columns to `df`, and writing into a filtered
# slice of `df_data` is what raises pandas' SettingWithCopyWarning (currently
# hidden by the global warning filter).
df = df_data.dropna(how="all").copy()

In [38]:
# Number every (city, shop, brand, container) combination with a 1-based
# integer id, stored in the 'UUID' column (a group number, not an RFC 4122 UUID).
# NOTE(review): groupby excludes missing keys by default, so rows with a
# missing key column do not belong to any group - confirm how they should be
# identified.
id_keys = ['city', 'shop', 'brand', 'container']
df['UUID'] = df.groupby(id_keys).ngroup().add(1)
# Revenue per row: unit price times quantity, both coerced to float.
df['sales'] = df['price'].astype(float).mul(df['quantity'].astype(float))

In [39]:
# Sum sales per series and date, keeping the descriptive columns, then parse
# the 'date' strings into datetimes. Note that this rebinds `df_data` from the
# raw rows to the aggregated frame.
agg_keys = ['UUID', 'date', 'city', 'shop', 'brand', 'container']
df_data = (
    df.groupby(agg_keys)['sales']
    .sum()
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [40]:
# Total sales per date and series id; the forecasting loop reads this frame.
grouped_df = (
    df_data
    .groupby(['date', 'UUID'], as_index=False)['sales']
    .sum()
)


In [41]:
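# Dependency: the forecasting cell below needs the `prophet` package.
# Install it once into the kernel's environment with: %pip install prophet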


In [43]:
from prophet import Prophet
import matplotlib.pyplot as plt

# STEP 6: Forecast each series with Prophet.
# Prophet is fitted on a two-column frame: 'ds' (timestamp) and 'y' (value).
series_forecasts = []  # one frame per fitted series, concatenated once below

# groupby(sort=False) yields the series in order of first appearance - the same
# order `grouped_df['UUID'].unique()` gives - without rescanning the whole
# frame for every id.
for uuid, series_rows in grouped_df.groupby('UUID', sort=False):
    temp_df = series_rows[['date', 'sales']].rename(columns={'date': 'ds', 'sales': 'y'})

    if len(temp_df) < 2:
        continue  # Skip if too few data points

    model = Prophet()
    model.fit(temp_df)

    # Forecast 12 additional month-end ('M') periods after the last observation.
    # make_future_dataframe keeps the historical dates by default, so `forecast`
    # holds the in-sample fit as well as the 12 future points.
    # NOTE(review): the later evaluation merges on the exact 'date', so these
    # month-end stamps must coincide with the dates in the test data - confirm.
    # NOTE(review): depending on the installed pandas version the 'M' alias may
    # be reported as deprecated in favour of 'ME' - confirm before upgrading.
    future = model.make_future_dataframe(periods=12, freq='M')
    forecast = model.predict(future)
    # .assign builds a new frame instead of writing into a column slice.
    forecast = forecast[["yhat", "ds"]].assign(UUID=uuid)
    series_forecasts.append(forecast)

if series_forecasts:
    # A single concat avoids re-copying the accumulated frame on every iteration.
    forecasts = pd.concat(series_forecasts, ignore_index=True)
else:
    # No series had enough history: keep the expected columns so the later
    # rename/merge on 'UUID' still finds them.
    forecasts = pd.DataFrame(columns=["yhat", "ds", "UUID"])


22:55:37 - cmdstanpy - INFO - Chain [1] start processing
22:55:37 - cmdstanpy - INFO - Chain [1] done processing
22:55:37 - cmdstanpy - INFO - Chain [1] start processing
22:55:37 - cmdstanpy - INFO - Chain [1] done processing
22:55:37 - cmdstanpy - INFO - Chain [1] start processing
22:55:38 - cmdstanpy - INFO - Chain [1] done processing
22:55:38 - cmdstanpy - INFO - Chain [1] start processing
22:55:38 - cmdstanpy - INFO - Chain [1] done processing
22:55:38 - cmdstanpy - INFO - Chain [1] start processing
22:55:38 - cmdstanpy - INFO - Chain [1] done processing
22:55:38 - cmdstanpy - INFO - Chain [1] start processing
22:55:38 - cmdstanpy - INFO - Chain [1] done processing
22:55:38 - cmdstanpy - INFO - Chain [1] start processing
22:55:38 - cmdstanpy - INFO - Chain [1] done processing
22:55:38 - cmdstanpy - INFO - Chain [1] start processing
22:55:39 - cmdstanpy - INFO - Chain [1] done processing
22:55:39 - cmdstanpy - INFO - Chain [1] start processing
22:55:39 - cmdstanpy - INFO - Chain [1]

In [44]:
# Use the source data's column name ('date') instead of Prophet's 'ds'.
forecasts = forecasts.rename(columns={'ds': 'date'})
# Merge the descriptive columns of each series into the forecast dataframe.
series_details = df_data[['UUID', 'city', 'shop', 'brand', 'container']].drop_duplicates()
forecasts = forecasts.merge(series_details, on='UUID', how='left')

In [46]:
# Sanity check: number of distinct series ids present in the forecasts.
forecasts["UUID"].nunique(dropna=True)

90

In [50]:
# Load the test data and aggregate it to one row per date and series.
test_keys = ['date', 'city', 'shop', 'brand', 'container']
test_rows = pd.read_csv("./data/test.csv")
test_rows['date'] = pd.to_datetime(test_rows['date'])
# Revenue per row, computed the same way as for the training data.
test_rows['sales'] = test_rows['price'].astype(float) * test_rows['quantity'].astype(float)
# 'date' is already datetime64 at this point and stays so through the groupby.
df_test = test_rows.groupby(test_keys)['sales'].sum().reset_index()
# Left join keeps every test row; a row with no forecast for exactly that
# date and series gets NaN in 'yhat' ...
df_test = df_test.merge(forecasts, on=test_keys, how='left')
# ... which is then treated as a forecast of 0.
df_test['yhat'] = df_test['yhat'].fillna(0)




In [52]:
# Signed forecast error: actual sales minus predicted sales.
df_test['error'] = df_test['sales'] - df_test['yhat']
# Percentage error relative to actual sales. A row with zero sales has no
# defined percentage, so its denominator is masked to NaN. Masking with .where
# keeps the column float64; replacing 0 with pd.NA would turn the denominator
# into an object column and the percentages would stop being numeric.
nonzero_sales = df_test['sales'].where(df_test['sales'] != 0)
df_test['percentage_error'] = df_test['error'] / nonzero_sales * 100

# Save the results to a CSV file
result_columns = ['date', 'city', 'shop', 'brand', 'container', 'sales', 'yhat', 'error', 'percentage_error']
df_test[result_columns].to_csv("./data/forecast_results.csv", index=False)
