# Libraries and Functions

In [0]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from statsmodels.tsa.holtwinters import SimpleExpSmoothing, Holt, ExponentialSmoothing

In [0]:
def evaluate_forecast(test, preds):
    """
    Print and return the regression metrics of a forecast.

    Parameters:
    test: the observed values for the evaluation period.
    preds: the predicted values for the same period.

    Returns:
    pd.Series holding the r2, mse and mae scores, indexed by 'r2', 'mse', 'mae'.
    """
    # Score once, then reuse the values for both the printout and the result.
    r2 = r2_score(test, preds)
    mse = mean_squared_error(test, preds)
    mae = mean_absolute_error(test, preds)

    for label, value in (("r2: ", r2), ("mse: ", mse), ("mae: ", mae)):
        print(label, value)

    return pd.Series([r2, mse, mae], index=['r2', 'mse', 'mae'])

In [0]:
def evaluate_forecast_various_periods(test, preds):
    """
    Evaluate the forecast over several horizons: the first 10 days, the first
    month, the first two months, and the whole test period.

    The horizon lengths (10, 31 and 31+28 observations) and their labels
    ("january", "jan-feb") are hard-coded, so the labels are only meaningful
    when the test period starts on January 1st of a non-leap year.

    Parameters:
    test: the test dataset, with the original values from the time series.
    preds: the predicted values for the test period.

    Returns:
    pd.DataFrame with a "metric" column ('r2', 'mse', 'mae') and one column of
    scores per horizon ("10_days", "january", "jan-feb", "overall"). The same
    table is also printed.
    """
    # A horizon of None means "no truncation": seq[:None] is the full sequence.
    horizons = {"10_days": 10, "january": 31, "jan-feb": 31 + 28, "overall": None}
    scorers = {"r2": r2_score, "mse": mean_squared_error, "mae": mean_absolute_error}

    eval_dict = {"metric": list(scorers)}
    for label, n_obs in horizons.items():
        eval_dict[label] = [
            scorer(test[:n_obs], preds[:n_obs]) for scorer in scorers.values()
        ]

    eval_df = pd.DataFrame(eval_dict)
    print(eval_df)

    # Hand the table back as well, so callers can store or compare results
    # (consistent with evaluate_forecast, which prints and returns).
    return eval_df

In [0]:
# Function to plot original data and predictions
def plot_predictions(df, preds, test_size=90):
    """
    Plot the original series together with the predictions for its last
    `test_size` observations.

    Parameters:
    df (pd.Series): The original time series data.
    preds (pd.Series): The predicted values for the test period.
    test_size (int): Number of test observations used for evaluation. Default is 90.

    Raises:
    ValueError: if the number of predictions differs from the number of
        trailing observations of `df` selected by `test_size`.
    """
    # Predictions are drawn against the last `test_size` index values of df,
    # so both must have the same length; fail early with a clear message
    # (and before an empty figure is created).
    test_index = df.index[-test_size:]
    if len(preds) != len(test_index):
        raise ValueError(
            f"preds has {len(preds)} values but test_size={test_size} selects "
            f"{len(test_index)} observations from df"
        )

    # Create the plot
    plt.figure(figsize=(20, 5))
    sns.lineplot(x=df.index, y=df.values, label='Original Data')
    sns.lineplot(x=test_index, y=preds.values, label='Predictions')

    # Add labels and title
    plt.xticks(rotation=45)
    plt.xlabel('Date')
    plt.ylabel('Values')
    plt.title('Forecast vs Actual')
    plt.legend()
    plt.show()

# Import Dataset

In [0]:
# Read the workspace.timeseries.train2 table and materialise it as a pandas DataFrame.
# NOTE(review): `spark` is not defined in this notebook - presumably the session
# object injected by the runtime; confirm before running elsewhere.
df = spark.table("workspace.timeseries.train2").toPandas()

In [0]:
# Parse the date column into pandas datetimes; it is used as the time index further down.
df['date'] = pd.to_datetime(df['date'])

In [0]:
df.head()
# The dataset contains data from the Guayas region only.
# Rows with unit_sales == 0 have been removed: if a product was not sold on a given day, there is no row for that product on that day.
# The data span 2013-01-02 to 2014-03-31.

In [0]:
# Summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

# Exponential Smoothing

In [0]:
# Total units sold per date, summed over all rows sharing that date.
sales_by_date = (
    df
    .groupby('date')['unit_sales']
    .sum()
)

In [0]:
# Assign a daily frequency to sales_by_date; calendar days absent from the
# index are inserted (as NaN, per the pandas asfreq default).
sales_by_date = sales_by_date.asfreq(freq='D')

In [0]:
# Quick look at the first three days of the daily series.
sales_by_date.head(3)

In [0]:
# Fit on everything up to the end of 2013 and hold out 2014 for evaluation.
# Missing values in the training window are replaced by a small positive
# constant (presumably so the multiplicative seasonal models below receive
# strictly positive data - TODO confirm).
train = sales_by_date.loc[:'2013-12-31'].fillna(1e-3)

# NOTE(review): the hold-out window is left unfilled, unlike train; confirm it
# contains no NaN before it is passed to the sklearn metrics.
test = sales_by_date.loc['2014-01-01':]

In [0]:
# ADDITIVE seasonal model
# Additive trend, additive seasonality with a 7-observation (weekly) period, no damping.

mod_hw = ExponentialSmoothing(
    train,
    trend='add',
    seasonal='add',
    seasonal_periods=7,
    damped_trend=False,
).fit()
# Forecast exactly as many steps as there are test observations, instead of a
# hard-coded 90, so forecast, metrics and plot stay aligned with the test window.
preds = mod_hw.forecast(len(test))
add_false = evaluate_forecast(test, preds)
plot_predictions(sales_by_date, preds, test_size=len(test))

In [0]:
# ADDITIVE seasonal model with DAMPING
# Additive trend (damped), additive seasonality with a 7-observation (weekly) period.

mod_hw = ExponentialSmoothing(
    train,
    trend='add',
    seasonal='add',
    seasonal_periods=7,
    damped_trend=True,
).fit()
# Forecast exactly as many steps as there are test observations, instead of a
# hard-coded 90, so forecast, metrics and plot stay aligned with the test window.
preds = mod_hw.forecast(len(test))
add_true = evaluate_forecast(test, preds)
plot_predictions(sales_by_date, preds, test_size=len(test))

In [0]:
# MULTIPLICATIVE seasonal model
# Additive trend, multiplicative seasonality with a 7-observation (weekly) period, no damping.

mod_hw = ExponentialSmoothing(
    train,
    trend='add',
    seasonal='mul',
    seasonal_periods=7,
    damped_trend=False,
).fit()
# Forecast exactly as many steps as there are test observations, instead of a
# hard-coded 90, so forecast, metrics and plot stay aligned with the test window.
preds = mod_hw.forecast(len(test))
mul_false = evaluate_forecast(test, preds)
plot_predictions(sales_by_date, preds, test_size=len(test))

In [0]:
# MULTIPLICATIVE seasonal model with DAMPING
# Additive trend (damped), multiplicative seasonality with a 7-observation (weekly) period.

mod_hw = ExponentialSmoothing(
    train,
    trend='add',
    seasonal='mul',
    seasonal_periods=7,
    damped_trend=True,
).fit()
# Forecast exactly as many steps as there are test observations, instead of a
# hard-coded 90, so forecast, metrics and plot stay aligned with the test window.
preds = mod_hw.forecast(len(test))
mul_true = evaluate_forecast(test, preds)
plot_predictions(sales_by_date, preds, test_size=len(test))

In [0]:
# Print the metrics of the four Holt-Winters variants one after another,
# each preceded by its heading (blank line between consecutive variants).
for _heading, _scores in (
    ("additive", add_false),
    ("\nadditive with damping", add_true),
    ("\nmultiplicative", mul_false),
    ("\nmultiplicative with damping", mul_true),
):
    print(_heading)
    print(_scores)

# Saving Data

In [0]:
#spark.sql("DROP TABLE IF EXISTS workspace.timeseries.train2")

In [0]:
#spark_df = spark.createDataFrame(df)
#spark_df.write.format("delta").mode("overwrite").saveAsTable("workspace.timeseries.train2")