In [None]:
# Mount Google Drive at /content/drive so the dataset and the saved model
# files referenced elsewhere in this notebook (under /content/drive/MyDrive/...)
# are reachable. The google.colab module is only available inside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def save_forecast_results(model_results, steps=12, last_date='2022-02-28'):
    """
    Forecast future months and return the values on the original (un-logged) scale.

    The models in this notebook are fitted on ``log(VALUE + 1)``, so the predicted
    means are mapped back with ``expm1`` (``exp(x) - 1``), the exact inverse of
    that transform. Nothing is written to disk; the forecasts are only returned.

    Parameters:
    - model_results (object): Fitted results object exposing ``get_forecast(steps=...)``
      whose return value has a ``predicted_mean`` attribute.
    - steps (int, optional): Number of future time points to forecast. Default is 12.
    - last_date (str, optional): The last date in the historical data. The first
      forecast label is the month following this date. Default is '2022-02-28'.

    Returns:
    - dict: Maps each future month, formatted as "%b-%Y" (e.g. "Mar-2022"), to its
      forecasted value, in chronological order.
    """
    # Generate forecast based on the provided number of steps
    forecast = model_results.get_forecast(steps=steps)

    # Undo log(VALUE + 1). A plain exp() would leave every forecast too high by 1.
    forecasted_values = np.expm1(forecast.predicted_mean)

    # Month-end stamps: request one extra period and drop the first one, which is
    # the month-end of last_date's own month. The MonthEnd offset object is used
    # instead of the 'M' string alias, which newer pandas versions deprecate.
    future_dates = pd.date_range(start=last_date, periods=steps + 1, freq=pd.offsets.MonthEnd())[1:]

    # Pair each future month label with its forecasted value
    return {date.strftime("%b-%Y"): value for date, value in zip(future_dates, forecasted_values)}

In [None]:
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error


def preprocess_and_fit_model(data, product_name, split_date):
    """
    Fit a SARIMA model for one product and evaluate its forecast on the held-out period.

    Steps: filter ``data`` to ``product_name``, index it by 'REF_DATE' with a
    month-start frequency, add a ``log(VALUE + 1)`` column, split at ``split_date``,
    grid-search and fit a SARIMA model on the training part, pickle the fitted
    results to Drive, then forecast over the test period.

    Parameters:
    - data (DataFrame): The input time series data containing 'Products', 'REF_DATE' and 'VALUE'.
    - product_name (str): The specific product to analyze.
    - split_date (str): Rows dated before this go to training; rows on or after it go to testing.

    Returns:
    - tuple: ``(product_name, forecasts)`` where ``forecasts`` maps month labels to
      forecasted values and, when the forecast length matches the test set, also
      holds the keys 'mse' and 'mae'.

    Raises:
    - ValueError: If no rows for the product fall before ``split_date``.
    """

    # Filter data for the specific product and create a copy to avoid SettingWithCopyWarning
    product_data = data[data['Products'] == product_name].copy()

    # Convert REF_DATE to datetime
    product_data['REF_DATE'] = pd.to_datetime(product_data['REF_DATE'])

    # Set REF_DATE as the index and specify the frequency
    product_data.set_index('REF_DATE', inplace=True)
    product_data.index.freq = 'MS'  # Monthly Start frequency

    # Log transformation
    product_data['VALUE_log'] = np.log(product_data['VALUE'] + 1)  # +1 to handle 0 values

    # Split the data
    split_date = pd.to_datetime(split_date)
    train = product_data[product_data.index < split_date]
    test = product_data[product_data.index >= split_date]

    if train.empty:
        raise ValueError(
            f"No training rows for product {product_name!r} before {split_date.date()}"
        )

    # Fit SARIMA model
    model_results = fit_sarima_model(train, product_name, grid_search=True)

    # Save the model weights
    model_results.save(f'/content/drive/MyDrive/Intro to ML PR/Project/Weights/{product_name}_model.pkl')

    # Forecast exactly over the held-out period, anchored at the last training
    # month, instead of a fixed 14 steps from a fixed date. With no test rows
    # there is nothing to evaluate, so fall back to the previous 14-step horizon.
    horizon = len(test) if len(test) > 0 else 14
    forecasts = save_forecast_results(
        model_results,
        steps=horizon,
        last_date=train.index[-1].strftime('%Y-%m-%d'),
    )

    # Snapshot the forecast values before the metric keys are added to the dict
    predicted = list(forecasts.values())
    if len(test['VALUE']) == len(predicted):
        # Save MSE and MAE (on the original VALUE scale) to the forecasts dictionary
        forecasts['mse'] = mean_squared_error(test['VALUE'], predicted)
        forecasts['mae'] = mean_absolute_error(test['VALUE'], predicted)

    return product_name, forecasts


In [None]:
import statsmodels.api as sm
import itertools
import warnings
from tqdm import tqdm

def fit_sarima_model(data, product_name, order=(1,1,1), seasonal_order=(1,1,1,12), grid_search=False):
    """
    Fit a SARIMA (Seasonal AutoRegressive Integrated Moving Average) model to the
    'VALUE_log' column of the input data.

    Parameters:
    - data (DataFrame): The input time series data with a 'VALUE_log' column.
    - product_name (str): The name of the product; used only to label the progress bar.
    - order (tuple, optional): Non-seasonal order of the SARIMA model. Default is (1, 1, 1).
    - seasonal_order (tuple, optional): Seasonal order of the SARIMA model. Default is
      (1, 1, 1, 12) for monthly data. With grid_search=True only its last element
      (the seasonal period) is used to build the grid.
    - grid_search (bool, optional): If True, try every (p, d, q) x (P, D, Q) combination
      with each term in {0, 1, 2} and keep the lowest-AIC one. Default is False.

    Returns:
    - object: The results object of the final fit, run with Nelder-Mead and maxiter=500.
      If grid search finds no usable candidate, the supplied ``order`` and
      ``seasonal_order`` are fitted instead.
    """
    def _build(candidate_order, candidate_seasonal_order):
        # Single place that constructs the model so every fit uses the same settings
        return sm.tsa.statespace.SARIMAX(data['VALUE_log'],
                                         order=candidate_order,
                                         seasonal_order=candidate_seasonal_order,
                                         enforce_stationarity=False,
                                         enforce_invertibility=False)

    if grid_search:
        # Suppressing warnings for model convergence issues during grid search.
        # NOTE: this changes the process-wide warning filters and is not undone.
        warnings.filterwarnings("ignore")

        # Defining the range of parameters to test: each term takes values 0, 1, 2
        pdq = list(itertools.product(range(0, 3), repeat=3))

        # Seasonal component - reuse the seasonal period from seasonal_order (12 by default)
        season_length = seasonal_order[3]
        seasonal_pdq = [(p, d, q, season_length) for p, d, q in pdq]

        # Grid Search
        best_aic = float("inf")
        best_pdq = None
        best_seasonal_pdq = None

        for param in tqdm(pdq, desc=f'PDQ Range for {product_name}'):
            for seasonal_param in seasonal_pdq:
                try:
                    candidate = _build(param, seasonal_param).fit(method='nm', disp=False)
                    if candidate.aic < best_aic:
                        best_aic = candidate.aic
                        best_pdq = param
                        best_seasonal_pdq = seasonal_param
                except Exception:
                    # Best effort: skip combinations that cannot be fitted, but let
                    # KeyboardInterrupt/SystemExit through so the search can be stopped.
                    continue

        # Keep the caller-supplied orders when no candidate produced a usable AIC,
        # so the final fit below raises the real fitting error rather than failing on None.
        if best_pdq is not None:
            order, seasonal_order = best_pdq, best_seasonal_pdq

    # Final fit with a higher iteration cap; 'nm' stands for Nelder-Mead.
    # The keyword is `maxiter` here, matching the non-grid path of the original code.
    results = _build(order, seasonal_order).fit(maxiter=500, method='nm', disp=False)

    return results



In [None]:
import concurrent.futures

def model_and_forecast(data, split_date='2020-12-31'):
    """
    Run preprocessing, SARIMA fitting and forecasting for every unique product, using a thread pool.

    Parameters:
    - data (DataFrame): The input time series data; must contain a 'Products' column.
    - split_date (str, optional): Train/test split date passed to
      preprocess_and_fit_model for every product. Default is '2020-12-31'.

    Returns:
    - dict: Maps each product name to the ``(product_name, forecasts)`` tuple returned
      by preprocess_and_fit_model. Products whose task raised an exception are
      reported with print() and left out of the dictionary.
    """
    # Get unique product names from the 'Products' column
    unique_products = data['Products'].unique()

    # Initialize a dictionary to store all forecasts
    all_forecasts = {}

    # Use ThreadPoolExecutor for parallel execution of preprocess_and_fit_model function
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Create a dictionary to map future objects to their respective products
        future_to_product = {
            executor.submit(preprocess_and_fit_model, data, product, split_date): product
            for product in unique_products
        }

        # as_completed() is a generator with no length, so pass the total explicitly
        # to let the progress bar show how many products remain.
        completed = concurrent.futures.as_completed(future_to_product)
        for future in tqdm(completed, total=len(future_to_product), desc='Multi Threading'):
            product = future_to_product[future]
            try:
                # Retrieve the result of the future (product and its forecasts)
                product_forecasts = future.result()
            except Exception as exc:
                print(f'{product} generated an exception: {exc}')
            else:
                # Store the forecasts in the all_forecasts dictionary
                all_forecasts[product] = product_forecasts

    return all_forecasts

In [None]:
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from joblib import Parallel, delayed

# Load the input CSV from the mounted Drive. The functions in this notebook
# read its 'Products', 'REF_DATE' and 'VALUE' columns.
data = pd.read_csv('/content/drive/MyDrive/Intro to ML PR/Project/18100002-trimmed.csv')

# Fit and forecast every product. Each product runs a SARIMA grid search,
# so this call can take a long time. The result is a dict keyed by product name.
product_forecasts = model_and_forecast(data)