In [1]:
# Connect to server
#import pyodbc
#from dotenv import dotenv_values

# Datetime
from datetime import datetime

# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import seaborn as sns
import missingno as msno

# Decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Statistical Analysis
import scipy.stats as stats
from statsmodels.stats.weightstats import ttest_ind
import statsmodels.api as sm
from pmdarima.arima import CHTest, nsdiffs
from pmdarima.arima import auto_arima
#from arch.unitroot import ADF, KPSS
from statsmodels.stats.diagnostic import acorr_ljungbox
#import phik
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller

# Machine Learning Modeling
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
#import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.pipeline import Pipeline

import os

import random

import warnings
import time

# ignore warnings
warnings.filterwarnings('ignore')

from pathlib import Path, PureWindowsPath

In [2]:
#! pip install missingno
#! pip install pmdarima

### Import data

In [3]:
path_cwd = Path(PureWindowsPath(os.path.dirname(os.getcwd())))
path_cwd
path = path_cwd / 'data/processed/'

In [None]:
def create_dummy_fam_cluster(df):
    #df2 = df.drop(columns=['store_nbr','family','id','date'])
    df2  = pd.get_dummies(df, columns=['familycluster','cluster']) 
    return df2

In [4]:
train_data = pd.read_pickle(path / 'df_train.pkl')
val_data = pd.read_pickle(path / 'df_val.pkl')
test_data = pd.read_pickle(path / 'df_test.pkl')

In [125]:
train_data=train_data[['date','cluster','familycluster','sales']]
val_data=val_data[['date','cluster','familycluster','sales']]
test_data=test_data[['date','cluster','familycluster','sales']]

In [131]:
train_data = create_dummy_fam_cluster(train_data)
val_data = create_dummy_fam_cluster(val_data)
test_data = create_dummy_fam_cluster(test_data)

In [154]:
train_data.iloc[:,2:] = train_data.iloc[:,2:].astype(int)
val_data.iloc[:,2:] = val_data.iloc[:,2:].astype(int)
test_data.iloc[:,2:] = test_data.iloc[:,2:].astype(int)

## Modeling ##

In [8]:
# Updated plot_predictions to work with dataframes where the date is not already aggregated
# This simply aggregates the dates inside the function
def plot_predictions(date, y_test, y_pred, forecast_label, title):
    """
    Plot the actual and predicted time series data.

    Parameters:
    date (array-like): Date or time index.
    y_test (array-like): Actual values.
    y_pred (array-like): Predicted values.
    forecast_label (str): Label for the forecasted data.
    title (str): Title for the plot.
    """
    # Combine the data into a DataFrame
    data = pd.DataFrame({'Date': date, 'Actual': y_test, 'Predicted': y_pred})
    
    # Aggregate the data by date, taking the mean of the values for each day
    data = data.groupby('Date').mean().reset_index()
    
    # Set the custom color palette
    custom_palette = sns.color_palette("husl", 2)
    sns.set_palette(custom_palette)
    
    # Create a figure with specified dimensions
    plt.figure(figsize=(10, 6))

    # Plot the actual data in green
    sns.lineplot(data=data, x='Date', y='Actual', label='Actual', color=custom_palette[0])

    # Plot the predicted data in blue with the specified label
    sns.lineplot(data=data, x='Date', y='Predicted', label=forecast_label, color=custom_palette[1])

    # Add a legend to the plot
    plt.legend()

    # Set the title of the plot
    plt.title(title)

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45)

    # Display the plot
    plt.show()

In [9]:
# Define a function to compute the evaluations metrics after the forecast
def evaluate_forecast(y_test, forecast):
    """
    Compute MSE, RMSE, and RMSLE for a forecast.

    Parameters:
    y_test (array-like): Actual values.
    forecast (array-like): Predicted values.

    Returns:
    dict: Dictionary containing MSE, RMSE, and RMSLE.
    """
    def rmsle(predicted, actual):
        return np.sqrt(np.mean(np.square(np.log1p(predicted) - np.log1p(actual))))

    # Compute Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, forecast)
    
    # Compute Root Mean Squared Error (RMSE)
    rmse = np.sqrt(mse)
    
    # Compute Root Mean Squared Logarithmic Error (RMSLE)
    rmsle_value = rmsle(forecast, y_test)
    
    # Return the evaluation metrics as a dictionary
    metrics = {
        'MSE': mse,
        'RMSE': rmse,
        'RMSLE': rmsle_value
    }
    
    return metrics

## SARIMA ##

In [155]:
train_data.columns

Index(['date', 'sales', 'familycluster_0', 'familycluster_1',
       'familycluster_2', 'familycluster_3', 'familycluster_4',
       'familycluster_5', 'familycluster_6', 'familycluster_7',
       'familycluster_8', 'familycluster_9', 'familycluster_10',
       'familycluster_11', 'familycluster_12', 'familycluster_13',
       'familycluster_14', 'familycluster_15', 'familycluster_16',
       'familycluster_17', 'familycluster_18', 'familycluster_19',
       'familycluster_20', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4',
       'cluster_5', 'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9',
       'cluster_10', 'cluster_11', 'cluster_12', 'cluster_13', 'cluster_14',
       'cluster_15', 'cluster_16', 'cluster_17'],
      dtype='object')

In [156]:
train_data['date'].max()

Timestamp('2016-04-26 00:00:00')

In [157]:
#shorten train for testing
train_data_sarima = train_data.set_index('date')
#shorten the train to help with training time
train_data_sarima = train_data_sarima[train_data_sarima.index > '2015-01-01']

In [158]:
#train_data_sarima = train_data_s[train_data_s['date']=='2015-09-01']
#train_data_sarima = train_data_s.set_index('date')
#val_data_sarima = val_data_s.set_index('date')

In [159]:
train_data_sarima_exog = train_data_sarima.drop(columns=['sales'])
y_train_sarima_endog = train_data_sarima['sales']

#val_data_sarima_exog = val_data_sarima[['familycluster','cluster']]
#y_val_sarima_endog = val_data_sarima['sales']

In [160]:
train_data_sarima_exog

Unnamed: 0_level_0,familycluster_0,familycluster_1,familycluster_2,familycluster_3,familycluster_4,familycluster_5,familycluster_6,familycluster_7,familycluster_8,familycluster_9,...,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2015-01-02,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2015-01-02,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2015-01-02,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2015-01-02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-04-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-26,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-26,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2016-04-26,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [161]:
val_data_sarima = val_data.set_index('date')

val_data_sarima_exog = val_data_sarima.drop(columns=['sales'])
y_val_sarima =  val_data_sarima[['sales']]

In [162]:
test_data_sarima = test_data.set_index('date')

test_data_sarima_exog = test_data_sarima.drop(columns=['sales'])
y_test_sarima =  test_data_sarima[['sales']]

# Group by date to sum sales. USE IF CANT GET ANYTHING RUNNING

ts = train_data_sarima.groupby(train_data_sarima['date'])['sales'].sum()
#ts = train_data_sarima.groupby(train_data_sarima.index)['sales'].sum()
fig, (ax1,ax2) = plt.subplots(2, 1, figsize=(12,6))

# Plot ACF
sm.graphics.tsa.plot_acf(ts, lags=40, ax=ax1)
ax1.set_title('ACF')

# Plot PACF
sm.graphics.tsa.plot_pacf(ts, lags=40, ax=ax2)
ax2.set_title('PACF')

plt.tight_layout(pad=2.0)

plt.show()

# auto_arima.  runs into memory error when trying to run. will have to manually find hyperparams
start_time = time.time()
sarima_model = auto_arima(train_data_sarima['sales']
                          #, exogenous=train_data_s[['familycluster', 'onpromotion','holiday']]
                          #, exogenous=train_data_s[['familycluster']]
                          , start_p=0 , max_p=3, start_P=0 , max_P=3 
                          , d=None , D=1 #not sure this is a good value for d
                          , start_q=0 , max_q= 3 , start_Q=0 , max_Q=3
                          , random=True, n_fits=2
                          , seasonal=True, m=12, random_state=21, trace=True )

end_time = time.time()
elapsed_time = end_time - start_time

too slow to get results and do not need for model
def ad_fuller(timeseries):
    print ('Dickey-Fuller Test indicates:')
    df_test = adfuller(timeseries, regression='ct', autolag='AIC')
    output = pd.Series(df_test[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    print(output)

print(ad_fuller(train_data_s['sales']))

In [163]:
# Define the instance
model_sarima = sm.tsa.SARIMAX(endog=y_train_sarima_endog, exog=train_data_sarima_exog, order=(1,1,1), seasonal_order=(1,1,1,7))



In [None]:
# Fit the model. started at 11:48
results_sarima = model_sarima.fit()



In [None]:
#something is off with validation testing
start_date_val = val_data['date'].min() 
end_date_val = val_data['date'].max() 
start_date_test = test_data['date'].min()
end_date_test = test_data['date'].max() 
 

In [None]:
print(start_date_val, end_date_val)

In [118]:
# Make predictions
#forecast_sarima = results_sarima.predict(start=len(train_data_sarima), end=len(train_data_sarima) + len(val_data_sarima) - 1, dynamic=False)

#val_forecast_sarima = results_sarima.predict(steps=pd.date_range(start=start_date_val, end=end_date_val), exog=val_data_sarima_exog)
val_forecast_sarima = results_sarima.predict(start=len(train_data_sarima), end=len(train_data_sarima) + len(val_data_sarima) - 1, exog=val_data_sarima_exog)


KeyError: 'The `end` argument could not be matched to a location related to the index of the data.'

In [117]:
valid_forecast_sarima= val_forecast_sarima.to_frame(name='sales')
#valid_forecast_sarima = valid_forecast_sarima.loc[start_date_val:end_date_val]
valid_forecast_sarima

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2015-01-02,-427.184322
2015-01-02,377.072545
2015-01-02,1160.760137
2015-01-02,583.073495
2015-01-02,-219.477875
...,...
2016-04-26,226.794567
2016-04-26,517.012216
2016-04-26,587.941412
2016-04-26,830.610224


In [111]:
valid_forecast_sarima

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2016-04-26,-346.110242
2016-04-26,628.454527
2016-04-26,-212.666224
2016-04-26,306.168096
2016-04-26,-174.272075
...,...
2016-04-26,226.794567
2016-04-26,517.012216
2016-04-26,587.941412
2016-04-26,830.610224


In [98]:
val_data_sarima_exog

Unnamed: 0_level_0,cluster,familycluster
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-04-26,9,16
2016-04-26,6,19
2016-04-26,9,12
2016-04-26,9,15
2016-04-26,9,2
...,...,...
2016-10-03,5,8
2016-10-03,5,1
2016-10-03,5,12
2016-10-03,5,15


In [82]:
y_val_sarima

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2016-04-26,7.000
2016-04-26,21.000
2016-04-26,14.000
2016-04-26,251.078
2016-04-26,180.000
...,...
2016-10-03,19.000
2016-10-03,214.000
2016-10-03,13.000
2016-10-03,786.588


In [106]:
valid_forecast_sarima

date
2016-04-26   -346.110242
2016-04-26    628.454527
2016-04-26   -212.666224
2016-04-26    306.168096
2016-04-26   -174.272075
                 ...    
2016-04-26    226.794567
2016-04-26    517.012216
2016-04-26    587.941412
2016-04-26    830.610224
2016-04-26    350.592032
Name: predicted_mean, Length: 900, dtype: float64

In [105]:
sarima_metrics = evaluate_forecast(y_val_sarima, valid_forecast_sarima)

sarima_metrics

ValueError: Found input variables with inconsistent numbers of samples: [276569, 900]