### Using the Dickey-Fuller Method to Verify Seasonality, Stationanarity, and Constancy of Our Datasets Before the Model

In [1]:
from datetime import datetime
import pandas as pd
import dask.dataframe as dd
from statsmodels.tsa.stattools import adfuller
import pmdarima as pm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX



In [2]:
# load the dataset
df = pd.read_csv('../datasets/scoring/final_aggregation.csv', parse_dates=['date'])

In [3]:
test_df = df[(df.date <= pd.to_datetime('2016-12-31')) & (df.date >= pd.to_datetime('2016-01-01'))]
train_df = df[(df.date <= pd.to_datetime('2019-01-01')) & (df.date > pd.to_datetime('2016-12-31'))]

In [4]:
test_df.fillna(0, inplace=True)
train_df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.fillna(0, inplace=True)


In [5]:
test_df.set_index('date', inplace=True)
train_df.set_index('date', inplace=True)

In [6]:
test_df.to_csv('../datasets/training/SARIMA/test_df.csv', index=False)
train_df.to_csv('../datasets/training/SARIMA/train_df.csv', index=False)

In [7]:
def dickey_fuller(df, col, significance_level=0.05):
    """
    Check the stationarity of a column using the Dickey-Fuller test.

    Parameters:
    - df: DataFrame containing the time series data.
    - col: Name of the column to test.
    - significance_level: The significance level for hypothesis testing.

    Returns:
    - values: A dictionary containing ADF test results.
    """
    chunk_df = df[[col]]
    print(f'{datetime.now()}: Starting search for the ideal chunk size for column "{col}"...')

    try:
        adf_result = adfuller(chunk_df, autolag='AIC')
        print(f'{datetime.now()}: Dickey-Fuller test was successful')
        
    except MemoryError:
        print(f'{datetime.now()}: Memory limit exceeded for column "{col}".')
        return None

    # Store ADF test results for the column
    values = {
        'Column Name': col,
        'Start Date': chunk_df.index[0],
        'End Date': chunk_df.index[-1],
        'ADF Statistic': adf_result[0],
        'P-Value': adf_result[1],
        'Significance Level': significance_level
    }

    return values

In [8]:
def auto_arima(df, col, stationary_state, seasonal_period=24):
    """
    Determine SARIMA model parameters using auto_arima.

    Parameters:
    - df: DataFrame containing the time series data.
    - col: Name of the column to analyze.
    - seasonal_period: Seasonal period for SARIMA model.

    Returns:
    - p, d, q, P, D, Q: Model parameters.
    """
    if stationary_state:
        stepwise_fit = pm.auto_arima(df[col], seasonal=True, m=seasonal_period,
                                     suppress_warnings=True,
                                     stepwise=True,
                                     error_action="ignore",
                                     trace=True, n_fits=50)

        print(f'{datetime.now()}: Seasonal Auto-ARIMA function was successful for column "{col}"')
    else:
        stepwise_fit = pm.auto_arima(df[col], seasonal=False,
                                suppress_warnings=True,
                                stepwise=True,
                                error_action="ignore",
                                trace=True, n_fits=50)
        print(f'{datetime.now()}: Non-Seasonal Auto-ARIMA function was successful for column "{col}": {str(e)}')

    params = stepwise_fit.get_params()
    p, d, q, P, D, Q = params.get('order', (0, 1, 1)), params.get('seasonal_order', (0, 1, 1, 24))

    print(f'{datetime.now()}: Auto-ARIMA parameters successfully gathered for column "{col}"')

    return p, d, q, P, D, Q

In [9]:
def ARIMA_model(df, col, p, d, q, P, D, Q, stationary_state, m=24):
    """
    Fit a SARIMA or ARIMA model based on the stationarity state.

    Parameters:
    - df: DataFrame containing the time series data.
    - col: Name of the column to model.
    - p, d, q: Parameters for the ARIMA component.
    - P, D, Q: Parameters for the seasonal component (SARIMA).
    - seasonal_state: True if data is stationary, False otherwise.
    - m: Seasonal period for SARIMA model.

    Returns:
    - results: Fitted SARIMA or ARIMA model.
    """
    if stationary_state:
        print(f'{datetime.now()}: Data for column "{col}" is stationary. Fitting SARIMA model.')
        model = SARIMAX(df[col],
                        order=(p, d, q),
                        seasonal_order=(P, D, Q, m))
    else:
        print(f'{datetime.now()}: Data for column "{col}" is not stationary. Fitting ARIMA model instead.')
        model = ARIMA(df[col], order=(p, d, q))

    results = model.fit()
    
    return results


In [12]:
def process_column(df, col, significance_level=0.05):
    """
    Process a specific column in the DataFrame.

    Parameters:
    - df: DataFrame containing the time series data.
    - col: Name of the column to process.
    - significance_level: The significance level for hypothesis testing.

    Returns:
    - results: Fitted SARIMA or ARIMA model for the column.
    """
    values = dickey_fuller(df, col, significance_level)
    if values is not None:
        seasonality_state = (values['P-Value'] < values['Significance Level'])
        p, d, q, P, D, Q = auto_arima(df[col], seasonality_state)
        results = ARIMA_model(df[col], p, d, q, P, D, Q, seasonality_state)
        return results

In [None]:
def main(df, exempt_col):
    """
    Main function to process columns in the DataFrame using Dask.

    Parameters:
    - df: DataFrame containing the time series data.
    - exempt_col: List of columns to exempt from processing.

    Returns:
    - final_results: List of fitted SARIMA or ARIMA models for selected columns.
    """
    df_dask = dd.from_pandas(df, npartitions=8)  # Adjust the number of partitions as needed
    final_results = df_dask.map_partitions(
        lambda partition: [process_column(partition, col) for col in partition.columns if col not in exempt_col],
        meta=pd.Series(dtype=object)
    ).compute(scheduler="processes")

    return final_results

In [13]:
test_results = main(test_df, ['hour', 'area'])
test_results

2023-10-24 01:28:18.343633: Initiating on "non-violent" column
2023-10-24 01:28:18.344612: Starting search for the ideal chunk size for column "non-violent"...
2023-10-24 01:32:17.615237: Dickey-Fuller test was successful
Performing stepwise search to minimize aic


In [None]:
train_results = main(train_df, ['hour', 'area'])
train_results