### Using the Dickey-Fuller Method to Verify Seasonality, Stationanarity, and Constancy of Our Datasets Before the Model

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose 



In [2]:
# load the dataset
df = pd.read_csv('../datasets/scoring/final_aggregation.csv', parse_dates=['date'])

In [3]:
test_df = df[(df.date <= pd.to_datetime('2016-12-31')) & (df.date >= pd.to_datetime('2016-01-01'))]
train_df = df[(df.date <= pd.to_datetime('2019-01-01')) & (df.date > pd.to_datetime('2016-12-31'))]

In [4]:
test_df.fillna(0, inplace=True)
train_df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.fillna(0, inplace=True)


In [5]:
test_df.set_index('date', inplace=True)
train_df.set_index('date', inplace=True)

In [6]:
def adf_test(df, significance_level=0.05):
    results = []
    successes = []
    fails = []
    max_chunk_size = len(df)
    for col in df.columns:
        chunk_df = df[[col]]
        
        print(f'Starting search for ideal chunk size for column "{col}"...')

        current_chunk_size = max_chunk_size

        while current_chunk_size > 0:
            # Create a chunk DataFrame using .loc to select rows by index
            chunk_df = df.loc[df.index[:current_chunk_size], [col]]

            # Perform the ADF test on the chunk
            try:
                adf_result = adfuller(chunk_df, autolag='AIC')
            except MemoryError:
                # If a MemoryError occurs, decrease the chunk size by 10%
                current_chunk_size = max(current_chunk_size - max_chunk_size // 10, 0)
                print(f'Memory limit exceeded. Reducing chunk size to {current_chunk_size} rows...')
                continue

        print(f'Ideal chunk size for column "{col}": {current_chunk_size} rows')

        results.append({
                        'Current Chunk Size': current_chunk_size,
                        'Start Date': chunk_df.index[0],
                        'End Date': chunk_df.index[-1],
                        'ADF Statistic': adf_result[0],
                        'P-Value': adf_result[1],
                        'Significance Level': significance_level,
                        'Stationary': adf_result[1] < significance_level,
                    })
        
        #check results to verify whether or not the data is stationary
        result = F'{col} column scored a P-value of {adf_result[1]} at a significance level of {significance_level}'
        if adf_result[1] <= significance_level:
            successes.append(result)
        else:
            fails.append(result)
        
        print(f'ADF test for column "{col}" completed.')
    
    results_df = pd.DataFrame(results)

    return results_df, successes, fails

In [7]:
#adf, pvalue, usedlag_, nobs_, critical_values_, icbest_ = adfuller(test_df[['bike_rides']])
#print("pvalue = ", pvalue, " if above 0.05, data is not stationary")

In [8]:
test_adf_results, test_adf_successes, test_adf_fails = adf_test(test_df)
print(f'Successes: {len(test_adf_successes)}, Fails: {len(test_adf_fails)}')
test_adf_results

Starting search for ideal chunk size for column "hour"...


In [None]:
train_adf_results, train_adf_successes, train_adf_fails = adf_test(train_df)
print(f'Successes: {len(train_adf_successes)}, Fails: {len(train_adf_fails)}')
train_adf_results

In [None]:
plt.style.use('dark_background')

def plot_seasonality(df):

    for col in df.columns:

        decomposed = seasonal_decompose(df[col],  
                                    model ='additive')

        trend = decomposed.trend
        seasonal = decomposed.seasonal
        residual = decomposed.resid

        plt.figure(figsize=(12,8))
        plt.subplot(411)
        plt.plot(df, label='Original', color='yellow')
        plt.legend(loc='upper left')
        plt.subplot(412)
        plt.plot(trend, label='Trend', color='yellow')
        plt.legend(loc='upper left')
        plt.subplot(413)
        plt.plot(seasonal, label='Seasonal', color='yellow')
        plt.legend(loc='upper left')
        plt.subplot(414)
        plt.plot(residual, label='Residual', color='yellow')
        plt.legend(loc='upper left')
        plt.show()

In [None]:
plot_seasonality(test_df)

In [None]:
plot_seasonality(train_df)