In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pmdarima.arima import ADFTest
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose 
import sys



In [2]:
plt.style.use('dark_background')

# load the dataset
df = pd.read_csv('../datasets/scoring/final_aggregation.csv')
df['date'] = pd.to_datetime(df['date'])

In [3]:
test_df = df[(df.date <= pd.to_datetime('2016-12-31')) & (df.date >= pd.to_datetime('2016-01-01'))]
train_df = df[(df.date <= pd.to_datetime('2019-01-01')) & (df.date > pd.to_datetime('2016-12-31'))]

In [4]:
test_df.fillna(0, inplace=True)
train_df.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.fillna(0, inplace=True)


In [5]:
test_df.set_index('date', inplace=True)
train_df.set_index('date', inplace=True)

In [6]:
test_df.columns

Index(['hour', 'area', 'non-violent', 'violent', 'cta_stations',
       'police_stations', 'bus_stations', 'unemployment', 'per_capita_income',
       'no_hs_dip', 'gov_depend', 'crowded_housing', 'below_pov',
       'bike_stations', 'train_rides', 'bike_rides', 'lighting',
       'vacant_buildings'],
      dtype='object')

In [8]:
def adf_test_large_data(df, chunk_size=1000, significance_level=0.05, constant_th=1e-6, constant_std_th=0.01):
    """
    Perform Augmented Dickey-Fuller (ADF) test on a large DataFrame by splitting it into chunks.

    Parameters:
    - df: DataFrame with a date index and a time series column.
    - chunk_size: Size of each chunk for testing. Adjust as needed for your available memory.
    - significance_level: The significance level for hypothesis testing.

    Returns:
    - List of dictionaries, each containing ADF test results for a chunk of the data.
    """
    final_results = []

    for col in df.columns:
        print(f'Starting ADF test for column "{col}"...')

        temp = df[[col]]

        results = []
        num_chunks = len(temp) // chunk_size + 1
        print(f'Number of chunks detected for column "{col}": {num_chunks}')

        for i in range(num_chunks):
            start_index = i * chunk_size
            end_index = (i + 1) * chunk_size
            chunk_df = temp.iloc[start_index:end_index]

            if len(chunk_df) > 0:
                # Check if the data in the chunk is constant
                if (chunk_df[col].std() > constant_th) and (chunk_df[col].std() > constant_std_th * df[col].std()):
                    # Perform the ADF test
                    adf_result = adfuller(chunk_df, autolag='AIC')

                    # Append results to the list
                    results.append({
                        'Chunk': i + 1,
                        'Start Date': chunk_df.index[0],
                        'End Date': chunk_df.index[-1],
                        'ADF Statistic': adf_result[0],
                        'P-Value': adf_result[1],
                        'Significance Level': significance_level,
                        'Stationary': adf_result[1] < significance_level,
                    })

                if i % round(num_chunks * 0.1) == 0:
                    print(f'Chunk #{i + 1} of column "{col}" completed.')

        final_results.append(results)

        print(f'ADF test for column "{col}" completed.')

    return final_results

In [16]:
adf, pvalue, usedlag_, nobs_, critical_values_, icbest_ = adfuller(test_df[['bike_rides']])
print("pvalue = ", pvalue, " if above 0.05, data is not stationary")

pvalue =  0.0  if above 0.05, data is not stationary


In [9]:
adf_test_large_data(test_df) #results at standard chunk size

Starting ADF test for column "hour"...
Number of chunks detected for column "hour": 657
Chunk #1 of column "hour" completed.
Chunk #67 of column "hour" completed.
Chunk #133 of column "hour" completed.
Chunk #199 of column "hour" completed.
Chunk #265 of column "hour" completed.
Chunk #331 of column "hour" completed.
Chunk #397 of column "hour" completed.
Chunk #463 of column "hour" completed.
Chunk #529 of column "hour" completed.
Chunk #595 of column "hour" completed.
ADF test for column "hour" completed.
Starting ADF test for column "area"...
Number of chunks detected for column "area": 657
Chunk #1 of column "area" completed.
Chunk #67 of column "area" completed.
Chunk #133 of column "area" completed.
Chunk #199 of column "area" completed.
Chunk #265 of column "area" completed.
Chunk #331 of column "area" completed.
Chunk #397 of column "area" completed.
Chunk #463 of column "area" completed.
Chunk #529 of column "area" completed.
Chunk #595 of column "area" completed.
ADF test for

[[{'Chunk': 1,
   'Start Date': Timestamp('2016-01-01 00:00:00'),
   'End Date': Timestamp('2016-01-01 00:00:00'),
   'ADF Statistic': -0.4548374970148843,
   'P-Value': 0.9005324551926577,
   'Significance Level': 0.05,
   'Stationary': False},
  {'Chunk': 2,
   'Start Date': Timestamp('2016-01-01 00:00:00'),
   'End Date': Timestamp('2016-01-02 00:00:00'),
   'ADF Statistic': -1.0986925353477988,
   'P-Value': 0.7155930252142889,
   'Significance Level': 0.05,
   'Stationary': False},
  {'Chunk': 3,
   'Start Date': Timestamp('2016-01-02 00:00:00'),
   'End Date': Timestamp('2016-01-02 00:00:00'),
   'ADF Statistic': -0.9331820676036043,
   'P-Value': 0.7768488193056804,
   'Significance Level': 0.05,
   'Stationary': False},
  {'Chunk': 4,
   'Start Date': Timestamp('2016-01-02 00:00:00'),
   'End Date': Timestamp('2016-01-03 00:00:00'),
   'ADF Statistic': -1.2441353639139339,
   'P-Value': 0.6543276565627265,
   'Significance Level': 0.05,
   'Stationary': False},
  {'Chunk': 5,
 

In [10]:
def find_ideal_chunk_size(df, col, max_memory_usage=1024):
    """
    Find the ideal chunk size for running the ADF test on a single column of a DataFrame.

    Parameters:
    - df: DataFrame with a single column.
    - col: Name of the column to test.
    - max_memory_usage: Maximum allowable memory usage in megabytes.

    Returns:
    - Ideal chunk size for the ADF test.
    """
    initial_chunk_size = 10000  # Starting chunk size
    current_chunk_size = initial_chunk_size
    last_memory_usage = sys.getsizeof(pd.DataFrame())

    print(f'Starting search for ideal chunk size for column "{col}"...')

    while True:
        # Increase the chunk size (increasing by a factor of 10)
        current_chunk_size *= 10

        # Create a chunk DataFrame using .loc to select rows by index
        chunk_df = df.loc[df.index[:current_chunk_size], [col]]

        # Perform the ADF test on the chunk
        try:
            adf_result = adfuller(chunk_df, autolag='AIC')
        except MemoryError:
            # If a MemoryError occurs, revert to the previous chunk size
            current_chunk_size //= 10
            print(f'Memory limit exceeded. Reducing chunk size to {current_chunk_size} rows...')
            continue

        # Calculate memory usage
        memory_usage = sys.getsizeof(chunk_df) / (1024 * 1024)  # Convert to megabytes

        print(f'Chunk size: {current_chunk_size} rows, Memory usage: {memory_usage:.2f} MB')

        # If memory usage exceeds the maximum allowable or increases substantially, break the loop
        if memory_usage > max_memory_usage or memory_usage > 2 * last_memory_usage:
            current_chunk_size //= 10
            print('Memory limit or usage increase exceeded. Reducing chunk size...')
            break

        # Update the last_memory_usage
        last_memory_usage = memory_usage

    print(f'Ideal chunk size for column "{col}": {current_chunk_size} rows')
    return current_chunk_size

In [11]:
def find_ideal_adf_parameters(df, col, max_memory_usage=1024):
    """
    Find the ideal ADF test parameters for running the test on a single column of a DataFrame.

    Parameters:
    - df: DataFrame with a single column.
    - col: Name of the column to test.
    - max_memory_usage: Maximum allowable memory usage in megabytes.

    Returns:
    - Tuple of ideal ADF test parameters: (significance_level, constant_th, constant_std_th).
    """
    best_significance_level = None
    best_constant_th = None
    best_constant_std_th = None

    best_chunk_size = None

    for significance_level in [0.01, 0.05, 0.1]:  # Test different significance levels
        for constant_th in [1e-6, 1e-5, 1e-4]:  # Test different absolute thresholds for constancy
            for constant_std_th in [0.01, 0.1, 1.0]:  # Test different relative thresholds for constancy
                # Find the ideal chunk size for these parameters
                chunk_size = find_ideal_chunk_size(df, col, max_memory_usage)

                # Check if these parameters result in a better chunk size
                if best_chunk_size is None or chunk_size > best_chunk_size:
                    best_chunk_size = chunk_size
                    best_significance_level = significance_level
                    best_constant_th = constant_th
                    best_constant_std_th = constant_std_th

    print(f'Ideal ADF Parameters for column "{col}": Significance Level={best_significance_level}, Constant Threshold={best_constant_th}, Constant Std Threshold={best_constant_std_th}')
    return best_significance_level, best_constant_th, best_constant_std_th

In [None]:
test_sl, test_th, test_std_th = find_ideal_adf_parameters(test_df, 'bike_rides')
test_adf_results = adf_test_large_data(test_df, test_sl, test_th, test_std_th)
test_adf_results

In [None]:
train_sl, train_th, train_std_th = find_ideal_adf_parameters(train_df, 'bike_rides')
train_adf_results = adf_test_large_data(train_df, train_sl, train_th, train_std_th)
train_adf_results