In [11]:
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [2]:
def read_in_tables(n=78):
    df_list = {}
    for i in range(1, n):

        table1_name = 'test_df' + str(i)
        table2_name = 'train_df' + str(i)

        df = pd.read_csv(f'../datasets/training/SARIMA/by_area/ml_{i}.csv', parse_dates=['date'])

        globals()[table1_name] = df[(df.date <= pd.to_datetime('2019-12-31')) & (df.date >= pd.to_datetime('2019-01-01'))]
        globals()[table2_name] = df[(df.date < pd.to_datetime('2019-01-01')) & (df.date > pd.to_datetime('2016-01-01'))]

        globals()[table1_name].fillna(0, inplace=True)
        globals()[table2_name].fillna(0, inplace=True)

        globals()[table1_name].set_index('date', inplace=True)
        globals()[table2_name].set_index('date', inplace=True)

        df_list[table1_name] = globals()[table1_name]
        df_list[table2_name] = globals()[table2_name]

    return df_list

In [3]:
tables = read_in_tables()
len(tables)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table1_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table2_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table1_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table2_name].fill

154

In [4]:
train_df1

Unnamed: 0_level_0,non-violent,violent,train_rides,bike_rides,lighting,vacant_buildings
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-01 01:00:00,1.0,1.0,8811.0,0.0,43.0,76.0
2016-01-01 02:00:00,0.0,0.0,8811.0,1.0,43.0,76.0
2016-01-01 03:00:00,0.0,2.0,8811.0,0.0,43.0,76.0
2016-01-01 04:00:00,0.0,0.0,8811.0,0.0,43.0,76.0
2016-01-01 05:00:00,0.0,0.0,8811.0,0.0,43.0,76.0
...,...,...,...,...,...,...
2018-12-31 19:00:00,1.0,0.0,10661.0,2.0,0.0,92.0
2018-12-31 20:00:00,0.0,0.0,10661.0,0.0,0.0,92.0
2018-12-31 21:00:00,0.0,0.0,10661.0,2.0,0.0,92.0
2018-12-31 22:00:00,0.0,0.0,10661.0,0.0,0.0,92.0


In [9]:
def dickey_fuller_test(df, target_cols):

    output = []

    # Loop through columns
    for col in target_cols:

        # Print progress  
        print(f"{datetime.now()}: Testing {col} for stationarity...")

        try:
            # Perform ADF test
            adf_result = adfuller(df[col])
            
        except:
            # Handle errors
            print(f"Error testing {col}. Skipping.")
            continue
        
        output.append(adf_result[1])

    return output

In [13]:
def representative_sample(dfs, cols, n_categories):

    # Gather ADF P-Value Scores for each dataset
    nv_adf = {}
    v_adf = {}
    for i in range(len(dfs)):
        results = dickey_fuller_test(dfs[i], cols)
        nv_adf[results[0]] = dfs[i]
        v_adf[results[1]] = dfs[i]

    # Sort dictionaries by values and convert to lists of tuples
    sorted_nv_adf = sorted(nv_adf.items(), key=lambda item: item[1])
    sorted_v_adf = sorted(v_adf.items(), key=lambda item: item[1])

    # Extract ADF scores for 'non-violent' and 'violent'
    nv_scores = [score for _, score in sorted_nv_adf]
    v_scores = [score for _, score in sorted_v_adf]

    # Initialize Min-Max scalers
    nv_scaler = MinMaxScaler()
    v_scaler = MinMaxScaler()

    # Fit and transform ADF scores using the scalers
    normalized_nv_scores = nv_scaler.fit_transform(np.array(nv_scores).reshape(-1, 1))
    normalized_v_scores = v_scaler.fit_transform(np.array(v_scores).reshape(-1, 1))

    # Determine the normalized range for 'non-violent' and 'violent'
    nv_min_score = min(normalized_nv_scores)
    nv_max_score = max(normalized_nv_scores)
    v_min_score = min(normalized_v_scores)
    v_max_score = max(normalized_v_scores)

    nv_range = nv_max_score - nv_min_score
    v_range = v_max_score - v_min_score

    # Sort and split scores into n categories and create a representative dataset
    nv_length = nv_range // n_categories
    v_length = v_range // n_categories

    outputs = {}

    for n in range(1, n_categories + 1):
        # Deteremine which dataframes fit within the current range
        df_list = []
        for table, result in sorted_nv_adf:
            if (result < nv_length * n) & (result > nv_length * (n-1)):
                df_list.append(table)

        # Aggregate the dataframes together
        final = df_list[0]
        for df in df_list:
            final.add(df)
        final.div(len(df_list))

        outputs[f'nv_df_{n}'] = final

        # Deteremine which dataframes fit within the current range
        df_list = []
        for table, result in sorted_nv_adf:
            if (result < v_length * n) & (result > v_length * (n-1)):
                df_list.append(table)

        # Aggregate the dataframes together
        final = df_list[0]
        for df in df_list:
            final.add(df)
        final.div(len(df_list))

        outputs[f'v_df_{n}'] = final

    return outputs    

In [6]:
dfs = []

def visualize_seasonality(dfs):
    for df in dfs:
        intervals = [365 * 3, 52 * 3, 12 * 3]  # Number of intervals to split the DataFrame
        fig, axes = plt.subplots(len(intervals), 1, figsize=(30, 15))
        plt.title(f'Table {j}') 
        plt.subplots_adjust(hspace=0.4)

        for count, interval in enumerate(intervals):
            # Initialize an empty DataFrame for averaging
            rows_per_interval = len(df) // interval
            average_df = df.iloc[:rows_per_interval]

            # Create a list of DataFrames, each corresponding to an interval
            interval_dfs = [df.iloc[i * rows_per_interval:(i + 1) * rows_per_interval] for i in range(1, interval)]
            
            for interval_df in interval_dfs:
                for row in range(len(interval_df['non-violent'])):
                    average_df['non-violent'].iloc[row] += interval_df['non-violent'].iloc[row]
                    average_df['violent'].iloc[row] += interval_df['violent'].iloc[row]

            # Divide by the number of intervals to get the average
            average_df['non-violent'] /= len(interval_dfs)
            average_df['violent'] /= len(interval_dfs)

            # Create a plot for the averaged data in the current subplot
            ax = axes[count]
            ax.plot(average_df.index, average_df['non-violent'], label='Non-Violent')
            ax.plot(average_df.index, average_df['violent'], label='Violent')
            ax.legend()
            ax.grid(True)

        # Set a common x-label for all subplots
        axes[-1].set_xlabel('Date')

        # Show the entire plot
        plt.show()