In [2]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from statsmodels.tsa.stattools import adfuller
from sklearn.preprocessing import MinMaxScaler
from statistics import median
import numpy as np
from collections import OrderedDict

In [3]:
def read_in_tables(n=78):
    df_list = {}
    for i in range(1, n):

        table1_name = 'test_df' + str(i)
        table2_name = 'train_df' + str(i)

        df = pd.read_csv(f'../datasets/ml_{i}.csv', parse_dates=['date'])

        globals()[table1_name] = df[(df.date <= pd.to_datetime('2019-12-31')) & (df.date >= pd.to_datetime('2019-01-01'))]
        globals()[table2_name] = df[(df.date < pd.to_datetime('2019-01-01')) & (df.date > pd.to_datetime('2016-01-01'))]

        globals()[table1_name].fillna(0, inplace=True)
        globals()[table2_name].fillna(0, inplace=True)

        globals()[table1_name].set_index('date', inplace=True)
        globals()[table2_name].set_index('date', inplace=True)

        df_list[table1_name] = globals()[table1_name]
        df_list[table2_name] = globals()[table2_name]

    return df_list

In [4]:
tables = read_in_tables()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table1_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table2_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table1_name].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  globals()[table2_name].fill

In [5]:
# tests stationarity, returns a list of values corresponding to each column
def dickey_fuller_test(df, target_cols):

    output = {}

    # Loop through columns
    for col in target_cols:

        # Print progress  
        print(f"{datetime.now()}: Testing {col} for stationarity...")

        try:
            # Perform ADF test
            adf_result = adfuller(df[col])
            
        except Exception as e:
            # Handle errors
            print(f"Error testing {col}. Skipping.")
            print(e)
            continue
        
        output[col] = adf_result[1]

    return output

In [6]:
test_tables = {list(tables.keys())[j] : list(tables.values())[j] for j in range(0, len(tables), 2)}
train_tables = {list(tables.keys())[j] : list(tables.values())[j] for j in range(0, len(tables), 2)}

In [7]:
def create_adf_by_col(dfs, cols):
  adf_by_col = {}
  # Gather ADF P-Value Scores for each dataset
  for col in cols:
      adf_by_col[col] = {}
  
  for table_name, table in dfs.items():
      for col_name, col_result in dickey_fuller_test(table, cols).items():
          adf_by_col[col_name][table_name] = col_result
      
  return adf_by_col

In [8]:
test_adf_by_col = create_adf_by_col(test_tables, ['violent', 'non-violent'])
train_adf_by_col = create_adf_by_col(train_tables, ['violent', 'non-violent'])

2023-11-01 17:34:26.287865: Testing violent for stationarity...
2023-11-01 17:34:27.004957: Testing non-violent for stationarity...
2023-11-01 17:34:27.778359: Testing violent for stationarity...
2023-11-01 17:34:28.563023: Testing non-violent for stationarity...
2023-11-01 17:34:29.546662: Testing violent for stationarity...
2023-11-01 17:34:30.480556: Testing non-violent for stationarity...
2023-11-01 17:34:31.484175: Testing violent for stationarity...
2023-11-01 17:34:32.266071: Testing non-violent for stationarity...
2023-11-01 17:34:32.982064: Testing violent for stationarity...
2023-11-01 17:34:33.744635: Testing non-violent for stationarity...
2023-11-01 17:34:34.649741: Testing violent for stationarity...
2023-11-01 17:34:35.667202: Testing non-violent for stationarity...
2023-11-01 17:34:36.434575: Testing violent for stationarity...
2023-11-01 17:34:37.286492: Testing non-violent for stationarity...
2023-11-01 17:34:37.852216: Testing violent for stationarity...
2023-11-01 1

In [9]:
def representative_sample(dfs, adf):

    representative_samples = {}

    for col_name in adf:
        representative_samples[col_name] = {}
        representative_samples[col_name]['min'] = dfs[max(adf[col_name], key=adf[col_name].get)]
        representative_samples[col_name]['max'] = dfs[min(adf[col_name], key=adf[col_name].get)]
        
        for table_name, value in adf[col_name].items():
            if 'avg' not in representative_samples[col_name].keys():
                representative_samples[col_name]['avg'] = dfs[table_name]
            else:
                representative_samples[col_name]['avg'].add(dfs[table_name])

        representative_samples[col_name]['avg'].div(len(adf[col_name]))            
    
    return representative_samples

representative_sample(train_tables, train_adf_by_col)

{'violent': {'min':                      non-violent  violent  train_rides  bike_rides  lighting  \
  date                                                                           
  2019-01-01 00:00:00          6.0      3.0          0.0         0.0       0.0   
  2019-01-01 01:00:00          3.0      1.0          0.0         0.0       0.0   
  2019-01-01 02:00:00          1.0      0.0          0.0         0.0       0.0   
  2019-01-01 03:00:00          1.0      3.0          0.0         0.0       0.0   
  2019-01-01 04:00:00          1.0      1.0          0.0         0.0       0.0   
  ...                          ...      ...          ...         ...       ...   
  2019-12-30 19:00:00          0.0      3.0          0.0         0.0       0.0   
  2019-12-30 20:00:00          1.0      0.0          0.0         0.0       0.0   
  2019-12-30 21:00:00          0.0      0.0          0.0         0.0       0.0   
  2019-12-30 22:00:00          0.0      0.0          0.0         0.0       0.0  

In [12]:
def write_samples(samples, path, label):
  for col_name, tables in samples.items():
    for table_name, table in tables.items():
      table.to_csv(f"{path}{label}_{col_name}_{table_name}.csv", index=False)

In [13]:
write_samples(representative_sample(train_tables, train_adf_by_col), "../datasets/representative_samples/", "train")
write_samples(representative_sample(test_tables, test_adf_by_col), "../datasets/representative_samples/", "test")

In [11]:
dfs = []

def visualize_seasonality(dfs):
    for df in dfs:
        intervals = [365 * 3, 52 * 3, 12 * 3]  # Number of intervals to split the DataFrame
        fig, axes = plt.subplots(len(intervals), 1, figsize=(30, 15))
        plt.title(f'Table {j}') 
        plt.subplots_adjust(hspace=0.4)

        for count, interval in enumerate(intervals):
            # Initialize an empty DataFrame for averaging
            rows_per_interval = len(df) // interval
            average_df = df.iloc[:rows_per_interval]

            # Create a list of DataFrames, each corresponding to an interval
            interval_dfs = [df.iloc[i * rows_per_interval:(i + 1) * rows_per_interval] for i in range(1, interval)]
            
            for interval_df in interval_dfs:
                for row in range(len(interval_df['non-violent'])):
                    average_df['non-violent'].iloc[row] += interval_df['non-violent'].iloc[row]
                    average_df['violent'].iloc[row] += interval_df['violent'].iloc[row]

            # Divide by the number of intervals to get the average
            average_df['non-violent'] /= len(interval_dfs)
            average_df['violent'] /= len(interval_dfs)

            # Create a plot for the averaged data in the current subplot
            ax = axes[count]
            ax.plot(average_df.index, average_df['non-violent'], label='Non-Violent')
            ax.plot(average_df.index, average_df['violent'], label='Violent')
            ax.legend()
            ax.grid(True)

        # Set a common x-label for all subplots
        axes[-1].set_xlabel('Date')

        # Show the entire plot
        plt.show()