In [1]:
from dtaidistance import dtw
from statsmodels.tsa.stattools import grangercausalitytests
from sklearn.cross_decomposition import CCA
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np
import glob
import os


In [2]:
# Compute DTW distance
def dtw_distance(df):
    """
    Compute the DTW distance between all pairs of time series in the dataframe.

    Every column of ``df`` is treated as one time series. The DTW distance is
    symmetric and the distance of a series to itself is 0, so each unordered
    pair of columns is evaluated only once and mirrored into the result.

    Args:
        df (pd.DataFrame): DataFrame containing one time series per column.

    Returns:
        np.ndarray: Square array of shape (n_columns, n_columns). Entry
        ``[j, l]`` is the DTW distance between column ``j`` and column ``l``
        (in the order of ``df.columns``); the diagonal is 0.
    """
    # extract the raw values of every column once
    series = [df[col].values for col in df.columns]

    num_series = len(series)
    results = np.zeros((num_series, num_series))
    for j in range(num_series):
        # only the upper triangle is computed; the lower one is its mirror
        # and the diagonal keeps the 0 it was initialised with
        for l in range(j + 1, num_series):
            distance = dtw.distance(series[j], series[l])
            results[j, l] = distance
            results[l, j] = distance

    return results

# Compute DTW distance BUT z-NORM FIRST
def dtw_distance_norm(df):
    """
    Compute the DTW distance between all pairs of z-normalised time series.

    Every column of ``df`` is z-normalised ((x - mean) / std) before the
    distances are computed, so the result compares the shape of the series
    independent of their offset and scale. A constant column (std == 0)
    is mapped to an all-zero series instead of being divided by zero.

    The DTW distance is symmetric and the distance of a series to itself
    is 0, so each unordered pair of columns is evaluated only once and
    mirrored into the result.

    Args:
        df (pd.DataFrame): DataFrame containing one time series per column.

    Returns:
        np.ndarray: Square array of shape (n_columns, n_columns). Entry
        ``[j, l]`` is the DTW distance between the normalised column ``j``
        and the normalised column ``l`` (in the order of ``df.columns``);
        the diagonal is 0.
    """
    series = []

    # extract and z-normalise the time series of every column
    for col in df.columns:
        values = df[col].values
        centered = values - np.mean(values)
        std = np.std(values)
        if std == 0:
            # constant series: every deviation is exactly 0, so `centered`
            # already is the all-zero series; dividing would yield NaN
            z_norm = centered
        else:
            z_norm = centered / std  # znorm = (z - mean) / std
        series.append(z_norm)

    num_series = len(series)
    results = np.zeros((num_series, num_series))
    for j in range(num_series):
        # only the upper triangle is computed; the lower one is its mirror
        # and the diagonal keeps the 0 it was initialised with
        for l in range(j + 1, num_series):
            distance = dtw.distance(series[j], series[l])
            results[j, l] = distance
            results[l, j] = distance

    return results

def grangercasuality(df, max_lag=12, alpha=0.05):
    """
    Run pairwise Granger causality tests between all columns of a dataframe.

    Granger causality requires stationary series. Every column is therefore
    checked with the Augmented Dickey-Fuller test first; a column whose ADF
    p-value is not below ``alpha`` is replaced by its first difference. Rows
    containing NaN after that step are dropped before testing. The ADF
    verdict of every column is printed.

    NOTE: a non-stationary column is differenced exactly once and is not
    re-tested afterwards.

    Args:
        df (pd.DataFrame): DataFrame containing one time series per column.
        max_lag (int): Largest lag to test; lags 1..max_lag are evaluated.
        alpha (float): Significance level of the ADF test below which a
            column counts as stationary. Defaults to 0.05.

    Returns:
        list[dict]: One entry per ordered column pair whose test succeeded,
        with the keys ``'cause'``, ``'effect'`` and ``'p_values'`` (the
        ``ssr_ftest`` p-values for lags 1..max_lag, rounded to 4 decimals).
        Pairs whose test raised are reported via ``print`` and omitted.
    """
    # TIMESERIES MUST BE STATIONARY!
    # here we check for stationarity, if not we take the difference per column
    stationary_cols = {}
    for col in df.columns:
        p_value = adfuller(df[col].dropna())[1]
        stationary = p_value < alpha
        print(f"{col}: p-value = {p_value:.4f} -> {'Stationary' if stationary else 'Non-stationary'}")
        stationary_cols[col] = stationary

    df_stationary = df.copy()
    for col, stationary in stationary_cols.items():
        if not stationary:
            df_stationary[col] = df[col].diff()
    # differencing leaves a NaN in the first row of every differenced column
    df_stationary = df_stationary.dropna()

    granger_results = []
    # Loop over all ordered pairs (excluding self-pairs)
    for col_x in df_stationary.columns:
        for col_y in df_stationary.columns:
            if col_x == col_y:
                continue
            # grangercausalitytests checks whether the SECOND column
            # Granger-causes the FIRST one, hence the [effect, cause] order
            test_data = df_stationary[[col_y, col_x]]
            try:
                # verbose=False keeps statsmodels from printing every test table
                res = grangercausalitytests(test_data, maxlag=max_lag, verbose=False)
                p_vals = [round(res[lag][0]['ssr_ftest'][1], 4) for lag in range(1, max_lag + 1)]
            except Exception as e:
                # best effort: report the failing pair and carry on with the rest
                print(f"Error processing {col_x} and {col_y}: {e}")
                continue
            granger_results.append({
                'cause': col_x,
                'effect': col_y,
                'p_values': p_vals
            })
    return granger_results

    
    








#### On interpolated underlying data

In [3]:
# Correlation matrix for Spearman, Pearson, DTW and Granger
file_path = "data/processed/interpolated_and_trends"
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# processing order (and the printed ADF output) reproducible between runs
files = sorted(glob.glob(os.path.join(file_path, "interpolated_complete_*.csv")))
methods = ['spearman', 'pearson', 'dtw', 'dtw_norm', 'granger']

# one callable per method name; each takes the filtered frame of one file
# and returns that method's result
method_functions = {
    'spearman': lambda frame: frame.corr(method='spearman'),
    'pearson': lambda frame: frame.corr(method='pearson'),
    'dtw': dtw_distance,                                            # returns a 2D np-array
    'dtw_norm': dtw_distance_norm,                                  # returns a 2D np-array
    'granger': lambda frame: grangercasuality(frame, max_lag=12),   # returns list of dicts with max_lag p-values
}

all_corr_results = {}   # all results of all files, keyed by file name

for file in files:
    df = pd.read_csv(file)
    # keep only the columns whose name contains "interpolated"
    df_drop = df[[col for col in df.columns if "interpolated" in col]]

    # results of one file, one entry per method in the order of `methods`
    corr_results = [
        {'Method': method, 'Results': method_functions[method](df_drop)}
        for method in methods
    ]

    all_corr_results[os.path.basename(file)] = corr_results


Traffic_interpolated: p-value = 0.2444 -> Non-stationary
CO (mg/m³)_interpolated: p-value = 0.0632 -> Non-stationary




Traffic_interpolated: p-value = 0.0084 -> Stationary
CO (mg/m³)_interpolated: p-value = 0.4720 -> Non-stationary




Traffic_interpolated: p-value = 0.0000 -> Stationary
PM10_interpolated: p-value = 0.0000 -> Stationary
NO2_interpolated: p-value = 0.1052 -> Non-stationary
O3_interpolated: p-value = 0.2276 -> Non-stationary




Traffic_interpolated: p-value = 0.0000 -> Stationary
PM10_interpolated: p-value = 0.0000 -> Stationary
NO2_interpolated: p-value = 0.1052 -> Non-stationary
O3_interpolated: p-value = 0.2276 -> Non-stationary




Traffic_interpolated: p-value = 0.1438 -> Non-stationary
PM10_interpolated: p-value = 0.2747 -> Non-stationary
NO2_interpolated: p-value = 0.4957 -> Non-stationary
PM2.5_interpolated: p-value = 0.5285 -> Non-stationary




Traffic_interpolated: p-value = 0.5068 -> Non-stationary
PM10_interpolated: p-value = 0.3111 -> Non-stationary
O3_interpolated: p-value = 0.0123 -> Stationary
NO2_interpolated: p-value = 0.8035 -> Non-stationary




#### On decomposed Trend data

In [None]:
# same on the Trend data
#-------------------------------------------------------------------
# Correlation matrix for Spearman, Pearson, DTW and Granger
file_path = "data/processed/interpolated_and_trends"
# glob returns matches in arbitrary, OS-dependent order; sorting makes the
# processing order (and the printed ADF output) reproducible between runs
files = sorted(glob.glob(os.path.join(file_path, "trends_*.csv")))
methods = ['spearman', 'pearson', 'dtw', 'granger']

# one callable per method name; each takes the frame of one file and
# returns that method's result
method_functions = {
    'spearman': lambda frame: frame.corr(method='spearman'),
    'pearson': lambda frame: frame.corr(method='pearson'),
    'dtw': dtw_distance,                                            # returns a 2D np-array
    'granger': lambda frame: grangercasuality(frame, max_lag=12),   # returns list of dicts with max_lag p-values
}

trends_corr_results = {}   # all results of all files, keyed by file name

for file in files:
    # unlike the interpolated files, every column of a trend file is used
    # (no "interpolated" column filter is applied)
    # NOTE(review): assumes the trend CSVs contain only numeric series
    # columns - TODO confirm (an index/date column would also be analysed)
    df = pd.read_csv(file)

    # results of one file, one entry per method in the order of `methods`
    corr_results = [
        {'Method': method, 'Results': method_functions[method](df)}
        for method in methods
    ]

    trends_corr_results[os.path.basename(file)] = corr_results
