In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats

In [2]:
date_range_observed = pd.date_range(start="2015-01-01", end="2024-07-24 23:00:00", freq='H')
np.random.seed(42)
observed_data = np.random.randn(len(date_range_observed))  # Random normal data
df_observed = pd.DataFrame(observed_data, index=date_range_observed, columns=['obs'])
date_range_predicted = pd.date_range(start=date_range_observed[-365*24], end=date_range_observed[-1], freq='H')
predicted_data = np.random.randn(len(date_range_predicted))  # Random normal data
df_predicted = pd.DataFrame(predicted_data, index=date_range_predicted, columns=['predicted'])
df_observed.head(), df_predicted.head()


(                          obs
 2015-01-01 00:00:00  0.496714
 2015-01-01 01:00:00 -0.138264
 2015-01-01 02:00:00  0.647689
 2015-01-01 03:00:00  1.523030
 2015-01-01 04:00:00 -0.234153,
                      predicted
 2023-07-26 00:00:00   0.030139
 2023-07-26 01:00:00  -0.052515
 2023-07-26 02:00:00   0.337386
 2023-07-26 03:00:00   0.547897
 2023-07-26 04:00:00  -0.829152)

### Cramer Von Mises

In [4]:
from scipy.stats import cramervonmises, ecdf

In [10]:
cdf = ecdf(df_observed['obs'])
cdf = cdf.cdf
res = cramervonmises(df_predicted['predicted'], cdf.evaluate)
res

CramerVonMisesResult(statistic=0.09929517822045522, pvalue=0.5883505353760552)

### KL divergence

In [11]:
from scipy.stats import entropy

In [14]:
n_bins = [30,60,90,150,300,500,1000]
for num_bins in n_bins:
    # num_bins = 30
    observed_hist, bin_edges = np.histogram(df_observed['obs'], bins=num_bins, density=True)
    predicted_hist, _ = np.histogram(df_predicted['predicted'], bins=bin_edges, density=True)
    predicted_hist += 1e-10
    kl_divergence = entropy(observed_hist, predicted_hist)
    print(num_bins, kl_divergence)

30 0.004678307142714767
60 0.013757784130966141
90 0.01987000445454138
150 0.025326580419997163
300 0.04708684196236582
500 0.09618378254583557
1000 0.21660212855700117


In [18]:
from sklearn.neighbors import KernelDensity

# Fit KDE for observed and predicted data
kde_observed = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(df_observed['obs'].values.reshape(-1, 1))
kde_predicted = KernelDensity(kernel='gaussian', bandwidth=0.1).fit(df_predicted['predicted'].values.reshape(-1, 1))

# Evaluate KDE on a common set of points (a linear space covering both observed and predicted ranges)
common_grid = np.linspace(min(df_observed['obs'].min(), df_predicted['predicted'].min()),
                          max(df_observed['obs'].max(), df_predicted['predicted'].max()), 5000).reshape(-1, 1)

# Compute log density values
log_density_observed = kde_observed.score_samples(common_grid)
log_density_predicted = kde_predicted.score_samples(common_grid)

# Convert log densities to actual densities
density_observed = np.exp(log_density_observed)
density_predicted = np.exp(log_density_predicted)

# Add a small constant to prevent division by zero
density_predicted += 1e-10

# Calculate KL divergence
kl_divergence_kde = entropy(density_observed, density_predicted)

kl_divergence_kde

0.002666428657081275