In [1]:
import numpy as np
import pandas as pd
from scipy.stats import entropy

from utils import load_parquet_to_df_list, z_normalize
from approximation.paa import PAA
from discretization.sax.extended_sax import ExtendedSAX
from information_embedding_cost.kullback_leibler_divergence import compute_raw_prob_distribution, compute_symbolic_prob_distribution, EquiWidth, QuantileBinning

In [2]:
# Read the SwedishLeaf training data; load_parquet_to_df_list returns a list of
# DataFrames, each of which is expected to carry a "signal" column.
path = "../../../0_data/UCRArchive_2018/SwedishLeaf/train"
df_list = load_parquet_to_df_list(path)

# z-normalize every "signal" column on its own, then line the results up side by
# side as columns (the series all have the same length).
normalized_signals = [z_normalize(df["signal"].to_frame()) for df in df_list]

# Only the first two series are kept for the experiments in this notebook.
df_norm = pd.concat(normalized_signals, axis=1).iloc[:, :2]
df_norm

Unnamed: 0,signal,signal.1
0,2.223633,1.879365
1,2.058567,1.733153
2,1.849589,1.579823
3,1.642328,1.402389
4,1.438219,1.257130
...,...,...
123,1.345195,1.162620
124,1.547908,1.314745
125,1.760490,1.500432
126,1.970141,1.657326


In [3]:
# Parameters for this experiment: the alphabet size handed to ExtendedSAX, and the
# window length that the PAA cell further down passes to PAA(...).
alphabet_size = 9
window_size = 5

# Discretizer plus two binning strategies; only quantile_binning is used in this cell.
e_sax = ExtendedSAX(alphabet_size)
equi_width_binning = EquiWidth()
quantile_binning = QuantileBinning()

# Probability distribution of the raw (z-normalized) values.
# NOTE(review): presumably one column per time series and one row per symbol,
# obtained by binning the raw values with quantile_binning — confirm against
# compute_raw_prob_distribution.
df_raw_prob_distr = compute_raw_prob_distribution(df_norm, e_sax, quantile_binning)
df_raw_prob_distr

Unnamed: 0,0,1
a,0.117188,0.117188
b,0.109375,0.109375
c,0.109375,0.109375
d,0.109375,0.109375
e,0.109375,0.109375
f,0.109375,0.109375
g,0.109375,0.109375
h,0.109375,0.109375
i,0.117188,0.117188


In [4]:
# Reduce the normalized series with PAA using windows of `window_size`.
df_paa = PAA(window_size).transform(df_norm)
# Probability distribution of the symbolic representation, built from the PAA
# output, the original series, the window size and the e_sax discretizer.
# NOTE(review): presumably the relative frequency of each symbol per series —
# confirm against compute_symbolic_prob_distribution.
df_symbolic_prob_distr = compute_symbolic_prob_distribution(df_paa, df_norm, window_size, e_sax)
df_symbolic_prob_distr

Unnamed: 0,0,1
a,1e-10,0.153846
b,0.3076923,0.076923
c,0.1410256,0.076923
d,0.07692308,0.115385
e,0.07692308,0.153846
f,0.06410256,0.089744
g,0.08974359,0.089744
h,0.06410256,0.076923
i,0.1794872,0.166667


In [5]:
# Kullback-Leibler divergence D(raw || symbolic), one value per column.
# With two arguments scipy.stats.entropy(pk, qk) computes sum(pk * log(pk / qk))
# along axis 0 (natural logarithm by default), after normalizing pk and qk so
# that each column sums to 1.
entropy(df_raw_prob_distr, df_symbolic_prob_distr)

array([2.47171637, 0.04242907])

In [6]:
# NOTE(review): this rebinds `alphabet_size`, which was 9 when e_sax was built;
# from here on the name no longer matches the alphabet size e_sax was created with.
alphabet_size = 10

# Interior probability levels that split [0, 1] into `alphabet_size` equally sized
# parts. The end points 0 and 1 (column min / max) are left out because only the
# cut points between neighbouring bins are needed.
inner_levels = np.linspace(0, 1, alphabet_size + 1)[1:-1]

# One row per level, one column per time series.
quantiles = df_norm.quantile(inner_levels)
quantiles

Unnamed: 0,signal,signal.1
0.1,-1.090396,-1.426211
0.2,-1.009934,-1.000926
0.3,-0.789569,-0.497522
0.4,-0.595595,-0.179854
0.5,-0.245316,-0.01211
0.6,0.079462,0.153205
0.7,0.470311,0.483605
0.8,1.008178,0.929085
0.9,1.576234,1.435126


In [7]:
# Count, for every time series, how many samples fall into each quantile bin.
# The rows of `quantiles` are the edges between bins, so there is one more bin
# than there are edges.
n_bins = quantiles.shape[0] + 1

lst = []
for i in range(quantiles.shape[1]):
    # side="right": a sample equal to an edge is assigned to the bin above it
    bin_idxs = np.searchsorted(quantiles.iloc[:,i], df_norm.iloc[:,i], side="right")
    lst.append(pd.Series(bin_idxs))
df_bin_idxs = pd.concat(lst, axis=1)

# value_counts() only reports bins that were hit at least once. Reindexing onto
# the full bin range keeps empty bins as explicit zeros, so the table always has
# n_bins rows in ascending order and stays integer-typed instead of picking up NaN.
df_bin_idxs = df_bin_idxs.apply(
    lambda col: col.value_counts().reindex(range(n_bins), fill_value=0)
)
df_bin_idxs

Unnamed: 0,0,1
0,13,13
1,13,13
2,13,13
3,12,12
4,13,13
5,13,13
6,12,12
7,13,13
8,13,13
9,13,13
