In [1]:
import pandas as pd
import numpy as np
from scipy.stats import entropy

from utils import load_parquet_to_df_list, z_normalize
from approximation.paa import PAA
from discretization.sax.adaptive_sax import AdaptiveSAX
from discretization.sax.one_d_sax import OneDSAX
from information_embedding_cost.kullback_leibler_divergence import compute_raw_prob_distribution, compute_symbolic_prob_distribution, EquiWidth

In [2]:
# Load the time series dataset (one dataframe per series, each presumably
# holding a "signal" column) from the SwedishLeaf training split.
path = "../../../0_data/UCRArchive_2018/SwedishLeaf/train"
df_list = load_parquet_to_df_list(path)

# Only the first n_series series are analysed below, so select them *before*
# z-normalizing instead of normalizing the whole dataset and then discarding
# everything but the leading columns.
n_series = 2

# Concatenate the selected series column-wise into one dataframe; this relies
# on all series having the same length.
df_norm = pd.concat(
    [z_normalize(df["signal"].to_frame()) for df in df_list[:n_series]],
    axis=1,
)
df_norm

Unnamed: 0,signal,signal.1
0,2.223633,1.879365
1,2.058567,1.733153
2,1.849589,1.579823
3,1.642328,1.402389
4,1.438219,1.257130
...,...,...
123,1.345195,1.162620
124,1.547908,1.314745
125,1.760490,1.500432
126,1.970141,1.657326


In [3]:
# Window size handed to the PAA transformer (presumably the number of points
# aggregated per segment — confirm against approximation.paa).
window_size = 5

# Apply the PAA transformation to the z-normalized series.
df_paa = (
    PAA(window_size)
    .transform(df_norm)
)

In [4]:
# Take the first normalized series and min-max scale it: subtract its minimum,
# then divide by its value range (max - min).
df_norm_new = df_norm.iloc[:, 0]
df_scaled = df_norm_new.sub(df_norm_new.min()).div(
    df_norm_new.max() - df_norm_new.min()
)

In [5]:
# Quantile binning: split the scaled series into `bins` quantile-based
# intervals and count how many points fall into each interval.
bins = 10

(
    pd.qcut(df_scaled, q=bins)
    .value_counts()
)

(-0.001, 0.0135]    13
(0.0135, 0.0374]    13
(0.0374, 0.103]     13
(0.161, 0.265]      13
(0.265, 0.362]      13
(0.478, 0.638]      13
(0.638, 0.807]      13
(0.807, 1.0]        13
(0.103, 0.161]      12
(0.362, 0.478]      12
Name: signal, dtype: int64

In [6]:
# equal width histogram
# The edges are stored under their own name: unpacking them into `bins` would
# replace the integer bin count with an array of edges, so re-running this cell
# (or any cell that reads `bins`) would silently operate on a different value.
hist, bin_edges = np.histogram(
    df_scaled,
    bins=bins,
    range=(df_scaled.min(), df_scaled.max()),
    density=False,  # raw counts per bin, not a probability density
)
hist

array([38, 19, 11, 12, 11,  9,  7,  8,  6,  7], dtype=int64)

In [7]:
# Display the z-normalized frame again; it is the input to the raw probability
# distribution computed in the following cell.
df_norm

Unnamed: 0,signal,signal.1
0,2.223633,1.879365
1,2.058567,1.733153
2,1.849589,1.579823
3,1.642328,1.402389
4,1.438219,1.257130
...,...,...
123,1.345195,1.162620
124,1.547908,1.314745
125,1.760490,1.500432
126,1.970141,1.657326


In [8]:
# Discretizer and binning strategy that are handed to the raw-distribution
# helper together with the z-normalized series.
one_d_sax = OneDSAX(
    alphabet_size_avg=6,
    alphabet_size_slope=3,
)
equi_width_binning = EquiWidth()

# Probability distribution derived from the raw (non-approximated) series.
df = compute_raw_prob_distribution(
    df_norm,
    one_d_sax,
    equi_width_binning,
)
df

Unnamed: 0,0,1
aa,0.05511811,0.07874016
ab,0.3149606,0.04724409
ac,0.03937008,0.05511811
ba,0.06299213,0.05511811
bb,0.02362205,1e-10
bc,0.07874016,0.06299213
ca,0.06299213,0.07086614
cb,0.02362205,0.1496063
cc,0.05511811,0.07086614
da,0.04724409,0.07086614


In [9]:
# Probability distribution derived from the PAA representation, using the same
# window size and discretizer as the cells above.
df_symbolic = compute_symbolic_prob_distribution(
    df_paa,
    df_norm,
    window_size,
    one_d_sax,
)
df_symbolic

Unnamed: 0,signal,signal.1
aa,0.03846154,0.1153846
ab,0.1538462,0.03846154
ac,1e-10,0.07692308
ba,0.07692308,1e-10
bb,0.03846154,1e-10
bc,0.1538462,0.07692308
ca,0.03846154,0.1153846
cb,1e-10,0.03846154
cc,0.03846154,0.03846154
da,0.07692308,0.03846154


In [12]:
# Given a second distribution, scipy's entropy returns the relative entropy
# (Kullback-Leibler divergence) of the raw distribution from the symbolic one,
# evaluated column by column.
# NOTE(review): scipy is expected to treat both inputs as plain arrays, i.e. rows
# are matched by position rather than by index label — confirm that both frames
# list the symbols in the same order.
entropy(df, qk=df_symbolic)

array([1.48620131, 1.249871  ])