In [None]:
%load_ext autoreload
%autoreload 2

from ksfdtw import *

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tslearn.metrics import dtw, lb_keogh
from sklearn.metrics import accuracy_score
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.metrics import dtw as tsln_dtw
from tqdm import tqdm

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [None]:
# # A neat way to load the dataset
# data = np.load("../data/gunpoint_preprocessed.npz")
# data_dict = {key: data[key] for key in data.files}

In [None]:
# An old way to load the dataset
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code — only load archives that come from a trusted source.
data = np.load("../data/gunpoint_preprocessed.npz", allow_pickle=True)

# Training split arrays, read from the archive in the same order as before.
X_train_scaled, X_train_proc, X_train_proc_noise, y_train = (
    data[key]
    for key in ("X_train_scaled", "X_train_proc", "X_train_proc_noise", "y_train")
)

# Test split arrays. The archive key is "X_test_proc_noise" while the notebook
# variable is named X_test_proc_noisy.
X_test_scaled, X_test_proc, X_test_proc_noisy, y_test = (
    data[key]
    for key in ("X_test_scaled", "X_test_proc", "X_test_proc_noise", "y_test")
)

# Cutting arrays, converted to plain Python objects with .tolist().
train_cutting_orig, train_cutting_final, test_cutting_orig, test_cutting_final = (
    data[key].tolist()
    for key in (
        "train_cutting_orig",
        "train_cutting_final",
        "test_cutting_orig",
        "test_cutting_final",
    )
)

In [5]:
# Euclidean distance between two series (one pair at a time)
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Numeric sequences whose shapes are broadcast-compatible. NumPy arrays
        as well as plain Python lists/tuples are accepted.

    Returns
    -------
    numpy scalar
        Square root of the sum of squared element-wise differences.
    """
    # np.asarray makes element-wise subtraction work for lists/tuples too
    # (plain ``list - list`` raises TypeError); ndarrays pass through unchanged.
    diff = np.asarray(a) - np.asarray(b)
    return np.sqrt(np.sum(diff ** 2))

In [None]:
# m: length of the first processed training series; n: length of the first
# scaled training series.
m, n = len(X_train_proc[0]), len(X_train_scaled[0])
# Multiplier applied to m below.
l = 2
# A third (rounded down) of the smaller of ceil(l * m) and n.
L = int(np.floor(min(np.ceil(l * m), n) / 3))

Applying PSDTW to compute the distance profile of each time series from $\mathcal{D}_{\text{train, ps}}$ to every time series in $\mathcal{D}_{\text{train}}$

In [None]:
# ps
# results[q][c] holds whatever ps_distance_p returns for query X_train_proc[q]
# against candidate X_train_scaled[c], for the first 50 of each.
# The four positional settings (2, 0.1, 10, 3) are passed through unchanged;
# their meaning is defined by ps_distance_p in ksfdtw.
results = [
    [
        ps_distance_p(
            X_train_proc[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_kim_fl,
        )
        for candidate in X_train_scaled[:50]
    ]
    for query_idx in range(50)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez("../results/results.npz", results=np.array(results, dtype=object))

Applying PSDTW to compute the distance profile of each time series from $\mathcal{D}_{\text{train, ps, noise}}$ to every time series in $\mathcal{D}_{\text{train}}$

In [None]:
# ps + noise
# results_noise[q][c] holds whatever ps_distance_p returns for the noisy query
# X_train_proc_noise[q] (first 50 queries) against every series in X_train_scaled.
# The four positional settings (2, 0.1, 10, 3) are passed through unchanged;
# their meaning is defined by ps_distance_p in ksfdtw.
results_noise = [
    [
        ps_distance_p(
            X_train_proc_noise[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_kim_fl,
        )
        for candidate in X_train_scaled
    ]
    for query_idx in range(50)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez("../results/results_noise.npz", results=np.array(results_noise, dtype=object))

To check the pruning power, we use the theoretically tightest lower bound $LB_{Shen}$. However, it may not give the fastest running time, because $LB_{Shen}$ is more expensive to compute than looser lower bounds that are computationally cheap.
$LB_{Kim\_FL}$, which only considers the first point pair and the last point pair to construct the lower bound, runs exceptionally fast.

Applying PSDTW to compute the distance profile of each time series from $\mathcal{D}_{\text{train, ps}}[:10]$ (and $\mathcal{D}_{\text{train, ps, noise}}[:10]$) to every time series in $\mathcal{D}_{\text{train}}[:10]$ with the theoretically tightest lower bound $LB_{Shen}$.

In [None]:
# ps
# Check the pruning power
# Same call pattern as the LB_Kim_FL run, restricted to the first 10 queries and
# the first 10 candidates, with lb_shen as the lower bound.
results_lb_shen = [
    [
        ps_distance_p(
            X_train_proc[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_shen,
        )
        for candidate in X_train_scaled[:10]
    ]
    for query_idx in range(10)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez(
    "../results/results_lb_shen.npz",
    results=np.array(results_lb_shen, dtype=object),
)

In [None]:
# ps + noise
# Check the pruning power
# Noisy queries (first 10) against the first 10 candidates, with lb_shen as the
# lower bound.
results_lb_shen_noise = [
    [
        ps_distance_p(
            X_train_proc_noise[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_shen,
        )
        for candidate in X_train_scaled[:10]
    ]
    for query_idx in range(10)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez(
    "../results/results_lb_shen_noise.npz",
    results=np.array(results_lb_shen_noise, dtype=object),
)

Applying PSDTW to compute the distance profile of each time series from $\mathcal{D}_{\text{train, ps}}[:10]$ (and $\mathcal{D}_{\text{train, ps, noise}}[:10]$) to every time series in $\mathcal{D}_{\text{train}}[:10]$ without using any lower bound.

In [None]:
# ps
# Baseline without pruning: first 10 queries against the first 10 candidates,
# using ps_distance_p_without_prune with lb_dummy as the lower bound.
results_no_prune = [
    [
        ps_distance_p_without_prune(
            X_train_proc[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_dummy,
        )
        for candidate in X_train_scaled[:10]
    ]
    for query_idx in range(10)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez(
    "../results/results_no_prune.npz",
    results=np.array(results_no_prune, dtype=object),
)

In [None]:
# ps + noise
# Baseline without pruning on the noisy queries: first 10 queries against the
# first 10 candidates, using ps_distance_p_without_prune with lb_dummy.
results_no_prune_noise = [
    [
        ps_distance_p_without_prune(
            X_train_proc_noise[query_idx],
            candidate,
            2,
            0.1,
            10,
            3,
            distance_method="dtw",
            lower_bound_method=lb_dummy,
        )
        for candidate in X_train_scaled[:10]
    ]
    for query_idx in range(10)  # X_train_proc.shape[0]
]
# Persist the nested result list as an object array.
np.savez(
    "../results/results_no_prune_noise.npz",
    results=np.array(results_no_prune_noise, dtype=object),
)

# Precision@k

In [None]:
def precision_at_k(distances, true_index, k):
    """Return 1 if ``true_index`` is among the ``k`` smallest distances, else 0.

    Parameters
    ----------
    distances : sequence of comparable values
        ``distances[j]`` is the distance from the query to candidate ``j``.
    true_index : int
        Index of the candidate that counts as the correct match.
    k : int
        Number of top-ranked (smallest-distance) candidates to inspect.

    Returns
    -------
    int
        1 when the true match is in the top ``k``, otherwise 0. A
        non-positive ``k`` inspects no candidates and therefore yields 0.
    """
    # Guard: a negative k would make the slice below count from the end
    # (ranked[:-1] keeps all but the worst candidate) and report bogus hits.
    if k <= 0:
        return 0

    # Candidate indices ordered by ascending distance; sorted() is stable, so
    # ties keep their original index order.
    ranked = sorted(range(len(distances)), key=lambda idx: distances[idx])

    return 1 if true_index in ranked[:k] else 0

Compute $P@k$ for querying $Q \in \mathcal{D}_{\text{train, ps}}$ using PSDTW

In [None]:
# PSDTW
# Count, over the first 50 queries, how often query i retrieves candidate i
# within the top-k for k in {1, 3, 5, 7}, then print the hit rates.
n_queries = 50
cutoffs = (1, 3, 5, 7)
hit_counts = dict.fromkeys(cutoffs, 0)
for i in range(n_queries):
    # Element 0 of each result entry is used as the distance.
    distances = np.array([entry[0] for entry in results[i]])
    for cutoff in cutoffs:
        hit_counts[cutoff] += precision_at_k(distances, i, cutoff)
precision_at_1, precision_at_3, precision_at_5, precision_at_7 = (
    hit_counts[cutoff] for cutoff in cutoffs
)
print(*(hit_counts[cutoff] / n_queries for cutoff in cutoffs))
# 0.8 0.96 1.0 1.0

0.8 0.96 1.0 1.0


Compute $P@k$ for querying $Q \in \mathcal{D}_{\text{train, ps, noise}}$ using PSDTW

In [None]:
# PSDTW, noisy
# Count, over the first 50 noisy queries, how often query i retrieves candidate i
# within the top-k for k in {1, 3, 5, 7}, then print the hit rates.
n_queries = 50
cutoffs = (1, 3, 5, 7)
hit_counts = dict.fromkeys(cutoffs, 0)
for i in range(n_queries):
    # Element 0 of each result entry is used as the distance.
    distances = np.array([entry[0] for entry in results_noise[i]])
    for cutoff in cutoffs:
        hit_counts[cutoff] += precision_at_k(distances, i, cutoff)
precision_at_1, precision_at_3, precision_at_5, precision_at_7 = (
    hit_counts[cutoff] for cutoff in cutoffs
)
print(*(hit_counts[cutoff] / n_queries for cutoff in cutoffs))
# 0.48 0.82 0.96 1.0
# 0.58 0.88 0.92 0.96 <- The updated version has a slightly different result.

0.58 0.88 0.92 0.96


# Analyzing Pruning Power

In [None]:
# ps
# No prune
# Add up element 2 of every result entry (the per-pair iteration count) over the
# first 10 queries of the unpruned run.
per_query_iterations = [
    sum(np.array([entry[2] for entry in results_no_prune[query_idx]]))
    for query_idx in range(10)
]
total_no_iterations = sum(per_query_iterations)
print(total_no_iterations)
# 3985237
# NOTE: the "ps + noise" no-prune cell further down reassigns this same name.
total_no_iterations_no_prune = total_no_iterations

3985237


In [None]:
# ps
# Using LB_Shen
# Add up element 2 of every result entry (the per-pair iteration count) over the
# first 10 queries of the LB_Shen run.
per_query_iterations = [
    sum(np.array([entry[2] for entry in results_lb_shen[query_idx]]))
    for query_idx in range(10)
]
total_no_iterations = sum(per_query_iterations)
print(total_no_iterations)
# NOTE: the "ps + noise" LB_Shen cell further down reassigns this same name.
total_no_iterations_lb_shen = total_no_iterations
# 1404294

1404294


Compute the fraction of distance computations that have been pruned

In [None]:
# Iterations saved by LB_Shen, expressed as a fraction of the unpruned total.
saved_iterations = total_no_iterations_no_prune - total_no_iterations_lb_shen
saved_iterations / total_no_iterations_no_prune

np.float64(0.6476259755693325)

In [None]:
# ps + noise
# No prune
# Add up element 2 of every result entry (the per-pair iteration count) over the
# first 10 noisy queries of the unpruned run.
per_query_iterations = [
    sum(np.array([entry[2] for entry in results_no_prune_noise[query_idx]]))
    for query_idx in range(10)
]
total_no_iterations = sum(per_query_iterations)
print(total_no_iterations)
# NOTE: overwrites the value stored by the non-noisy "ps" cell, so the pruning
# ratio depends on which of the two cells ran last.
total_no_iterations_no_prune = total_no_iterations
# 4023811
# 4031567 <- The updated version has a slightly different result.

4031567


In [None]:
# ps + noise
# Using LB_Shen
# Add up element 2 of every result entry (the per-pair iteration count) over the
# first 10 noisy queries of the LB_Shen run.
per_query_iterations = [
    sum(np.array([entry[2] for entry in results_lb_shen_noise[query_idx]]))
    for query_idx in range(10)
]
total_no_iterations = sum(per_query_iterations)
print(total_no_iterations)
# NOTE: overwrites the value stored by the non-noisy "ps" cell, so the pruning
# ratio depends on which of the two cells ran last.
total_no_iterations_lb_shen = total_no_iterations
# 1483516
# 1497458 <- The updated version has a slightly different result.

1497458


Compute the fraction of distance computations that have been pruned

In [None]:
# Iterations saved by LB_Shen, expressed as a fraction of the unpruned total.
saved_iterations = total_no_iterations_no_prune - total_no_iterations_lb_shen
saved_iterations / total_no_iterations_no_prune

np.float64(0.628566758285302)

In [9]:
###