In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

import librosa
from tqdm.notebook import tqdm
from IPython.display import Audio
from pandas.api.types import is_numeric_dtype

import sys
sys.path.append('../')
import default_style

TS_DATASET_FOLDER = os.path.join("..", "dataset")
TS_PREPROC_FOLDER = os.path.join(TS_DATASET_FOLDER, "preprocessed_traces")
DF_PREPROC_FILE = os.path.join(TS_PREPROC_FOLDER, "preproc_ts.df")

INTERESTING_TRACES = ["denoised_trace", "std_phon_trace", "phonetic_trace","traces_fourier"]

INTERESTING_TRACES += [f"syllables_{i}_fourier" for i in range(7)]

## Save folder for long-run estimators
ESTIMATORS_FOLDER = os.path.join("..", "estimators")

## Import

In [2]:
df = pd.read_csv(DF_PREPROC_FILE)

traces = dict()
for t in INTERESTING_TRACES:
    traces[t] = np.load(os.path.join(TS_PREPROC_FOLDER, f"{t}.npy"))


FileNotFoundError: [Errno 2] No such file or directory: '../dataset/preprocessed_traces/syllables_0_fourier.npy'

In [None]:
len(traces["fourier_coeffs"][0])

In [None]:
plt.plot(traces["fourier_coeffs"][0])

## Metrics

In [None]:
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cdist

In [None]:
mat_euclidean = cdist(traces["fourier_coeffs"],traces["fourier_coeffs"])

In [None]:
plt.grid("")
plt.axis("off")
plt.imshow(mat_euclidean)

In [None]:
# Distribution of distances
distances_eucl = np.triu(mat_euclidean).reshape(-1)
distances_eucl = distances_eucl[distances_eucl != 0.0]
plt.hist(distances_eucl, histtype="step", bins=40, density=True);

### Manhattan

In [None]:
mat_manhattan = cdist(traces["fourier_coeffs"],traces["fourier_coeffs"], metric="cityblock")

In [None]:
plt.grid("")
plt.axis("off")
plt.imshow(mat_manhattan)

In [None]:
# Distribution of distances
distances_man = np.triu(mat_manhattan).reshape(-1)
distances_man = distances_man[distances_man != 0.0]
plt.hist(distances_man, histtype="step", bins=40, density=True);

### DTW

In [None]:
from tslearn.metrics import cdist_dtw

mat_dtw = cdist_dtw(traces["std_phon_trace"][:10],traces["std_phon_trace"][:10], itakura_max_slope=2, n_jobs=-1)

In [None]:
plt.grid("")
plt.axis("off")
plt.imshow(mat_dtw)

## Test: KNN search

In [None]:
from tslearn.neighbors import KNeighborsTimeSeries
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(traces["fourier_coeffs"], test_size=0.1)

knn = KNeighborsTimeSeries(n_neighbors=4)
knn.fit(X_train)

# Find nearest neighbours of a query trace
query_trace = 1
nearest = knn.kneighbors([X_test[query_trace]], return_distance=False)[0]

fig, (axquery, axnearest) = plt.subplots(2,1, sharex=True)

# Plot the requested query
axquery.plot(X_test[query_trace])

# Plot the most similar results
for idx in nearest:
    axnearest.plot(X_train[idx])

In [None]:
X_train.shape

## Kmeans

### Euclidean

In [None]:
from tslearn.clustering import TimeSeriesKMeans

n_clusters = 5

km = TimeSeriesKMeans(n_clusters=n_clusters, verbose=True)
y_pred = km.fit_predict(X_train)

fig, axes = plt.subplots(n_clusters,1,sharex=True)

for clus in range(n_clusters):
    cluster_elements = X_train[y_pred == clus]
    for i in range(n_clusters):
        axes[clus].plot(cluster_elements[i], alpha=0.2, color="k")
    
    axes[clus].plot(km.cluster_centers_[clus])

### DTW

In [None]:
n_clusters = 5

# Fit and save
km = TimeSeriesKMeans(n_clusters=n_clusters,
                      metric="dtw",
                      n_jobs=-1,
                      verbose=True)

y_pred = km.fit_predict(X_train)
# Save the estimator
km.to_pickle(os.path.join(ESTIMATORS_FOLDER, "dtw_kmeans"))

km = TimeSeriesKMeans.from_pickle(os.path.join(ESTIMATORS_FOLDER, "dtw_kmeans"))
y_pred = km.predict(X_train)

fig, axes = plt.subplots(n_clusters,1,sharex=True)

for clus in range(n_clusters):
    cluster_elements = X_train[y_pred == clus]
    for i in range(10):
        axes[clus].plot(cluster_elements[i], alpha=0.2, color="k")
    axes[clus].plot(km.cluster_centers_[clus])