In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from time import time
from os import listdir, path

from numpy.random import choice, shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from scipy.stats import zscore
from dtaidistance import dtw, dtw_c, dtw_ndim
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split


from IPython.display import clear_output

In [7]:
%load_ext autoreload
%autoreload 1

import preprocessing
import utils
from models import Encoder, DecoderLinear, DecoderLSTM, Sequence2Sequence

%aimport preprocessing
%aimport models

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
device = torch.device("cuda")
BASE_PATH = "../data/Univariate_arff"
TRAIN = "_TRAIN"
TEST = "_TEST"

BIG_CONST = 100000
datasets = [name for name in listdir(BASE_PATH) if path.isdir(path.join(BASE_PATH, name))]

In [4]:
with open("../data/length_Univariate_arff.json") as f:
    lengths = json.load(f)

In [None]:
w = 3
k = 3
hidden_dim = 3
sample_size = 300

results = {name: {} for name in datasets}
try:
    for idx, problem in enumerate([name for name in datasets if lengths[name] < 200]):    
        length = lengths[problem]
        data_path = path.join(BASE_PATH, problem, problem)    
        pca = PCA(n_components=hidden_dim)
        X, y = preprocessing.get_dataset(data_path, length=length)

        X = X[:sample_size]
        y = y[:sample_size]
        train_set, test_set, valid_set = preprocessing.prepare_data(X, y, k, w, device)
        train_it = iter(train_set)
        for batch, _, _ in train_it:
            for x in batch.cpu().detach().numpy():
                pca = pca.fit(x)

        valid_it = iter(valid_set)
        batch, timeseries, labels = next(valid_it)
        batch = batch.cpu().detach().numpy()
        timeseries = timeseries.numpy()
        idxs = np.arange(len(timeseries)).reshape(-1, 1)

        scores_hidden = []
        scores_ts = []
        t = time()
        hiddens = np.stack([pca.transform(x) for x in batch])

        print(problem, batch.shape[0])
        scores_ts, scores_hidden = utils.classify(timeseries, hiddens, labels)
        results[problem]["ts"] = scores_ts
        results[problem]["hidden"] = scores_hidden
        results[problem]["shape"] = X.shape
        results[problem]["balance"] = np.unique(y, return_counts=True)
        print()

    with open("results_pca.json", "w") as f:
        json.dump(results, f, cls=utils.NumpyEncoder)

except Exception as exc:
    with open("results_pca.json", "w") as f:
        json.dump(results, f, cls=utils.NumpyEncoder)

    raise exc


# Experiments

* Select $sample\_size$ items of length $n$ from dataset .
* Split into train, test, valid.  
    P.S.: *Now valid set is unused.*
    
* Split each ts into sequence of small time-series  
    **Parameters**
    * $w$: shift size
    * $k$: 2*k - window size 
    * $h$: hidden dimension, dimension of each point after encoding
    
    The length of new sequence:
$$l = \dfrac{n-2k}{w} + 1 $$
    Encoder: 
$$
 \mathbf{R}^{l\times 2k} \rightarrow \mathbf{R}^{l\times h}
$$
    
* Train autoencoder or PCA on train test.
* Encode valid set
* Calculate distance matrix. (c engine is used only for univariate ts. So, we can't directly compare times)
* Classify hiddens and raw time series via KNN classifier.
    * Split valid dataset into classify_train, classify_test. Classify_test size = 0.7 * valid size.
    * KNN with k = 3 now.
    * Repeat many times
      

## PCA
* Valid size = $20\%$ of sample size.
* Train size = $75\%$ of sample size.

In [13]:
with open("results_pca.json") as f:
    results = json.load(f)

    
header = "{0:3s} | {1:^20s} | {2:^7s} | {3:^6s} | {4:^12s} | {5:^12s} ".format(
    "", "Problem", "N items", "Length", "Raw ts score", "Hidden score")
print(header)
print("-"*len(header))

for idx, problem in enumerate([name for name in datasets if lengths[name] < 200]):    
    length = lengths[problem]
    print("{:3d} | {:>20s} | {:7d} | {:6d} | {:.2f} +- {:.2f} | {:.2f} +- {:.2f} ".format(
        idx, problem[:20],
        results[problem]["shape"][0],
        results[problem]["shape"][1],
        np.mean(results[problem]["ts"]), np.std(results[problem]["ts"]),
        np.mean(results[problem]["hidden"]), np.std(results[problem]["hidden"]),    
    ))

    |       Problem        | N items | Length | Raw ts score | Hidden score 
----------------------------------------------------------------------------
  0 |                ACSF1 |     300 |     50 | 0.95 +- 0.01 | 0.94 +- 0.01 
  1 |                Adiac |     300 |    175 | 0.31 +- 0.04 | 0.26 +- 0.03 
  2 |   AllGestureWiimoteX |     300 |    115 | 0.46 +- 0.04 | 0.47 +- 0.04 
  3 |   AllGestureWiimoteY |     300 |    115 | 0.47 +- 0.03 | 0.49 +- 0.03 
  4 |   AllGestureWiimoteZ |     300 |    115 | 0.43 +- 0.04 | 0.41 +- 0.04 
  5 |            BeetleFly |     120 |    150 | 0.59 +- 0.05 | 0.58 +- 0.05 
  6 |                  BME |     180 |    128 | 0.93 +- 0.04 | 0.99 +- 0.03 
  7 |                  CBF |     300 |    128 | 1.00 +- 0.00 | 1.00 +- 0.01 
  8 |            Chinatown |     300 |     24 | 0.97 +- 0.01 | 0.97 +- 0.01 
  9 | ChlorineConcentratio |     300 |    150 | 0.45 +- 0.04 | 0.45 +- 0.04 
 10 |                 Crop |     300 |     46 | 1.00 +- 0.00 | 1.00 +- 0.00 