In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from time import time
from numpy.random import choice, shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from dtaidistance import dtw, dtw_c, dtw_ndim


from IPython.display import clear_output

In [2]:
%load_ext autoreload
%autoreload 1

from preprocessing import *
from models import Encoder, Decoder, Sequence2Sequence

%aimport preprocessing
%aimport models

In [3]:
device = torch.device("cuda")

def get_dataset(data_path, length=100):
    df_train = pd.read_csv(data_path + TRAIN + ".txt", header=None, delim_whitespace=True)
    df_test = pd.read_csv(data_path + TEST + ".txt", header=None, delim_whitespace=True)
        
    df = pd.concat([df_test, df_train], ignore_index=True, sort=False)
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    if np.isnan(X).any():
        timeseries = []
        labels = []
        for label, x in zip(y, X):
            ts = slice_timeseries(x[~np.isnan(x)], 100)
            timeseries.extend(ts)
            labels.extend(np.repeat(label, len(ts)))

        X = np.asarray(timeseries)
        y = np.asarray(labels)
    else:
        X = slice_timeseries(X.swapaxes(1, 0), min(length, X.shape[-1])).swapaxes(1, 2)
        X = np.vstack(X)
        y = np.repeat(y, X.shape[0] // y.shape[0])
    
    assert not np.isnan(X).any(), "Stand from under! NAN in data!"

    return zscore(X, 1), y


def prepare_data(X, y, k, w):
    ds = SplittedDataset(X, y, k, w, device=device)
    train_ds, test_ds, valid_ds = train_test_valid_split(ds, 0.05, 0.75)

    train_set = DataLoader(train_ds, batch_size=1024, shuffle=True)
    test_set = DataLoader(test_ds, batch_size=1024, shuffle=True)
    valid_set = DataLoader(valid_ds, batch_size=256, shuffle=True)
    
    return train_set, test_set, valid_set

In [4]:
def get_model(hidden_dim, k, n_layers):
    input_dim = 2*k
    enc = Encoder(input_dim, hidden_dim, 1, n_layers, False)
    dec = Decoder(hidden_dim, input_dim, 1, n_layers)

    model = Sequence2Sequence(enc, dec)
    model.to(device)
    
    return model

def train(model, train_ds, optim, loss_fn, valid_ds, n_step):
    model.train()
    for step in range(n_step):
        it = iter(train_ds)
        train_loss = 0.
        for batch, _, _ in it:
            batch = batch.permute(1, 0, 2)
            optim.zero_grad()
            out = model(batch)
            loss = loss_fn(batch, out)

            loss.backward()
            optim.step()
            train_loss += loss.cpu().detach().numpy()

        if (step+1) % 500 == 0:
            valid_loss = valid(model, valid_ds, loss_fn)
            print("{:4d}: train loss: {:.3f}; valid loss: {:.3f}".format(step+1, train_loss, valid_loss))
            model.train()


def valid(model, valid_ds, loss_fn):
    model.eval()
    with torch.no_grad():
        loss = 0.
        it = iter(valid_ds)
        for batch, _, _ in it:
            batch = batch.permute(1, 0, 2)
            out = model(batch)
            loss += loss_fn(batch, out)
    
    return loss.cpu().detach().numpy()

In [5]:
def _metric(id1, id2, matrix):
    """
    Return distance between elements from distance matrix
    """
    x, y = int(min(id1[0], id2[0])), int(max(id1[0], id2[0]))
    return matrix[x, y]


def get_metric(matrix):
    """
    Return distance function from distance matrix
    """
    return lambda id1, id2: _metric(id1, id2, matrix)


def workflow(w, k, hidden_dim, n_layers, length, n_step=6000):
    """
    Plan of the experiment
    """
    X, y = get_dataset(length)
    train_set, test_set, valid_set = prepare_data(X, y, k, w)
    model = get_model(hidden_dim, k, n_layers)
    loss_fn = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters())
    train(model, train_set, optim, loss_fn, test_set, n_step)
    torch.save(model.state_dict(), "../data/w={};k={};nl={};length={};h={}".format(w, k, n_layers, length, hidden_dim))
    valid(model, test_set, loss_fn)
    valid_it = iter(valid_set)
    batch, timeseries, labels = next(valid_it)
    timeseries = timeseries.numpy()

    scores_ts = []
    scores_hidden = []

    t = time()
    matrix_ts = dtw.distance_matrix(timeseries, use_c=True)
    print("raw_ts: {:.3f}".format(time() - t))
    t = time()
    hiddens = model.encoder(batch.permute(1, 0, 2))[0].permute(1, 0, 2).detach().cpu().numpy()
    matrix_hidden = dtw_ndim.distance_matrix(hiddens)
    print("hidden_ts: {:.3f}".format(time() - t))
    idxs = np.arange(len(timeseries)).reshape(-1, 1)

    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(idxs, labels.cpu().numpy(), test_size=0.4)

        clf = KNeighborsClassifier(metric=get_metric(matrix_ts), algorithm="brute")
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores_ts.append(score)
        clf = KNeighborsClassifier(metric=get_metric(matrix_hidden), algorithm="brute")
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores_hidden.append(score)

    print("Raw ts score: {:.3f} +- {:.3f}".format(np.mean(scores_ts), np.std(scores_ts)))    
    print("Hidden ts score: {:.3f} +- {:.3f}".format(np.mean(scores_hidden), np.std(scores_hidden)))
    
    return model, train_set, test_set, valid_set

In [6]:
from os import listdir, path
from arff2pandas import a2p
from tqdm import tqdm
from sklearn.decomposition import PCA


In [None]:
BASE_PATH = "../data/Univariate_arff"
TRAIN = "_TRAIN"
TEST = "_TEST"

datasets = listdir(BASE_PATH)
w = 3
k = 4
hidden_dim = 3
length = 100
sample_size = 220

header = "{0:3s} | {1:^20s} | {2:^4s} | {3:^4s}".format("", "Problem", "Items", "Length")
print(header)
print("-"*len(header))
for idx, problem in enumerate(datasets):    
    data_path = path.join(BASE_PATH, problem, problem)    
    pca = PCA(n_components=hidden_dim)
    X, y = get_dataset(data_path)
    print("{0:3d} | {1:>20s} | {2:5d} | {3:6d}".format(
        idx, problem[:20], X.shape[0], X.shape[1]))
    X = X[:sample_size]
    y = y[:sample_size]
    train_set, test_set, valid_set = prepare_data(X, y, k, w)
    train_it = iter(train_set)
    batch, timeseries, labels = next(train_it)
    batch = batch.cpu().detach().numpy()
    for x in batch:
        pca = pca.fit(x)
    
    valid_it = iter(valid_set)
    batch, timeseries, labels = next(valid_it)
    batch = batch.cpu().detach().numpy()
    timeseries = timeseries.numpy()
    idxs = np.arange(len(timeseries)).reshape(-1, 1)

    scores_hidden = []
    scores_ts = []
    t = time()
    hiddens = np.stack([pca.transform(x) for x in batch])
    matrix_hidden = dtw_ndim.distance_matrix(hiddens)
    print("hidden_ts: {:.3f}".format(time() - t))
    
    t = time()
    matrix_ts = dtw.distance_matrix(timeseries, use_c=True)
    print("raw_ts: {:.3f}".format(time() - t))
    
    for i in range(100):
        X_train, X_test, y_train, y_test = train_test_split(idxs, labels.cpu().numpy(), test_size=0.7)
        clf = KNeighborsClassifier(metric=get_metric(matrix_ts), algorithm="brute", n_neighbors=3)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores_ts.append(score)
        clf = KNeighborsClassifier(metric=get_metric(matrix_hidden), algorithm="brute", n_neighbors=3)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores_hidden.append(score)
    
    print("Raw ts score: {:.3f} +- {:.3f}".format(np.mean(scores_ts), np.std(scores_ts)))    
    print("Hidden ts score: {:.3f} +- {:.3f}".format(np.mean(scores_hidden), np.std(scores_hidden)))

    

# problem = datasets[88]
# data_path = path.join(BASE_PATH, problem, problem)
# X, y = get_dataset(data_path)
# X.shape

    |       Problem        | Items | Length
-------------------------------------------
  0 |                ACSF1 |  2800 |    100
hidden_ts: 170.316
raw_ts: 259.027
Raw ts score: 0.576 +- 0.046
Hidden ts score: 0.570 +- 0.043
  1 |                Adiac |   781 |    100
hidden_ts: 167.534


In [None]:
X.swapaxes(1, 0)