In [1]:
import sys
import os
# Put the parent of the notebook's directory at the front of the import path
# (presumably so that the `src.*` imports below resolve -- confirm).
notebook_dir = os.path.dirname("./ibopf_pipeline.ipynb")
main_path = os.path.abspath(os.path.join(notebook_dir, os.pardir))
sys.path.insert(0, main_path)

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import avocado
import time
from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import f_classif, SelectKBest, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.base import TransformerMixin, BaseEstimator

from src.preprocesing import gen_dataset, gen_dataset_from_h5
from src.transformation.document_transformer import DocumentGeneration, MPDocumentGenerator, DocumentSelector
from src.segmentation.window_slider import TwoWaysSlider
from src.transformation.count_words import count_words, merge_documents, Vectorizer

In [3]:
# Merged class name -> integer id used as the classification target.
# The six named classes are numbered 1..6 in this order; "Unknown" is 99.
merged_labels_to_num = {
    name: num
    for num, name in enumerate(
        ("Single microlens", "TDE", "Short period VS", "SN", "M-dwarf", "AGN"),
        start=1,
    )
}
merged_labels_to_num["Unknown"] = 99

# Raw label code -> merged class name.  Written grouped by merged class and
# then flattened; sorting by code reproduces the ascending key order.
merged_labels = dict(sorted(
    (code, name)
    for name, codes in (
        ("Single microlens", (6,)),
        ("TDE", (15,)),
        ("Short period VS", (16, 53, 92)),
        ("SN", (42, 52, 62, 64, 67, 90, 95)),
        ("M-dwarf", (65,)),
        ("AGN", (88,)),
        ("Unknown", (99,)),
    )
    for code in codes
))

# Load the dataset: per-object time series (`res`), their raw labels and metadata.
res, labels, metadata = gen_dataset_from_h5("plasticc_balanced_combined_classes_small_ddf")
bands = ["lsstg", "lssti", "lsstr", "lsstu", "lssty", "lsstz"]

# Twice the number of flux observations, summed over all objects.
spatial_comp = np.sum([2 * len(ts.observations["flux"]) for ts in res])

# Time span (last time stamp minus first) of every object.
spans = []
for ts in res:
    stamps = ts.observations["time"].to_numpy()
    spans.append(stamps[-1] - stamps[0])
time_durations = np.array(spans)
mean_time, std_time = np.mean(time_durations), np.std(time_durations)
print(mean_time, std_time)

# Average of `spatial_comp` per object, floored.
k = spatial_comp // len(res)
print("target k:", k)

# Map every raw label code to the integer id of its merged class.
labels_merged = np.array([merged_labels_to_num[merged_labels[raw]] for raw in labels])
print("classes:", np.unique(labels_merged))

Object: 100%|████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 300918.62it/s]

601.5589505662784 299.43984508617797
target k: 463
classes: [1 2 3 4 5 6]





In [22]:
# Settings forwarded to DocumentGeneration below (several are reused by the
# grid loop later in the notebook).
quantity = ["mean", "trend", "std"]
alph_size = [4, 4, 4]
win = 56
wl = 1
tol = 4
mean_bp_dist = "normal"
index_based_paa = False
num_reduction = False
threshold_missing = None
verbose = False  # defined here but not passed to DocumentGeneration in this cell

doc_gen = DocumentGeneration(
    win,
    word_length=wl,
    alph_size=alph_size,
    quantity=quantity,
    num_reduction=num_reduction,
    index_based_paa=index_based_paa,
    tol=tol,
    mean_bp_dist=mean_bp_dist,
    threshold_missing=threshold_missing,
)

In [21]:
# Sanity check: size reported by the generator next to (4*4*4 + 1) ** 1 computed by hand.
(doc_gen.bop_size, (4 ** 3 + 1) ** 1)

(65, 65)

In [5]:
# Build a Vectorizer from doc_gen.bop_size and the list of bands.
vec = Vectorizer(doc_gen.bop_size, bands)

In [23]:
# Run the single-configuration generator (doc_gen) over every time series in `res`.
corpus0 = doc_gen.transform(res)

[win: 56.000000.3, wl: 1, faileds: 0]: 100%|█████████████████████████████████████████| 600/600 [01:00<00:00,  9.99it/s]


In [9]:
# Vectorize the corpus produced by doc_gen and show it as a CSR matrix.
# The corpus built by the cell above is named `corpus0`; `corpus` only comes
# into existence inside the grid loop further down, so referring to it here
# fails on a fresh kernel (Restart & Run All).
sparse.csr_matrix(vec.transform(corpus0))

NameError: name 'vec' is not defined

# Precomputing the vectorized corpora in sparse form



In [22]:
# A (wl, win) configuration is discarded when more than 5% of the time series
# come back as None from the document generator.
limit = int(len(labels) * 0.05)
if __name__ == "__main__":
    wls = [2, 3, 4, 5, 6]
    # 40 log-spaced window widths, from 10**-1.95 up to 1 times (mean + std) of the time spans.
    wins = (mean_time + std_time) * 10 ** np.linspace(-1.95, 0, 40)
    # Parallel lists: values[i] is the sparse matrix obtained with out_wins[i] / out_wls[i].
    values, out_wins, out_wls = [], [], []
    ini = time.time()
    for wl in wls:
        for win in wins:
            # NOTE(review): `tol` receives the word length here, whereas the
            # single-configuration cell above passes tol=4 -- confirm this is intended.
            mp_doc_gen = MPDocumentGenerator(
                bands,
                win=win,
                n_jobs=6,
                word_length=wl,
                alph_size=alph_size,
                quantity=quantity,
                num_reduction=num_reduction,
                index_based_paa=index_based_paa,
                tol=wl,
                mean_bp_dist=mean_bp_dist,
                threshold_missing=threshold_missing,
            )
            vec = Vectorizer(mp_doc_gen.get_bop_size(), bands)
            corpus = np.array(mp_doc_gen.transform(res))

            # Count the time series that could not be represented.
            fails = sum(1 for doc in corpus if doc is None)
            if fails <= limit:
                values.append(sparse.csr_matrix(vec.transform(corpus)))
                out_wins.append(win)
                out_wls.append(wl)
            else:
                print("%d>%s time series failed to be represented, dropping sequence" % (fails, limit))
    end = time.time()
    print("TOTAL TIME:", end - ini)

HBox(children=(HTML(value='[win: 10.109, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 11.343, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 12.727, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 14.280, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 16.022, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 17.977, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 20.171, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 22.632, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 25.394, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 28.492, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 31.969, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 35.869, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 63.786, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 71.569, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 80.302, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 90.100, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 101.094, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 113.429, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 127.269, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 142.799, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 160.223, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 179.773, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 201.709, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 226.321, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 253.936, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


76>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 284.921, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


69>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 319.686, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


67>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 358.694, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


70>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 402.461, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


63>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 451.569, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


60>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 506.669, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


60>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 568.492, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


59>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 637.858, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 715.689, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 803.016, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 900.999, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 10.109, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


113>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 11.343, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


114>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 12.727, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


124>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 14.280, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


139>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 16.022, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 17.977, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 20.171, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 22.632, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 25.394, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 28.492, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 31.969, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 35.869, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 63.786, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 71.569, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 80.302, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 90.100, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 101.094, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 113.429, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 127.269, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 142.799, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 160.223, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 179.773, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


100>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 201.709, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


272>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 226.321, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


472>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 253.936, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


542>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 284.921, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


583>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 319.686, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


597>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 358.694, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


458>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 402.461, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


230>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 451.569, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


146>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 506.669, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


100>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 568.492, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


78>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 637.858, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


71>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 715.689, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


94>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 803.016, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


143>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 900.999, wl: 3]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


159>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 10.109, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


46>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 11.343, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


55>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 12.727, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


182>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 14.280, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


174>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 16.022, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


145>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 17.977, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


100>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 20.171, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 22.632, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 25.394, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 28.492, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 31.969, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 35.869, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 63.786, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 71.569, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 80.302, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 90.100, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 101.094, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 113.429, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 127.269, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 142.799, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 160.223, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 179.773, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 201.709, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 226.321, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 253.936, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


153>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 284.921, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


381>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 319.686, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


366>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 358.694, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


190>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 402.461, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


127>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 451.569, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


87>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 506.669, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


71>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 568.492, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


68>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 637.858, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


62>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 715.689, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


59>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 803.016, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


55>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 900.999, wl: 4]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 10.109, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


245>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 11.343, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


325>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 12.727, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


372>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 14.280, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


368>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 16.022, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


422>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 17.977, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


410>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 20.171, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


210>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 22.632, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


202>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 25.394, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


185>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 28.492, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


122>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 31.969, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 35.869, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 63.786, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 71.569, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 80.302, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 90.100, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 101.094, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 113.429, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 127.269, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 142.799, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 160.223, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 179.773, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 201.709, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


111>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 226.321, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


296>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 253.936, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


479>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 284.921, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


546>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 319.686, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


584>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 358.694, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


599>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 402.461, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


472>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 451.569, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


231>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 506.669, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


171>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 568.492, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


168>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 637.858, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


185>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 715.689, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


221>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 803.016, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


205>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 900.999, wl: 5]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


197>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 10.109, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


167>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 11.343, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


164>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 12.727, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


314>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 14.280, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


378>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 16.022, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


396>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 17.977, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


381>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 20.171, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


417>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 22.632, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


405>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 25.394, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


397>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 28.492, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


323>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 31.969, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


124>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 35.869, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 63.786, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 71.569, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 80.302, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 90.100, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 101.094, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 113.429, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 127.269, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 142.799, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 160.223, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 179.773, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 201.709, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 226.321, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 253.936, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


154>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 284.921, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


381>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 319.686, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


507>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 358.694, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


424>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 402.461, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


226>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 451.569, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


156>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 506.669, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


121>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 568.492, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


92>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 637.858, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


111>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 715.689, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


126>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 803.016, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


148>30 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 900.999, wl: 6]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


166>30 time series failed to be represented, dropping sequence
TOTAL TIME: 1670.794047832489


In [54]:
# Scratch arithmetic (= 150).
# NOTE(review): presumably number of bands times a per-band vocabulary size -- confirm or delete.
6 * 5 ** 2

150

In [None]:
# merge values by class
# For every precomputed matrix in `values`, add up the rows of all samples that
# share a merged label, giving one (n_classes, n_features) CSR matrix per entry.
# `class_rows[i]` is the position of sample i's label inside `classes`.
classes, class_rows = np.unique(labels_merged, return_inverse=True)
n_classes = len(classes)
class_values = []
for v in values:
    n = v.shape[0]
    # Sparse (n_classes, n) membership matrix: entry (c, i) is 1 when sample i
    # belongs to class c.  Multiplying it with `v` sums the rows per class.
    # This replaces the per-row `dense_row += v[i]` loop, an in-place update
    # NumPy cannot apply reliably to SciPy sparse rows, and it no longer
    # overwrites the notebook-level `k`.
    membership = sparse.csr_matrix(
        (np.ones(n, dtype=float), (class_rows[:n], np.arange(n))),
        shape=(n_classes, n),
    )
    class_values.append(sparse.csr_matrix(membership.dot(v)))
    
class ClassVectorizer(TransformerMixin):
    """Collapse per-sample matrices into per-class matrices.

    ``fit`` learns the sorted set of class labels and, for every training row,
    the index of its class. ``transform`` then sums the rows belonging to the
    same class, so each returned matrix has one row per class.
    """

    def __init__(self):
        self.classes = None    # sorted distinct labels seen in fit
        self.n_classes = None  # number of distinct labels
        self.k = None          # class index (into self.classes) of every fitted row

    def fit(self, x, y=None, **kwargs):
        """Learn the class set and the class index of every row.

        Parameters
        ----------
        x : ignored; accepted for API compatibility.
        y : array-like of labels, one per row of the matrices that will be
            passed to ``transform``.

        Returns
        -------
        self, so that ``fit(...).transform(...)`` and ``fit_transform`` work.

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("ClassVectorizer.fit requires the class labels `y`")
        # np.unique returns the sorted labels plus, for each element of y, its
        # position in that sorted array.
        self.classes, inverse = np.unique(np.asarray(y), return_inverse=True)
        self.n_classes = len(self.classes)
        self.k = np.ravel(inverse)
        return self

    def transform(self, x, **kwargs):
        """Sum the rows of each matrix in ``x`` by class.

        Parameters
        ----------
        x : iterable of 2-D matrices (dense or scipy sparse), each with one row
            per label given to ``fit``.

        Returns
        -------
        list of ``scipy.sparse.csr_matrix`` of shape ``(n_classes, n_columns)``,
        one per input matrix, with rows ordered as ``self.classes``.

        Raises
        ------
        ValueError
            If called before ``fit`` or if a matrix has a row count different
            from the number of fitted labels.
        """
        if self.k is None:
            raise ValueError("ClassVectorizer must be fitted before calling transform")

        n_fitted = len(self.k)
        # One-hot (n_classes x n_fitted) indicator: entry (c, i) is 1 when row i
        # belongs to class c, so `indicator @ v` adds up the rows of each class.
        indicator = sparse.csr_matrix(
            (np.ones(n_fitted), (self.k, np.arange(n_fitted))),
            shape=(self.n_classes, n_fitted),
        )

        class_values = []
        for v in x:
            n, m = v.shape
            if n != n_fitted:
                raise ValueError(
                    "matrix has %d rows but ClassVectorizer was fitted on %d labels"
                    % (n, n_fitted)
                )
            class_values.append(sparse.csr_matrix(indicator @ v, dtype=float))
        return class_values

In [62]:
# pipeline: selector -> TF-IDF -> feature selection -> normalization -> 1-NN

# vector space model (TF-IDF) settings
norm, use_idf, smooth_idf, sublinear_tf = (
    "l2",  # cosine normalization
    True,  # compute IDF vector
    True,  # prevent zero division, add bias
    True,  # use log-tf
)
vsm = TfidfTransformer(
    norm=norm,
    use_idf=use_idf,
    smooth_idf=smooth_idf,
    sublinear_tf=sublinear_tf,
)

# selector: picks one of the precomputed representations in `values`;
# its `idx` is the parameter tuned by the grid search
select = DocumentSelector(idx=-1, data=values, win_arr=out_wins, wl_arr=out_wls)

# reducers
reducer1 = VarianceThreshold()  # built but intentionally left out of the steps below
k = 149
reducer2 = SelectKBest(f_classif, k=k)  # alternative: TruncatedSVD(k)

# normalizer
normalizer = Normalizer()

# classifier
classifier = KNeighborsClassifier(n_neighbors=1)

# (name, step) pairs in execution order
pipe = list(zip(
    ("sel", "vsm", "feaRed", "norm", "classif"),
    (select, vsm, reducer2, normalizer, classifier),
))

pipeline = Pipeline(steps=pipe)

In [None]:
# Search over every precomputed representation: the only tuned parameter is
# the index handed to the "sel" step of the pipeline.
parameters = dict(sel__idx=np.arange(len(values)))

# The pipeline is fed sample indices rather than the time series themselves.
x = np.arange(len(res))
if __name__ == "__main__":
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        cv=10,
        n_jobs=6,
        verbose=1,
    )
    t0 = time.time()
    grid_search.fit(x, labels_merged)
    print("done in %0.3fs" % (time.time() - t0))
    

In [59]:
# Report the best mean cross-validated score and the parameters that produced it.
best_idx = grid_search.best_params_
best_acc = grid_search.best_score_
print(best_acc, best_idx)

0.46333333333333326 {'sel__idx': 75}


In [18]:
# 10-fold cross-validation of the pipeline, scored with balanced accuracy.
# NOTE(review): this cell's execution count (18) predates the pipeline cell
# (62) and it passes `res`, not the index array used by the grid search, so
# the recorded output presumably comes from an earlier pipeline — confirm.
scores = cross_val_score(
    estimator=pipeline,
    X=res,
    y=labels_merged,
    scoring='balanced_accuracy',
    cv=10,
)

Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:25<00:00, 21.15it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 20.65it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:25<00:00, 21.16it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 19.88it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:25<00:00, 20.83it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 21.64it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 22.14it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 21.15it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 21.87it/s]
Object:   3%|██▌                                                                        | 2/60 [00:00<00:03, 17.90it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 20.83it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 21.99it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 20.86it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 22.09it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 22.74it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 21.96it/s]
Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 22.92it/s]
Object:   0%|▎                                                                         | 2/540 [00:00<00:29, 18.13it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 21.88it/s]
Object:   0%|                                                                                   | 0/60 [00:00<?, ?it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 25.35it/s]
Object: 100%|████████████████████████████████████████████████████████████████████████| 540/540 [00:24<00:00, 22.03it/s]
Object:   3%|██▌                                                                        | 2/60 [00:00<00:03, 15.31it/s]

1 TIME SEIRES FAILED TO BE TRANSFORMED [WIN=100.000000.3, WL=4]


Object: 100%|██████████████████████████████████████████████████████████████████████████| 60/60 [00:02<00:00, 22.94it/s]


In [21]:
# Best fold score from the cross-validation run.
scores.max()

0.4166666666666667

3.229052643315694