In [1]:
import sys
import os
# Put the parent of the directory named by "./test_pipeline.ipynb" (resolved
# against the current working directory) at the front of the import search
# path, so the `src` package imports in this notebook can be found there.
main_path = os.path.abspath(
    os.path.join(os.path.dirname("./test_pipeline.ipynb"), os.pardir)
)
sys.path[:0] = [main_path]

In [2]:
from src.preprocesing import gen_dataset, gen_dataset_from_h5
from src.pipelines import PipelineBuilder
from src.feature_extraction.text import MPTextGenerator, CountVectorizer


import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy import sparse
import time

In [3]:
# Merged class name -> integer id assigned to that class ("Unknown" maps to 99).
merged_labels_to_num = dict([
    ("Single microlens", 1),
    ("TDE", 2),
    ("Short period VS", 3),
    ("SN", 4),
    ("M-dwarf", 5),
    ("AGN", 6),
    ("Unknown", 99),
])

# Integer label code -> merged class name. Several codes collapse into the
# same class (e.g. seven different codes all map to "SN").
merged_labels = dict([
    (6, "Single microlens"),
    (15, "TDE"),
    (16, "Short period VS"),
    (42, "SN"),
    (52, "SN"),
    (53, "Short period VS"),
    (62, "SN"),
    (64, "SN"),
    (65, "M-dwarf"),
    (67, "SN"),
    (88, "AGN"),
    (90, "SN"),
    (92, "Short period VS"),
    (95, "SN"),
    (99, "Unknown"),
])

# Load the dataset; the loader returns three values, bound here as the series
# collection, their labels, and the accompanying metadata.
res, labels, metadata = gen_dataset_from_h5("plasticc_balanced_combined_classes_small_ddf")
bands = ["lsstg", "lssti", "lsstr", "lsstu", "lssty", "lsstz"]

# Twice the number of "flux" entries, accumulated over every series.
spatial_comp = np.sum([2 * len(ts.observations["flux"]) for ts in res])

# Per-series difference between the last and the first "time" value.
time_durations = np.array([
    stamps[-1] - stamps[0]
    for stamps in (ts.observations["time"].to_numpy() for ts in res)
])
mean_time = time_durations.mean()
std_time = time_durations.std()
print(mean_time, std_time)

# Floor of the doubled entry count averaged over the number of series.
k = spatial_comp // len(res)
print("target k:", k)

# Map each label code to its merged class name, then to that class's integer id.
labels_merged = np.array([merged_labels_to_num[merged_labels[raw]] for raw in labels])
print("classes:", np.unique(labels_merged))

Object: 100%|████████████████████████████████████████████████████████████████████| 600/600 [00:00<00:00, 204850.01it/s]

601.5589505662784 299.43984508617797
target k: 463
classes: [1 2 3 4 5 6]





In [11]:
# Settings read by the parameter-sweep cell that follows.
_BANDS = ["lsstg", "lssti", "lsstr", "lsstu", "lssty", "lsstz"]
n_jobs = 6
verbose = False

alph_size = 4
quantity = "mean"
mean_bp_dist = "normal"
num_reduction = True
index_based_paa = False
irr_handler = "supp_interp"

# Failure budget: two series per distinct value found in labels_merged.
limit = 2 * len(np.unique(labels_merged))
limit

12

In [12]:
if __name__ == "__main__":
    # Sweep every (word length, window width) pair. For each pair the series in
    # `res` are turned into a corpus by MPTextGenerator and then into a sparse
    # count matrix by CountVectorizer. Pairs for which more than `limit` series
    # come back as None are reported and skipped.
    #
    # Results are three parallel lists: values[i] is the CSR matrix obtained
    # with window out_wins[i] and word length out_wls[i].
    wls = [2, 3, 4, 5, 6]
    # 40 window widths, log-spaced between 10**-1.95 and 10**0 times
    # (mean_time + std_time).
    wins = (mean_time + std_time) * 10 ** np.linspace(-1.95, 0, 40)
    values = []
    out_wins = []
    out_wls = []
    ini = time.time()
    for wl in wls:
        for win in wins:
            # threshold / tol are not consumed anywhere in this cell; they are
            # kept because, at notebook scope, other cells may read them.
            threshold = max(1, int(round(wl / 2)))
            tol = wl * 1
            # Use the configured alph_size for the generator as well, so it
            # always matches the alphabet size given to the vectorizer below
            # (previously the generator had a hard-coded 4).
            gen = MPTextGenerator(bands=_BANDS, n_jobs=n_jobs, alph_size=alph_size,
                                 quantity=quantity, num_reduction=num_reduction,
                                 irr_handler=irr_handler, index_based_paa=index_based_paa,
                                 mean_bp_dist=mean_bp_dist, verbose=verbose, win=win, word_length=wl)
            # NOTE(review): empty_handler receives the irr_handler value —
            # confirm this is the intended argument.
            vec = CountVectorizer(alph_size=alph_size, word_length=wl, empty_handler=irr_handler, bands=_BANDS)
            corpus = np.array(gen.transform(res))
            # A None entry marks a series the generator could not represent.
            fails = sum(1 for doc in corpus if doc is None)

            if fails > limit:
                print("%d>%s time series failed to be represented, dropping sequence" % (fails, limit))
                continue
            matrix = sparse.csr_matrix(vec.transform(corpus))
            values.append(matrix)
            out_wins.append(win)
            out_wls.append(wl)
    # Wall-clock end of the sweep; elapsed seconds are end - ini.
    end = time.time()

HBox(children=(HTML(value='[win: 10.109, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


160>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 11.343, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


155>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 12.727, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


306>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 14.280, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


324>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 16.022, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


324>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 17.977, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


371>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 20.171, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


401>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 22.632, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


392>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 25.394, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


385>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 28.492, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


78>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 31.969, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))


25>12 time series failed to be represented, dropping sequence


HBox(children=(HTML(value='[win: 35.869, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 40.246, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 45.157, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 50.667, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(HTML(value='[win: 56.849, wl: 2]'), FloatProgress(value=0.0, max=600.0), HTML(value='')))




KeyboardInterrupt: 