In [2]:
import hdbscan
import pandas as pd
import tsfel
import pickle
from utils.preprocess import *

In [3]:
def ts_extract_features(ts_df:pd.DataFrame) -> pd.DataFrame:
    """Extract tsfel statistical-domain features for every ``NPLV`` group.

    The rows of ``ts_df`` are split by the value of their ``NPLV`` column. Each
    group, with the ``NPLV`` column removed, is handed to
    ``tsfel.time_series_features_extractor`` and the returned feature frame is
    tagged with the group's ``NPLV`` value.

    Args:
        ts_df: Frame holding an ``NPLV`` column plus the signal columns the
            features are computed from.

    Returns:
        The per-group feature frames concatenated row-wise, in order of first
        appearance of each ``NPLV`` value, with an added ``NPLV`` column. Each
        per-group frame keeps the index tsfel gave it (no re-indexing). An
        empty ``DataFrame`` is returned when ``ts_df`` has no ``NPLV`` values.
    """
    header_names = ts_df.drop(['NPLV'], axis = 1).columns
    cfg_file = tsfel.get_features_by_domain('statistical')

    # Collect the per-group frames and concatenate once at the end. Growing a
    # DataFrame with pd.concat inside the loop re-copies every accumulated row
    # on each iteration and needs an empty frame as the seed.
    per_group_features = []

    # NOTE(review): tqdm is not imported in the visible import cell; presumably
    # it arrives through `from utils.preprocess import *` - confirm.
    for NPLV in tqdm(ts_df.NPLV.unique()):
        curr_ts = ts_df[ts_df['NPLV'] == NPLV]
        curr_ts = curr_ts.drop(['NPLV'], axis = 1)

        feat_for_cts = tsfel.time_series_features_extractor(cfg_file, curr_ts, header_names = header_names, verbose=0)
        feat_for_cts['NPLV'] = NPLV

        per_group_features.append(feat_for_cts)

    # pd.concat raises on an empty list, so keep the "no groups" result an
    # empty frame explicitly.
    if not per_group_features:
        return pd.DataFrame()

    return pd.concat(per_group_features)


In [4]:
# Load the lom and sip tables for the train and test splits, in that order.
lom_train, lom_test, sip_train, sip_test = map(
    pd.read_csv,
    (
        'data/cat_lom_train.csv',
        'data/cat_lom_test.csv',
        'data/cat_sip_train.csv',
        'data/cat_sip_test.csv',
    ),
)

In [8]:
# For each table, list the columns that exist in one split but not in the
# other (set difference, taken in both directions).
drop_lom_train = list(set(lom_train.columns) - set(lom_test.columns))
drop_lom_test = list(set(lom_test.columns) - set(lom_train.columns))
drop_sip_train = list(set(sip_train.columns) - set(sip_test.columns))
drop_sip_test = list(set(sip_test.columns) - set(sip_train.columns))

In [9]:
# Remove the split-only columns so that the train and test frames of each
# table end up with the same set of columns.
lom_train = lom_train.drop(columns=drop_lom_train)
lom_test = lom_test.drop(columns=drop_lom_test)
sip_train = sip_train.drop(columns=drop_sip_train)
sip_test = sip_test.drop(columns=drop_sip_test)

In [10]:
# Load the produv time-series tables for both splits.
produv_train, produv_test = (
    pd.read_csv('data/reduced_produv_train.csv'),
    pd.read_csv('data/reduced_produv_test.csv'),
)

In [11]:
# Run the same preprocessing, keyed on the 'SEC' argument, over the train
# split and then the test split.
preproc_produv_train, preproc_produv_test = (
    ts_preproc(split_df, 'SEC') for split_df in (produv_train, produv_test)
)

In [12]:
# Extract per-NPLV features for the train split and then the test split.
extr_produv_train, extr_produv_test = (
    ts_extract_features(split_df)
    for split_df in (preproc_produv_train, preproc_produv_test)
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2063/2063 [00:20<00:00, 102.59it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 780/780 [00:06<00:00, 117.15it/s]


In [13]:
# Feature selection over the extracted train/test feature frames; the helper
# returns a (train, test) pair.
# NOTE(review): ts_select_features is not defined in this notebook; presumably
# it comes from `from utils.preprocess import *` - confirm, and confirm that it
# keeps the NPLV column, which the merges that follow rely on.
select_produv_train, select_produv_test = ts_select_features(extr_produv_train, extr_produv_test)

In [14]:
# Join the selected time-series features with the lom and sip train tables on
# the shared NPLV key. Column-name clashes keep the left-hand name and get a
# '_lom' / '_sip' suffix on the right-hand column.
# merge() uses its default inner join here, so an NPLV missing from either
# table is dropped and the resulting row count can differ from the input's.
cluster_train = (
    select_produv_train
    .merge(lom_train, on='NPLV', suffixes=('', '_lom'))
    .merge(sip_train, on='NPLV', suffixes=('', '_sip'))
)

In [15]:
# Join the selected time-series features with the lom and sip test tables on
# the shared NPLV key. Column-name clashes keep the left-hand name and get a
# '_lom' / '_sip' suffix on the right-hand column.
# merge() uses its default inner join here, so an NPLV missing from either
# table is dropped and the resulting row count can differ from the input's.
cluster_test = (
    select_produv_test
    .merge(lom_test, on='NPLV', suffixes=('', '_lom'))
    .merge(sip_test, on='NPLV', suffixes=('', '_sip'))
)

In [16]:
# Stack the train rows on top of the test rows so both splits are clustered
# together. The train-then-test row order matters: the labels are later split
# back into train and test parts by position.
cluster = pd.concat((cluster_train, cluster_test))

In [17]:
# Move NPLV from a column to the index, so the frame passed to the clusterer
# no longer carries it as a feature column.
# NOTE(review): this cell rebinds `cluster`, so re-running it on its own fails
# because the NPLV column is already gone.
cluster = cluster.set_index('NPLV')

In [18]:
# Fit HDBSCAN on the combined train+test feature frame and keep one cluster
# label per row.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10)
clusterer.fit(cluster)
cluster_labels = clusterer.labels_

In [21]:
# Split the joint labelling back into its train and test parts and pickle each.
# `cluster` was built as train rows followed by test rows, so the boundary is
# the number of rows in `cluster_train`. It is derived here rather than
# hard-coded, because the merges that build `cluster_train` can change its row
# count, and a fixed boundary would then misassign labels between the splits.
n_train = len(cluster_train)

cluster_labels_train = cluster_labels[:n_train]
with open('clustering_train.pickle', 'wb') as f:
    pickle.dump(cluster_labels_train, f)

cluster_labels_test = cluster_labels[n_train:]
with open('clustering_test.pickle', 'wb') as f:
    pickle.dump(cluster_labels_test, f)