In [5]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import gc
import h5py
import multiprocessing as mp

In [6]:
# Readable names for the boolean stored as the second item of each `files` entry.
WITH_LABELS, WITHOUT_LABELS = True, False

# (file stem, save_labels) pairs to process.
# Other inputs, currently disabled:
#   ('train', WITHOUT_LABELS)
#   ('train_95_genuine', WITHOUT_LABELS)
#   ('train_90_genuine', WITHOUT_LABELS)
files = [
    ('test', WITH_LABELS),
    ('train_full_genuine', WITHOUT_LABELS),
]

# Feature columns, in order: for each prefix (snd, rcv) the pos, spd, acl and
# hed quantities, each with an x and a y component.
# 'delta_time' is intentionally not included.
fields = [
    f'{side}_{quantity}_{axis}'
    for side in ('snd', 'rcv')
    for quantity in ('pos', 'spd', 'acl', 'hed')
    for axis in ('x', 'y')
]

def normalize3(a, min_a, max_a, eps=1e-4):
    """Min-max scale ``a`` with the given bounds.

    Computes ``(a - min_a) / (max_a - min_a + eps)`` using plain arithmetic
    operators, so scalars and NumPy arrays are handled alike (arrays follow
    the usual broadcasting rules).

    Args:
        a: Value(s) to scale.
        min_a: Lower bound(s) subtracted from ``a``.
        max_a: Upper bound(s); together with ``min_a`` defines the range.
        eps: Small constant added to the range so the denominator is not zero
            when ``max_a == min_a``. Because of it, ``a == max_a`` maps to a
            value slightly below 1. Defaults to ``1e-4``.

    Returns:
        The scaled value(s).
    """
    return (a - min_a) / (max_a - min_a + eps)

# Sliding-window parameters: each sequence holds `window_size` consecutive
# messages, and consecutive windows start `stride` rows apart.
window_size = 20
stride = 10
# The output name encodes the windowing parameters; for the values above this
# is the same path as the previously hard-coded "veremi-mixall-20-10.h5".
output_path = f"out_veremi/veremi-mixall-{window_size}-{stride}.h5"

V = pd.read_feather('out_veremi/veremi-mixall.feather')

# Per dataset_id: the largest attack_type when the dataset contains exactly
# two distinct attack types, otherwise 0.
datasets = V.groupby('dataset_id').attack_type.unique()
dataset_to_main_attack = {
    key: int(attacks.max()) if len(attacks) == 2 else 0
    for key, attacks in zip(datasets.index, datasets)
}

# Column-wise min/max over the complete mixed file; used to scale every split.
all_seqs = V[fields]
min_a = all_seqs.min(axis=0).to_numpy()
max_a = all_seqs.max(axis=0).to_numpy()
del all_seqs

for file, save_labels in files:
    df = pd.read_feather(f'out_veremi/{file}-mixall.feather')

    # One group per (dataset, sender, attack type), largest groups first.
    # Alternative grouping that also splits by receiver:
    # grouped = df.groupby(['dataset_id', 'sender', 'receiver', 'attack_type'])
    grouped = df.groupby(['dataset_id', 'sender', 'attack_type'])
    grouped_size = grouped.size().sort_values(ascending=False)
    sorted_order = grouped_size.index

    # Number of full windows per group: floor((n - window_size) / stride) + 1,
    # or 0 when the group has fewer than `window_size` rows.
    diff = (grouped_size - window_size) // stride
    num_windows_all = np.where(diff >= 0, diff + 1, 0)
    del diff

    count_seqs = num_windows_all.sum()
    # Layout is (position in window, sequence index, field).
    sequences = np.empty((window_size, count_seqs, len(fields)), dtype=np.float32)
    # Disk-backed alternative:
    #sequences = np.memmap('sequences.memmap', dtype='float32', mode='w+', shape=(window_size, count_seqs, len(fields)))
    if save_labels:
        # Columns: attack_type of the group, main attack of its dataset, group index.
        labels = np.empty((count_seqs, 3), dtype=np.uint32)
    seq_idx = 0
    print(len(sorted_order), len(num_windows_all))
    for group_id, (k, num_windows) in tqdm(enumerate(zip(sorted_order, num_windows_all)), total=len(num_windows_all)):
        if num_windows <= 0:
            continue
        t = grouped.get_group(k)
        t = t.sort_values(['bsm_rcv_time', 'receiver'])
        attack_type = k[-1]

        # Normalize the whole group once. Windows overlap whenever
        # stride < window_size, so normalizing per window would redo the same
        # rows; the scaling is elementwise, so the stored values are unchanged.
        array = normalize3(t[fields].to_numpy(), min_a=min_a, max_a=max_a)
        if save_labels:
            label_row = (attack_type, dataset_to_main_attack[k[0]], group_id)
        for i in range(num_windows):
            start_idx = i * stride
            end_idx = start_idx + window_size
            sequences[:, seq_idx] = array[start_idx : end_idx]
            if save_labels:
                labels[seq_idx] = label_row
            seq_idx += 1

        # Rows after the last full window are intentionally dropped (truncated);
        # no shorter trailing window is emitted.

        del array
        del t

    # The GroupBy object keeps a reference to `df`, so it has to go as well
    # for the frame to be released before the (memory-heavy) HDF5 write.
    del grouped_size
    del sorted_order
    del grouped
    del df

    gc.collect()

    # Open the output once per split; an existing dataset of the same name is
    # replaced so the cell can be re-run.
    with h5py.File(output_path, "a") as hf:
        if save_labels:
            ds_name = f"{file}_labels"
            if ds_name in hf:
                del hf[ds_name]
            hf.create_dataset(ds_name, data=labels, compression="gzip")
            del labels

        ds_name = file
        if ds_name in hf:
            del hf[ds_name]
        hf.create_dataset(ds_name, data=sequences, compression="gzip")
    del sequences

4966 4966


  0%|          | 0/4966 [00:00<?, ?it/s]

13990 13990


  0%|          | 0/13990 [00:00<?, ?it/s]