In [1]:
# gdown is needed to download the dataset archive from Google Drive.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q gdown

In [2]:
# Download the dataset archive from Google Drive by file ID
# (it is saved as Data.zip, per the recorded output below).
!gdown 1TMj8Yh34tYQ3qXtDzQeTPwrrPOmSd5bf
# Extract quietly (-q) into ./datasets, overwriting existing files without prompting (-o).
!unzip -qo Data.zip -d datasets
# The archive is no longer needed once extracted.
!rm -rf Data.zip

Downloading...
From (original): https://drive.google.com/uc?id=1TMj8Yh34tYQ3qXtDzQeTPwrrPOmSd5bf
From (redirected): https://drive.google.com/uc?id=1TMj8Yh34tYQ3qXtDzQeTPwrrPOmSd5bf&confirm=t&uuid=0cd1fa75-9340-441b-b478-59d977680a40
To: /home/jovyan/RAINCOAT/Data.zip
100%|█████████████████████████████████████████| 469M/469M [00:04<00:00, 105MB/s]


In [11]:
# Clone the SASA repository; its datasets/Boiler folder is the source of the
# Boiler CSV files converted further down in this notebook.
!git clone https://github.com/DMIRLAB-Group/SASA.git

Cloning into 'SASA'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 47 (delta 18), reused 2 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (47/47), 12.13 MiB | 2.37 MiB/s, done.
Resolving deltas: 100% (18/18), done.


In [12]:
import os, torch
import pandas as pd
import numpy as np
from tqdm import tqdm

In [5]:
# For unknown reasons, HAR's sample tensors are stored as
# [batch x features x timeseries]. This cell swaps the last two dimensions
# to obtain the consistent layout of [batch x timeseries x features].

# If this, and only this, dataset fails, comment out the code below.
#
# WARNING: every file is overwritten in place, so this cell is NOT idempotent:
# running it a second time swaps the two axes back to the original layout.

FOLDER = './datasets/HAR'
for file in tqdm(sorted(os.listdir(FOLDER)), desc="HAR batches tranposal"):
    filename = os.path.join(FOLDER, file)
    # weights_only=False permits unpickling arbitrary objects; only use on trusted files.
    batch = torch.load(filename, weights_only=False)
    # Keep axis 0, swap axes 1 and 2.
    # NOTE(review): the tuple-of-axes call form suggests 'samples' is a NumPy array — confirm.
    batch['samples'] = batch['samples'].transpose((0, 2, 1))
    # Write the modified batch back to the same path it was loaded from.
    torch.save(batch, filename)
    del batch

HAR batches tranposal: 100%|██████████| 60/60 [00:01<00:00, 32.94it/s]


In [13]:
def datasets_verification(datasets_dir : str):
    """Check that every '.pt' file under a directory has matching sample/label counts.

    Walks `datasets_dir` recursively, loads each file whose name contains
    '.pt' with `torch.load`, and compares the first dimension of its
    'samples' and 'labels' entries. Failing files are reported as they are
    found and a pass/fail summary is printed at the end.

    Args:
        datasets_dir: Root directory to search.

    Returns:
        None. Results are only printed.
    """
    files = [
        os.path.join(root, filename)
        for root, _, filenames in os.walk(datasets_dir)
        for filename in filenames
        if '.pt' in filename
    ]

    # Without this guard the percentage computation below divides by zero
    # when the directory is empty, missing, or mistyped.
    if not files:
        print(f"No '.pt' files found under '{datasets_dir}'.")
        return

    passed, failed = [], []
    for file in tqdm(files, desc="Files verification"):
        # weights_only=False permits unpickling arbitrary objects;
        # only run this on dataset files from a trusted source.
        tensor = torch.load(file, weights_only=False)
        if tensor['samples'].shape[0] == tensor['labels'].shape[0]:
            passed.append(file)
        else:
            failed.append(file)
            print(f"File '{file}' has failed the check.")
        # Release the loaded batch before loading the next one.
        del tensor

    print()
    print(f"Passed:{len(passed):>4} ({len(passed)/len(files):.2%})")
    print(f"Failed:{len(failed):>4} ({len(failed)/len(files):.2%})")
    print(f" Total:{len(files):>4}")

In [14]:
def boiler_translator(source_folder : str, target_folder : str,
                      batch_size : int = 128, seq_length : int = 36,
                      step_length : int = None, debug : bool = True):
    """Convert Boiler CSV files into batched '.pt' files of windowed samples.

    Every file found recursively under `source_folder` (except paths that
    contain '#') is read with pandas. The first two columns are dropped, the
    last remaining column is used as the label and the others as features.
    Each file is cut into windows of `seq_length` rows taken every
    `step_length` rows; trailing rows that do not fill a whole window are
    discarded. Windows are grouped into batches of at most `batch_size` and
    each batch is saved as a dict {'samples': ..., 'labels': ...} to
    `<target_folder>/<split>_<n>.pt`, where `<split>` is the CSV base name
    without extension and `<n>` counts batches per split across all files.

    Args:
        source_folder: Directory that is searched recursively for input files.
        target_folder: Output directory; created (with parents) if missing.
        batch_size: Maximum number of windows stored in one output file.
        seq_length: Number of rows in one window.
        step_length: Distance in rows between window starts. Defaults to
            `seq_length`, i.e. non-overlapping windows.
        debug: If True, print the shapes of every batch that is written.

    Returns:
        None. Output is written to disk.
    """
    if step_length is None:
        step_length = seq_length

    # makedirs also creates missing parent directories and tolerates an existing target.
    os.makedirs(target_folder, exist_ok=True)

    # Sorted so that batch numbering does not depend on filesystem listing order.
    files = sorted(
        os.path.join(root, filename)
        for root, _, filenames in os.walk(source_folder)
        for filename in filenames
    )
    files = [file for file in files if '#' not in file]

    # Next batch number per split name; shared across all source files.
    index = {}
    for file in files:
        split_type = os.path.basename(file).split('.')[0]
        data = pd.read_csv(file).iloc[:, 2:].to_numpy() # Remove timestamps and boilers' #
        total = data.shape[0]

        features, labels = [], []
        # `end_index` is exclusive, so it may equal `total`: a final complete
        # window is kept, while an incomplete tail is truncated.
        for end_index in range(seq_length, total + 1, step_length):
            start_index = end_index - seq_length
            features.append(data[start_index:end_index, :-1])
            # The window label is that of its last row, i.e. end_index - 1
            # (data[end_index] would be the first row after the window).
            labels.append(data[end_index - 1, -1])

        # np.stack raises on an empty list, so skip files shorter than one window.
        if not features:
            print(f"File '{file}' has fewer than {seq_length} rows; skipped.")
            continue
        features, labels = np.stack(features), np.stack(labels)

        for start_index in range(0, features.shape[0], batch_size):
            # Slicing clamps to the array length, so the last batch may be smaller.
            end_index = start_index + batch_size
            datapack = {
                'samples': features[start_index:end_index, :, :],
                'labels': labels[start_index:end_index],
            }
            batch_number = index.get(split_type, 0)
            torch.save(datapack, os.path.join(target_folder, f"{split_type}_{batch_number}.pt"))
            index[split_type] = batch_number + 1
            if debug:
                print(f"| Samples shape: {datapack['samples'].shape}", end='')
                print(f"\t| Labels shape: {datapack['labels'].shape}\t|")

In [15]:
# Convert the Boiler CSV files from the cloned SASA repository into batched
# '.pt' files under ./datasets/Boiler.
boiler_translator('./SASA/datasets/Boiler', './datasets/Boiler')

| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (116, 36, 20)	| Labels shape: (116,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (128,)	|
| Samples shape: (128, 36, 20)	| Labels shape: (

In [16]:
# The cloned repository was only needed as the source of the Boiler CSV files; remove it.
!rm -rf SASA

In [17]:
# Sanity check: every '.pt' file under ./datasets must have as many labels as samples.
datasets_verification('./datasets')

Files verification: 100%|██████████| 250/250 [00:05<00:00, 47.10it/s] 


Passed: 250 (100.00%)
Failed:   0 (0.00%)
 Total: 250



