# Dataset Resample Notebook
This notebook contains the code to resample the given datasets.

In [None]:
# dataset reduction procedures
def duplicate_lines_remover(csv_listed_dataset):
    """
    Collapse runs of consecutive identical lines into a single line.

    The first element (the header) is always kept. Every following line is
    kept only if it differs from the last line that was kept, so a data line
    identical to the header and placed right after it is dropped as well.
    Non-adjacent duplicates are left untouched.

    Prints how many data rows there were before and after the operation and
    returns a new list; the input is not modified.
    """
    previous = csv_listed_dataset[0]
    deduplicated = [previous]
    rows = csv_listed_dataset[1:]

    for current in rows:
        if previous != current:
            deduplicated.append(current)
            previous = current

    rows_before = len(rows)
    rows_after = len(deduplicated) - 1
    print(f"Deduplication: reduced size from {rows_before} to {rows_after}.")
    return deduplicated


def reduce_size(csv_listed_dataset, remove_one_every_n=2, times=1):
    """
    Reduce the size of the dataset by removing one row every n rows.

    The first element (the header) is always kept. Data rows are counted
    starting from 1, and every row whose position is a multiple of
    ``remove_one_every_n`` is dropped. The whole procedure is repeated
    ``times`` times, each pass working on the output of the previous one and
    printing its own before/after row counts.

    Args:
        csv_listed_dataset: list of CSV lines, header first.
        remove_one_every_n: drop the rows in position n, 2n, 3n, ...
        times: number of reduction passes to apply (0 returns a plain copy).

    Returns:
        A new list with the header followed by the surviving rows; the input
        is not modified.

    Raises:
        IndexError: if the dataset is empty and at least one pass is run.
        ZeroDivisionError: if ``remove_one_every_n`` is 0 and there is at
            least one data row.
    """
    reduced = list(csv_listed_dataset)
    for _ in range(times):
        header, rows = reduced[0], reduced[1:]
        # Positions are 1-based so that the n-th, 2n-th, ... rows are removed.
        kept = [
            row
            for position, row in enumerate(rows, start=1)
            if position % remove_one_every_n != 0
        ]
        print(f"Remove one every {remove_one_every_n}: reduced size from {len(rows)} to {len(kept)}.")
        reduced = [header, *kept]
    return reduced


def _column_label(index):
    """Return the spreadsheet-style label (A..Z, AA, AB, ...) for a zero-based column index."""
    label = ""
    remaining = index + 1
    while remaining > 0:
        # Bijective base-26: there is no zero digit, hence the "- 1".
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(ord('A') + offset) + label
    return label


def header_anonymizer(csv_listed_dataset):
    """
    Anonymize the header of the CSV data, substituting register names with letters.

    The header (first element) is split on commas and each column name is
    replaced by a label made only of uppercase letters: ``A`` .. ``Z`` for the
    first 26 columns, then ``AA``, ``AB``, ... so that headers with more than
    26 columns never receive punctuation characters as names.

    Args:
        csv_listed_dataset: list of CSV lines, header first.

    Returns:
        A tuple ``(anonymized_dataset, ref)`` where ``anonymized_dataset`` is
        a copy of the input with the header replaced, and ``ref`` maps each
        label to the original column name.

    Raises:
        IndexError: if the dataset is empty.
    """
    # Header anonymization
    old_header = csv_listed_dataset[0].split(",")
    labels = [_column_label(i) for i in range(len(old_header))]

    # Copy so that the caller's list keeps its original header.
    anonymized_header_csv_listed_dataset = list(csv_listed_dataset)
    anonymized_header_csv_listed_dataset[0] = ",".join(labels)

    # reference mapping (letter, original register name)
    ref = dict(zip(labels, old_header))

    return (anonymized_header_csv_listed_dataset, ref)


# dataset selection (switch the active assignment to process another log)
#TESTED_DATASET = "plc_data_log_20251128_212142"
TESTED_DATASET = "baseline"

# dataset reading
with open(f"../datasets/swat/{TESTED_DATASET}.csv", "r") as f:
    csv_text = f.read()

# csv file in which every line is an element of a list (header included);
# trailing newlines are stripped first so that the file's final line
# terminator does not turn into a phantom empty row that would be counted
# in the dataset size
csv_listed_dataset = csv_text.rstrip("\n").split("\n")

# deduplication
to_analyze = duplicate_lines_remover(csv_listed_dataset)
# size reduction: three successive passes, each dropping one row every two
to_analyze = reduce_size(to_analyze)
to_analyze = reduce_size(to_analyze)
to_analyze = reduce_size(to_analyze)

# header anonymization
anonymized_to_analyze, reference_mapping = header_anonymizer(to_analyze)

# final dataset size (header excluded) and name
compressed_dataset_size = len(to_analyze) - 1
tested_dataset_name = f"{TESTED_DATASET}_compressed_{compressed_dataset_size}_rows.csv"

# back to plain text, one CSV line per row
text_data = "\n".join(to_analyze)
anonymized_text_data = "\n".join(anonymized_to_analyze)