# Dataset Reduction Notebook
This notebook contains the code used to reduce the size of the given datasets.

In [2]:
from time import time
from datetime import datetime

In [14]:
def duplicate_lines_remover(csv_listed_dataset):
    """
    Collapse runs of identical consecutive lines into a single line.

    The first element (the header) is always kept. Every following line is
    kept only when it differs from the most recently kept line, so equal
    lines that are not adjacent are left untouched. A one-line summary with
    the row counts before and after is printed.

    Returns a new list; the input list is not modified.
    """
    header = csv_listed_dataset[0]
    rows = csv_listed_dataset[1:]

    deduplicated = [header]
    for row in rows:
        if row == deduplicated[-1]:
            # Same as the line that was just kept: skip it.
            continue
        deduplicated.append(row)

    rows_before = len(rows)
    rows_after = len(deduplicated) - 1
    print(f"Deduplication: reduced size from {rows_before} to {rows_after}.")

    return deduplicated

def reduce_size(csv_listed_dataset, remove_one_every_n=2):
    """
    Thin out the dataset by dropping every n-th data row.

    The first element (the header) is always kept. Data rows are numbered
    from 1, and each row whose number is a multiple of ``remove_one_every_n``
    is discarded. A one-line summary with the row counts before and after is
    printed.

    Returns a new list; the input list is not modified.
    """
    header, rows = csv_listed_dataset[0], csv_listed_dataset[1:]

    survivors = [
        row
        for position, row in enumerate(rows, start=1)
        if position % remove_one_every_n != 0
    ]

    print(f"Remove one every {remove_one_every_n}: reduced size from {len(rows)} to {len(survivors)}.")
    return [header, *survivors]

def capped_halfed_size_reduction(csv_listed_dataset, cap=2000):
    """
    Halve the dataset repeatedly until it has at most ``cap`` data rows.

    The first element of ``csv_listed_dataset`` is the header and is not
    counted. If the dataset already fits within ``cap`` it is returned as is
    (the very same list object). Otherwise ``reduce_size(..., 2)`` is applied
    as many times as needed, each pass removing one data row out of two.

    Raises:
        ValueError: if the dataset exceeds ``cap`` and ``cap`` is smaller
            than 1. Halving never removes the first data row, so such a cap
            can never be reached and the loop would not terminate.
    """
    if len(csv_listed_dataset) - 1 <= cap:
        return csv_listed_dataset

    if cap < 1:
        raise ValueError(
            f"cap must be at least 1 to reduce a non-empty dataset, got {cap}"
        )

    reduced_data = csv_listed_dataset
    while len(reduced_data) - 1 > cap:
        reduced_data = reduce_size(reduced_data, 2)

    return reduced_data
        
         
def _column_label(index):
    """Return the spreadsheet-style label for a 0-based column index (A..Z, AA, AB, ...)."""
    label = ""
    remaining = index + 1
    while remaining > 0:
        # Bijective base-26: shift by one so that 26 maps to "Z" rather than "A@".
        remaining, offset = divmod(remaining - 1, 26)
        label = chr(ord("A") + offset) + label
    return label


def header_anonymizer(csv_listed_dataset):
    """
    Anonymize the header of the CSV data, substituting column names with letters.

    The header (first element) is split on commas and each column name is
    replaced by a letter label: ``A`` to ``Z`` for the first 26 columns, then
    ``AA``, ``AB``, ... so that labels stay alphabetic and unique for any
    number of columns. Data rows are left untouched.

    Returns:
        A tuple ``(anonymized_dataset, ref)`` where ``anonymized_dataset`` is
        a copy of the input list with the header replaced, and ``ref`` is a
        dict mapping each label to the original column name.
    """
    old_header = csv_listed_dataset[0].split(",")
    labels = [_column_label(i) for i in range(len(old_header))]

    # Work on a shallow copy so the caller's list keeps its original header.
    anonymized_header_csv_listed_dataset = list(csv_listed_dataset)
    anonymized_header_csv_listed_dataset[0] = ",".join(labels)

    # reference mapping (letter label -> original column name)
    ref = dict(zip(labels, old_header))

    return (anonymized_header_csv_listed_dataset, ref)

def timecode_replacer(csv_listed_dataset):
    """
    Replace the first column of every data row with a 1-based row counter.

    The first element (the header) is kept unchanged. For each following
    line, everything before the first comma is replaced by the row number
    (1, 2, 3, ...). The counter matches elapsed seconds only if rows are
    sampled once per second -- NOTE(review): confirm the sampling rate.

    A blank last element, as produced by splitting on "\\n" a text that ends
    with a newline, is dropped. A non-blank last element is a real data row
    and is kept.

    Returns a new list; the input list is not modified.
    """
    header = csv_listed_dataset[0]
    rows = csv_listed_dataset[1:]

    # Only discard the final element when it is the empty remainder of a
    # trailing newline; dropping it unconditionally would lose a data row.
    if rows and not rows[-1].strip():
        rows = rows[:-1]

    new_csv_listed_dataset = [header]
    for counter, line in enumerate(rows, start=1):
        # Keep everything after the first comma exactly as it was.
        _, separator, remainder = line.partition(",")
        new_csv_listed_dataset.append(f"{counter}{separator}{remainder}")

    return new_csv_listed_dataset
        
    
    

In [3]:
# ics: name of the dataset sub-folder under ../datasets/
# (presumably the industrial control system being analysed)
ics = "swat"

# dataset selection (path relative to ../datasets/)
ds = f"{ics}/simplified-swat_plc_data_log_20251128_212142.csv"

# compressed dataset name (without number of lines); a plain string, since
# there is nothing to interpolate
out_name_template = "compressed_simplified-swat_plc-data-log_"

## First compression
First compression, capped at 1000 lines (the `cap` value passed in the cell below).

In [15]:
# dataset reading: load the whole selected CSV file as a single string
with open(f"../datasets/{ds}", "r") as f:
    csv_text = f.read()

# CSV file as a list in which every line is one element (header included as the first element).
# Splitting on "\n" leaves an empty last element when the text ends with a newline.
csv_listed_dataset = csv_text.split("\n")

# timecode replacement: the first column of each data row becomes a row counter
to_analyze = timecode_replacer(csv_listed_dataset)

# reduction steps: halve the dataset repeatedly until at most `cap` data rows remain
to_analyze = capped_halfed_size_reduction(to_analyze, cap=1000)

Remove one every 2: reduced size from 6000 to 3000.
Remove one every 2: reduced size from 3000 to 1500.
Remove one every 2: reduced size from 1500 to 750.


In [None]:
# specific for boilers dataset, which has some columns with always the same value


In [16]:
# csv creation: write the reduced dataset twice, with the original header
# and with the anonymized header

# header anonymization; reference_mapping maps each new column label to the original name
anonymized_to_analyze, reference_mapping = header_anonymizer(to_analyze)

# final dataset size: number of data rows (header excluded), used in the output file names
txt_data_len = len(to_analyze) - 1

# write final datasets (lines joined with "\n", no newline after the last line)
with open(f"../datasets/{ics}/{out_name_template}{txt_data_len}-lines.csv", "w") as f:
    f.write("\n".join(to_analyze))

# same rows, but with the anonymized header
with open(f"../datasets/{ics}/anonymized_{out_name_template}{txt_data_len}-lines.csv", "w") as f:
    f.write("\n".join(anonymized_to_analyze))