# Dataset Reduction Notebook
This notebook contains the code used to reduce the size of the given datasets.

In [5]:
def duplicate_lines_remover(csv_listed_dataset):
    """
    Collapse runs of identical consecutive lines into a single line.

    The first element (the header) is always kept. Each following line is
    kept only when it differs from the line right before it, so equal lines
    that are not adjacent are all preserved. Note that a first data row equal
    to the header is therefore dropped as well.

    A summary with the number of data rows before and after is printed.
    Returns a new list; the input is left untouched.
    """
    header = csv_listed_dataset[0]
    rows = csv_listed_dataset[1:]
    # Pair every row with its predecessor (the header precedes the first row)
    # and keep only the rows that differ from that predecessor.
    deduplicated = [header] + [
        current
        for previous, current in zip(csv_listed_dataset, rows)
        if previous != current
    ]
    print(
        f"Deduplication: reduced size from {len(rows)} to {len(deduplicated) - 1}.")
    return deduplicated


def reduce_size(csv_listed_dataset, remove_one_every_n=2):
    """
    Thin out the dataset by dropping every n-th data row.

    The first element (the header) is always kept. Data rows are counted
    starting from 1, and each row whose position is a multiple of
    `remove_one_every_n` is discarded. A summary with the number of data rows
    before and after is printed.

    Returns a new list made of the header followed by the surviving rows.
    """
    header = csv_listed_dataset[0]
    rows = csv_listed_dataset[1:]
    survivors = [
        row
        for position, row in enumerate(rows, start=1)
        if position % remove_one_every_n != 0
    ]
    print(f"Remove one every {remove_one_every_n}: reduced size from {len(rows)} to {len(survivors)}.")
    return [header, *survivors]

def capped_halfed_size_reduction(csv_listed_dataset, cap=2000):
    """
    Repeatedly halve the dataset until it has at most `cap` data rows.

    Each pass calls `reduce_size(..., 2)`, which drops one data row out of
    every two while keeping the header (first element). If the dataset is
    already within the cap, the very same list object is returned unchanged.

    A halving pass cannot shrink a dataset that has a single data row (or
    none), so a `cap` smaller than 1 can never be reached. In that case the
    loop stops as soon as a pass removes nothing, and the smallest dataset
    obtained is returned instead of looping forever.
    """
    reduced_data = csv_listed_dataset
    while len(reduced_data) - 1 > cap:
        halved_data = reduce_size(reduced_data, 2)
        if len(halved_data) == len(reduced_data):
            # No row was removed: further passes would not help either.
            break
        reduced_data = halved_data
    return reduced_data
        
         
def _column_label(index):
    """
    Return the spreadsheet-style label for a zero-based column index.

    0 -> 'A', 25 -> 'Z', 26 -> 'AA', 27 -> 'AB', and so on.
    """
    label = ""
    index += 1  # bijective base-26 has no zero digit, so work 1-based
    while index > 0:
        index, remainder = divmod(index - 1, 26)
        label = chr(ord("A") + remainder) + label
    return label


def header_anonymizer(csv_listed_dataset):
    """
    Anonymize the header of the CSV data, substituting register names with letters.

    The first element of `csv_listed_dataset` is split on commas and each
    column name is replaced by a label: 'A'..'Z' for the first 26 columns,
    then 'AA', 'AB', ... so that labels stay purely alphabetic for any number
    of columns.

    Returns a tuple `(anonymized_dataset, ref)` where `anonymized_dataset` is
    a new list with the header replaced (the input is not modified) and `ref`
    maps each label to the original column name.
    """
    # Header anonymization
    old_header = csv_listed_dataset[0].split(",")
    labels = [_column_label(i) for i in range(len(old_header))]
    anonymized_header_csv_listed_dataset = list(csv_listed_dataset)
    anonymized_header_csv_listed_dataset[0] = ",".join(labels)

    # reference mapping (letter, original register name)
    ref = dict(zip(labels, old_header))

    return (anonymized_header_csv_listed_dataset, ref)

In [None]:
# dataset reading
with open("../datasets/swat/baseline.csv", "r") as f:
    csv_text = f.read()

# csv file in which every line is an element of a list (header included).
# splitlines() is used rather than split("\n") so that a trailing newline at
# the end of the file does not become a spurious empty last data row.
csv_listed_dataset = csv_text.splitlines()
print(csv_listed_dataset[0:5])

# deduplication
to_analyze = duplicate_lines_remover(csv_listed_dataset)
# size reduction
to_analyze = capped_halfed_size_reduction(to_analyze, cap=2000)

print(to_analyze[0:5])

# header anonymization
#anonymized_to_analyze, reference_mapping = header_anonymizer(to_analyze)

# final dataset size and name
#compressed_dataset_size = len(to_analyze) - 1

# reduced dataset serialized back to CSV text (one line per row)
text_data = "\n".join(to_analyze)
#anonymized_text_data = "\n".join(anonymized_to_analyze)

['PLC3_InputRegisters_IW0,PLC3_MemoryRegisters_MW1,PLC3_Coils_QX00,PLC1_InputRegisters_IW0,PLC1_MemoryRegisters_MW0,PLC1_MemoryRegisters_MW1,PLC1_Coils_QX00,PLC1_Coils_QX01,PLC1_Coils_QX02,PLC2_InputRegisters_IW0,PLC2_MemoryRegisters_MW0,PLC2_MemoryRegisters_MW1,PLC2_Coils_QX00,prev_PLC3_InputRegisters_IW0,prev_PLC3_Coils_QX00,prev_PLC1_InputRegisters_IW0,prev_PLC1_Coils_QX00,prev_PLC1_Coils_QX01,prev_PLC1_Coils_QX02,prev_PLC2_InputRegisters_IW0,prev_PLC2_Coils_QX00', '1,10,0,81,40,80,0,0,0,10,10,20,1,1,0,81,0,0,0,10,1', '1,10,0,81,40,80,0,0,0,10,10,20,1,1,0,81,0,0,0,10,1', '1,10,0,81,40,80,0,0,0,10,10,20,1,1,0,81,0,0,0,10,1', '1,10,0,81,40,80,0,0,0,10,10,20,1,1,0,81,0,0,0,10,1']
Deduplication: reduced size from 30059 to 10279.
Remove one every 2: reduced size from 10279 to 5140.
Remove one every 2: reduced size from 5140 to 2570.
Current size: 2570
Remove one every 2: reduced size from 2570 to 1285.
Current size: 1285
['PLC3_InputRegisters_IW0,PLC3_MemoryRegisters_MW1,PLC3_Coils_QX00,