# Load data

Here we load and preprocess the data from the CSV files.

In [1]:
import modules.patterns as patterns
import tabulate

# A bigger `precission` value means LOWER precision; 1 is the highest precision.
# NOTE: "precission" (sic) is the keyword name accepted by patterns.Patterns,
# so the spelling in the call below must stay as it is.
p = patterns.Patterns(precission=100, window_size=100)

folder_loc = "../data/2015/Network"
result_loc = "../output/results.html"

# Collect the initial statistics from the input folder and render them as HTML.
stats = p.bootstrap(folder_loc)
table = tabulate.tabulate(stats, tablefmt="html")

header = """
<!doctype html>
<html lang="en-GB">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    </head>
"""


# Mode "w" truncates the file, so every run of this cell starts a fresh report.
# The encoding is set explicitly to match the charset declared in the header;
# otherwise the platform default encoding would be used.
with open(result_loc, "w", encoding="utf-8") as fd:
    fd.write(header)
    # `.str` is the raw HTML markup of tabulate's HTML result.
    fd.write(table.str)

# Must be the last expression of the cell, otherwise Jupyter does not render it.
table


## Detect anomalies from input data

We can get the information about attacks from the SWaT dataset file `data/2015/List_of_attacks_Final.csv`. The attacks are divided into four groups:

- Single stage single point (SSSP)
- Single stage multi point (SSMP)
- Multi stage single point (MSSP)
- Multi stage multi point attacks (MSMP)

In [2]:
import modules.load as load

# Spreadsheet with the list of attacks, parsed by load.anomalies below.
attacks_file_loc = "../data/2015/List_of_attacks_Final_fixed.xlsx"

# load.anomalies returns a pair; both parts are kept as notebook globals.
stages, anomalies = load.anomalies(attacks_file_loc)

# Report how many anomalies were loaded (the trailing "\n" adds a blank line).
print("Loaded %d anomalies\n" % len(anomalies))

First anomaly detected 2015-12-28T10:29:14
Loaded 35 anomalies



# Process data

Here we process every single file to find possible patterns.

In [3]:
import time
from IPython import display

# Process the input files and append one HTML table per result to the report.
# An alternative configuration tried during testing was:
#   p.process_all(anomalies, skip_first=701, max_process=5)

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments (unlike time.time()).
ts_start = time.perf_counter()
stats = p.process_all(anomalies, skip_first=414, max_process=-1)

# Open the report once in append mode instead of reopening it for every table.
# The encoding is explicit so it matches the utf-8 charset declared in the
# report header.
with open(result_loc, "a", encoding="utf-8") as fd:
    for stat in stats:
        table = tabulate.tabulate(stat, tablefmt="html")
        display.display_html(table, raw=True)

        # `.str` is the raw HTML markup of tabulate's HTML result.
        fd.write(table.str)
        # Flush after each table so partial results reach the disk even if a
        # later file fails during this long-running loop.
        fd.flush()

    diff = time.perf_counter() - ts_start
    print("Runtime %d sec." % diff)

    fd.write("<p>Runtime %d sec.</p>" % diff)


[ 1 / 370 | 414 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part07_sorted.csv
[ 2 / 370 | 415 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part08_sorted.csv
[ 3 / 370 | 416 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part09_sorted.csv
[ 4 / 370 | 417 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part10_sorted.csv
[ 5 / 370 | 418 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part11_sorted.csv
[ 6 / 370 | 419 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part12_sorted.csv
[ 7 / 370 | 420 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part13_sorted.csv
[ 8 / 370 | 421 / 784 ] processing file ../data/2015/Network/2015-12-28_113021_98.log.part14_sorted.csv
[ 9 / 370 | 422 / 784 ] processing file ../data/2015/Network/2015-12-28_164554_99.log.part01_sorted.csv
[ 10 / 370 | 423 / 784 ] processing file ../data/2015/Network/20