# Botnet Profiling
In this notebook, we use the output of Task 3 (Flow Discretization) to build a probabilistic model for profiling botnet behaviour.

In [57]:
# import necessary modules
import json
import sys
sys.path.insert(0, "./pautomac")
from pautomac_baumwelch import *

In [None]:
# Alphabet size carried over from the previous task. It is written into the
# header line of the per-host sequence files generated further down.
# NOTE(review): presumably the number of distinct event symbols produced by
# the Task 3 discretization — confirm it matches that output.
alphabet_size = 12

In [14]:
# Read the events JSON file. The loaded object is indexed by host IP address
# in the cells below. A `with` block is used so the file handle is closed as
# soon as parsing finishes instead of being left open for the kernel lifetime.
with open("output/events_scenario10.json") as events_file:
    timed_events_by_host = json.load(events_file)

In [15]:
# Select the events recorded for a single host (one of the infected hosts
# listed further down).
# Each event is a two-item list: item 0 is the symbol that gets copied into
# the sequences built below, item 1 is the time value that is accumulated
# against the sliding-window size.
timed_events = timed_events_by_host['147.32.84.165']
# Bare expression: shows the selected events as the cell output.
timed_events

[[4, 0],
 [4, 2247],
 [1, 11],
 [4, 2549],
 [4, 12],
 [0, 407256],
 [0, 1496],
 [0, 3499],
 [0, 1502],
 [0, 2491],
 [0, 13505],
 [0, 17046],
 [0, 2502],
 [4, 124243],
 [4, 2283],
 [1, 1],
 [4, 2690],
 [4, 12],
 [4, 994911],
 [4, 4502],
 [1, 2],
 [0, 60005],
 [4, 64458],
 [1, 2],
 [0, 61],
 [0, 89],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 2],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 25674],
 [0, 19708],
 [0, 23534],
 [1, 30325],
 [1, 18245],
 [1, 14055],
 [0, 12270

## Sliding Windows
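To generate a list of sequences, we apply a sliding window of 20 milliseconds over the time values of the events in `timed_events`. Every event starts a new sequence, which keeps absorbing the symbols of the following events until the accumulated time reaches the window size.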

In [43]:
# Build one symbol sequence per event. A sequence starts with the event's own
# symbol and its time value, then absorbs the symbols of the following events
# until the accumulated time reaches `window_size` or the events run out.
window_size = 20
sequences = []
total_events = len(timed_events)
for start in range(total_events):
    head = timed_events[start]
    symbols = [head[0]]
    elapsed = int(head[1])
    for follower_idx in range(start + 1, total_events):
        # Window already full: stop before absorbing another event.
        if elapsed >= window_size:
            break
        follower = timed_events[follower_idx]
        elapsed = elapsed + follower[1]
        symbols.append(follower[0])
    sequences.append(symbols)

In [44]:
# Write the sequences of the selected host to its .sequence file.
# Header line: "<number of sequences> <alphabet size>". The alphabet size is
# taken from `alphabet_size` instead of a hard-coded 5, so this file agrees
# with the files written for all hosts later in the notebook.
# Each following line holds the symbols of one sequence, space-separated.
# NOTE(review): the PAutomaC text format normally prefixes every line with the
# sequence length; only the symbols are written here — confirm that
# pautomac_baumwelch.py expects this layout.
with open("sequences/147.32.84.165.sequence", "w") as f:
    f.write(str(len(sequences)) + " " + str(alphabet_size) + "\n")
    for seq in sequences:
        f.write(" ".join(str(x) for x in seq))
        f.write("\n")
# No explicit f.close(): the `with` block closes the file on exit.

In [45]:
# Infected host addresses, taken from
# https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-51/
infected_hosts = [
    "147.32.84.165",
    "147.32.84.191",
    "147.32.84.192",
    "147.32.84.193",
    "147.32.84.204",
    "147.32.84.205",
    "147.32.84.206",
    "147.32.84.207",
    "147.32.84.208",
    "147.32.84.209",
]

# Host addresses treated as normal (non-infected) in this notebook.
normal_hosts = [
    "147.32.84.170",
    "147.32.84.134",
    "147.32.84.164",
    "147.32.87.36",
    "147.32.80.9",
    "147.32.87.11",
]

In [53]:
# Generate one sequences/<host>.sequence file for every host, infected and
# normal alike, using the same sliding-window rule as the single-host cell.
# NOTE: `timed_events` and `sequences` are rebound on every iteration, so after
# this cell they hold the data of the last host processed.
window_size = 20  # loop-invariant, so set once before the loop
for host in infected_hosts + normal_hosts:
    timed_events = timed_events_by_host[host]
    sequences = []
    for idx, event in enumerate(timed_events):
        # Each event starts its own sequence with its symbol and time value.
        sequence = [event[0]]
        window_cursor = int(event[1])
        next_event_idx = idx + 1
        # Absorb following events until the accumulated time reaches the
        # window size or the events run out.
        while window_cursor < window_size and next_event_idx < len(timed_events):
            next_event = timed_events[next_event_idx]
            window_cursor = window_cursor + next_event[1]
            sequence.append(next_event[0])
            next_event_idx = next_event_idx + 1
        sequences.append(sequence)

    with open("sequences/" + host + ".sequence", "w") as f:
        # Header line: "<number of sequences> <alphabet size>".
        # alphabet_size is an int, so it has to go through str(); the bare
        # `" " + alphabet_size` raised TypeError after the file had already
        # been truncated by open(..., "w").
        f.write(str(len(sequences)) + " " + str(alphabet_size) + "\n")
        # One sequence per line, symbols separated by single spaces.
        # NOTE(review): the PAutomaC text format normally prefixes every line
        # with the sequence length — confirm that pautomac_baumwelch.py
        # expects the bare symbols written here.
        for seq in sequences:
            f.write(" ".join(str(x) for x in seq))
            f.write("\n")
    # No explicit f.close(): the `with` block closes each file on exit.

In [58]:
# Execute pautomac/pautomac_baumwelch.py with two of the generated sequence
# files as command-line arguments; the script prints the loglikelihood lines
# shown in the cell output.
# NOTE(review): presumably the first file is the training set and the second
# the evaluation set — confirm against the script's argument handling.
%run pautomac/pautomac_baumwelch.py sequences/147.32.84.165.sequence sequences/147.32.84.191.sequence

('loglikelihood:', -361204.8114959437)
('loglikelihood: ', -127367.43686563095)
('loglikelihood: ', -117120.11255085323)
('loglikelihood: ', -109150.93196740422)
('loglikelihood: ', -102342.9964435714)
('loglikelihood: ', -96024.69799210668)
('loglikelihood: ', -92522.53916998889)
('loglikelihood: ', -91943.43181124634)
('loglikelihood: ', -91908.71321048825)
('loglikelihood: ', -91889.22419101394)
('loglikelihood: ', -91861.35582805998)
('loglikelihood: ', -91841.8833505487)
('loglikelihood: ', -91829.71537797413)
('loglikelihood: ', -91818.91468391802)
('loglikelihood: ', -91808.55325652241)
('loglikelihood: ', -91798.66780163026)
