# Botnet Profiling
In this notebook, we use the output of Task 3 (Flow Discretization) to build a probabilistic model (a hidden Markov model) for profiling botnet behaviour.

In [11]:
# import necessary modules
import json
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import warnings
from sklearn.preprocessing import LabelEncoder

In [2]:
# Size of the symbol alphabet; keep it equal to the value used in the
# previous task (Task 3, Flow Discretization).
alphabet_size = 12

In [3]:
# Read the JSON file with the discretized events. The result is indexed by
# host IP address; each entry is that host's list of events.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open("output/events_scenario10.json") as events_file:
    timed_events_by_host = json.load(events_file)

In [4]:
# Inspect the events of a single host. Each entry is a two-element list:
# the windowing code further down uses element 0 as the event symbol and
# element 1 as the time value of the event.
timed_events = timed_events_by_host['147.32.84.165']
timed_events

[[4, 0],
 [4, 2247],
 [1, 11],
 [4, 2549],
 [4, 12],
 [0, 407256],
 [0, 1496],
 [0, 3499],
 [0, 1502],
 [0, 2491],
 [0, 13505],
 [0, 17046],
 [0, 2502],
 [4, 124243],
 [4, 2283],
 [1, 1],
 [4, 2690],
 [4, 12],
 [4, 994911],
 [4, 4502],
 [1, 2],
 [0, 60005],
 [4, 64458],
 [1, 2],
 [0, 61],
 [0, 89],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 2],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 25674],
 [0, 19708],
 [0, 23534],
 [1, 30325],
 [1, 18245],
 [1, 14055],
 [0, 12270

## Sliding Windows
To generate a list of sequences, we apply a sliding window of 20 milliseconds to each host's events, using the time value stored with every event in `timed_events`.

In [5]:
# Infected host list taken from
# https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-51/
infected_hosts = [
    "147.32.84.165",
    "147.32.84.191",
    "147.32.84.192",
    "147.32.84.193",
    "147.32.84.204",
    "147.32.84.205",
    "147.32.84.206",
    "147.32.84.207",
    "147.32.84.208",
    "147.32.84.209",
]

# Hosts treated as normal (non-infected) in the comparison below.
normal_hosts = [
    "147.32.84.170",
    "147.32.84.134",
    "147.32.84.164",
    "147.32.87.36",
    "147.32.80.9",
]

In [6]:
def build_window_sequences(timed_events, window_size):
    """Build one symbol sequence per event using a time-bounded window.

    For every event, the sequence starts with that event's symbol
    (element 0). The window cursor is initialised with the integer value
    of the event's own time (element 1). While the cursor is below
    ``window_size`` and events remain, the next event's time is added to
    the cursor and its symbol is appended to the sequence.

    Parameters
    ----------
    timed_events : list
        Events of a single host; each event is indexable with the symbol at
        position 0 and a numeric time value at position 1.
    window_size : int or float
        Upper bound for the accumulated time of a window.

    Returns
    -------
    list of list
        One sequence of symbols per input event (never empty sequences);
        an empty list when ``timed_events`` is empty.
    """
    # NOTE(review): the cursor starts at the first event's own time value
    # rather than 0, and the event that pushes the cursor past the limit is
    # still appended. This mirrors the original behaviour -- confirm it is
    # the intended window definition.
    n_events = len(timed_events)
    sequences = []
    for start_idx, first_event in enumerate(timed_events):
        sequence = [first_event[0]]
        window_cursor = int(first_event[1])
        next_idx = start_idx + 1
        while window_cursor < window_size and next_idx < n_events:
            following = timed_events[next_idx]
            window_cursor = window_cursor + following[1]
            sequence.append(following[0])
            next_idx = next_idx + 1
        sequences.append(sequence)
    return sequences


window_size = 20

# Windowed sequences for every host of interest, keyed by host IP.
# Using a helper keeps the loop temporaries local, so this cell no longer
# overwrites the global `timed_events` displayed in an earlier cell.
sequences_by_hosts = {
    host: build_window_sequences(timed_events_by_host[host], window_size)
    for host in infected_hosts + normal_hosts
}

## HMMLearn

In [24]:
def prepare_hmm_input(samples):
    """Flatten symbol sequences into the ``(X, lengths)`` pair passed to the HMM.

    Parameters
    ----------
    samples : list of list
        Sequences of symbols.

    Returns
    -------
    tuple
        ``(X, lengths)`` where ``X`` is a NumPy array holding every symbol of
        every sequence as its own single-element row (in order), and
        ``lengths`` is the list of the individual sequence lengths.
    """
    lengths = [len(sample) for sample in samples]
    rows = [[symbol] for sample in samples for symbol in sample]
    return (np.array(rows), lengths)
    
# Build the HMM training input from the sequences of the first infected host only.
X,lengths = prepare_hmm_input(sequences_by_hosts[infected_hosts[0]])

In [25]:
# Map the raw event symbols to consecutive integer codes (0..k-1).
# LabelEncoder expects a 1-D array of labels, so the (n, 1) column is
# flattened for fit/transform (passing the column vector raises a
# DataConversionWarning) and the column shape is restored afterwards.
# NOTE: this cell overwrites X with the encoded values; re-run the previous
# cell before re-running this one, otherwise the encoder would be refitted
# on already-encoded data.
labelEncoder = LabelEncoder().fit(X.ravel())
X = labelEncoder.transform(X.ravel()).reshape(-1,1)
X

array([[2],
       [2],
       [2],
       ...,
       [2],
       [0],
       [0]])

In [21]:
# NOTE(review): this silences every warning for the rest of the session,
# not only the ones emitted while fitting -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Fit a 3-state multinomial HMM on the encoded training sequences.
# The fit starts from a random initialisation, so the seed is fixed to make
# the trained profile reproducible across kernel restarts.
remodel = hmm.MultinomialHMM(n_components=3, n_iter=100, random_state=42)
remodel.fit(X, lengths)

MultinomialHMM(algorithm='viterbi', init_params='ste', n_components=3,
        n_iter=100, params='ste',
        random_state=<mtrand.RandomState object at 0x111a4acf0>,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=False)

In [76]:
# Reuse the previously saved profile when it exists, so results stay stable
# across runs; otherwise persist the model trained above. This replaces the
# manual toggling of a commented-out dump, which made the cell fail on a
# fresh checkout.
# SECURITY: joblib.load unpickles the file and can execute arbitrary code --
# only load model files that come from a trusted source.
MODEL_PATH = "output/infected_host_profile.pkl"
try:
    remodel = joblib.load(MODEL_PATH)
except FileNotFoundError:
    joblib.dump(remodel, MODEL_PATH)

In [77]:
def prepare_hmm_input_test(samples):
    """Encode symbol sequences for scoring, using the fitted ``labelEncoder``.

    Symbols present in ``labelEncoder.classes_`` are replaced by their
    position in that array; every other symbol is replaced by a single extra
    code equal to ``len(labelEncoder.classes_)``.

    Parameters
    ----------
    samples : list of list
        Sequences of raw symbols.

    Returns
    -------
    tuple
        ``(X, lengths)`` where ``X`` is a NumPy array with one single-element
        row per symbol (in order) and ``lengths`` lists the sequence lengths.
    """
    known_symbols = list(labelEncoder.classes_)
    unseen_code = len(known_symbols)
    code_of = {symbol: code for code, symbol in enumerate(known_symbols)}

    lengths = [len(sample) for sample in samples]
    rows = [
        [code_of.get(symbol, unseen_code)]
        for sample in samples
        for symbol in sample
    ]
    return (np.array(rows), lengths)

In [78]:
# A little hack so the HMM can score symbols that were not seen in training:
# append one extra emission column (initially all zeros) for the "unseen"
# symbol code produced by prepare_hmm_input_test.
# NOTE: this cell mutates `remodel` in place; re-running it appends another
# column and smooths again, so reload the model before re-running.

n_components = remodel.emissionprob_.shape[0]
n_features = remodel.emissionprob_.shape[1]
remodel.emissionprob_ = np.append(remodel.emissionprob_,np.zeros((n_components,1)),axis=1)
# Derive the new symbol count from the matrix instead of hard-coding 8, so the
# model stays consistent whatever number of symbols was seen in training.
remodel.n_features = n_features + 1

# Smoothing: add 1 to every emission entry and renormalise each row so it sums
# to 1 again, which gives the extra (unseen) symbol a non-zero probability.
new_emissionprob = remodel.emissionprob_ + np.ones(remodel.emissionprob_.shape)
new_emissionprob = new_emissionprob / np.sum(new_emissionprob,axis=1).reshape(-1,1)
remodel.emissionprob_ = new_emissionprob

In [80]:
# Score every host's sequences under the infected-host profile and print the
# log probability per host, grouped by infected / normal.
host_groups = [
    ("Infected Host Log Probability", infected_hosts),
    ("Normal Host Log Probability", normal_hosts),
]

for heading, hosts in host_groups:
    print(heading)
    for host in hosts:
        X,lengths = prepare_hmm_input_test(sequences_by_hosts[host])
        score = remodel.score(X,lengths)
        print(host,"\t",score)

Infected Host Log Probability
147.32.84.165 	 -66306.83065271228
147.32.84.191 	 -72962.05647429707
147.32.84.192 	 -72818.14052075947
147.32.84.193 	 -71575.33427204542
147.32.84.204 	 -77310.42634446496
147.32.84.205 	 -82975.86415686195
147.32.84.206 	 -78661.53101502194
147.32.84.207 	 -76093.62513085992
147.32.84.208 	 -77573.3528056918
147.32.84.209 	 -70042.90042169884
Normal Host Log Probability
147.32.84.170 	 -145335.10596926298
147.32.84.134 	 -40655.84288196526
147.32.84.164 	 -65065.91552843944
147.32.87.36 	 -3853.966491925952
147.32.80.9 	 -8199.844263782352
