# Botnet Profiling
In this notebook, we will use the output from Task 3 (Flow Discretization), in order to build a probabilistic model for profiling botnet behaviours.

In [1]:
# import necessary modules
import json
from hmmlearn import hmm
import numpy as np
from sklearn.externals import joblib
import warnings
from sklearn.preprocessing import LabelEncoder

In [2]:
# set alphabet_size based on previous task
alphabet_size = 12

In [3]:
# read the json file
timed_events_by_host = json.load(open("output/events_scenario10.json"))

In [4]:
timed_events = timed_events_by_host['147.32.84.165']
timed_events

[[4, 0],
 [4, 2247],
 [1, 11],
 [4, 2549],
 [4, 12],
 [0, 407256],
 [0, 1496],
 [0, 3499],
 [0, 1502],
 [0, 2491],
 [0, 13505],
 [0, 17046],
 [0, 2502],
 [4, 124243],
 [4, 2283],
 [1, 1],
 [4, 2690],
 [4, 12],
 [4, 994911],
 [4, 4502],
 [1, 2],
 [0, 60005],
 [4, 64458],
 [1, 2],
 [0, 61],
 [0, 89],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 2],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 1],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 0],
 [0, 25674],
 [0, 19708],
 [0, 23534],
 [1, 30325],
 [1, 18245],
 [1, 14055],
 [0, 12270

## Sliding Windows
To generate a list of sequences, we will apply a sliding window of 20 milliseconds (based on the time parameter of the events in the timed_events)

In [5]:
# infected host list from https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-51/
infected_hosts = ["147.32.84.165", "147.32.84.191", "147.32.84.192", "147.32.84.193", "147.32.84.204", "147.32.84.205", "147.32.84.206", "147.32.84.207", "147.32.84.208", "147.32.84.209"]
normal_hosts = ["147.32.84.170", "147.32.84.134", "147.32.84.164", "147.32.87.36", "147.32.80.9"]

In [6]:
sequences_by_hosts = {}
window_size = 20
for host in infected_hosts + normal_hosts:
    timed_events = timed_events_by_host[host]
    sequences = []
    for idx,event in enumerate(timed_events):
        sequence = [event[0]]
        window_cursor = 0
        next_event_idx = idx + 1
        while window_cursor < window_size and next_event_idx < len(timed_events):
            next_event = timed_events[next_event_idx]
            window_cursor = window_cursor + next_event[1]
            sequence.append(next_event[0])
            next_event_idx = next_event_idx + 1
        sequences.append(sequence)
    sequences_by_hosts[host] = sequences

## HMMLearn

In [8]:
def prepare_hmm_input(samples):
    X = []
    lengths = [len(x) for x in samples]
    lengths
    for sample in samples:
        for x in sample:
            X.append([x])
    return (np.array(X),lengths)
    
X,lengths = prepare_hmm_input(sequences_by_hosts[infected_hosts[0]])

In [9]:
labelEncoder = LabelEncoder().fit(X)
X = labelEncoder.transform(X).reshape(-1,1)
X

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([[2],
       [2],
       [2],
       ...,
       [2],
       [0],
       [0]])

In [10]:
warnings.filterwarnings('ignore')

remodel = hmm.MultinomialHMM(n_components=3, n_iter=100)
remodel.fit(X, lengths)

MultinomialHMM(algorithm='viterbi', init_params='ste', n_components=3,
        n_iter=100, params='ste',
        random_state=<mtrand.RandomState object at 0x107cb6d80>,
        startprob_prior=1.0, tol=0.01, transmat_prior=1.0, verbose=False)

In [28]:
#joblib.dump(remodel, "output/infected_host_profile.pkl")

remodel = joblib.load("output/infected_host_profile.pkl")  

In [29]:
def prepare_hmm_input_test(samples):
    classes_map = dict(zip([x for x in labelEncoder.classes_],range(len(labelEncoder.classes_))))
    extra_class = len(labelEncoder.classes_)
    
    X = []
    lengths = [len(x) for x in samples]
    lengths
    for sample in samples:
        for x in sample:
            if x in classes_map:
                X.append([classes_map[x]])
            else:
                X.append([extra_class])
    return (np.array(X),lengths)

In [30]:
# a little hack for the HMM model to be able to predict out of training symbols 

n_components = remodel.emissionprob_.shape[0]
n_features = remodel.emissionprob_.shape[1]
remodel.emissionprob_ = np.append(remodel.emissionprob_,np.zeros((n_components,1)),axis=1)
remodel.n_features = 8

# smoothing
new_emissionprob = remodel.emissionprob_ + np.ones(remodel.emissionprob_.shape)
new_emissionprob = new_emissionprob / np.sum(new_emissionprob,axis=1).reshape(-1,1)
remodel.emissionprob_ = new_emissionprob

In [31]:
print("Infected Host Log Probability")
for host in infected_hosts:
    X,lengths = prepare_hmm_input_test(sequences_by_hosts[host])
    score = remodel.score(X,lengths)
    print(host,"\t",score)

print("Normal Host Log Probability")
for host in normal_hosts:
    X,lengths = prepare_hmm_input_test(sequences_by_hosts[host])
    score = remodel.score(X,lengths)
    print(host,"\t",score)

Infected Host Log Probability
147.32.84.165 	 -99480.64680639247
147.32.84.191 	 -108738.28344749968
147.32.84.192 	 -108520.62730674112
147.32.84.193 	 -105805.94341974313
147.32.84.204 	 -115219.59775862304
147.32.84.205 	 -123809.39702429171
147.32.84.206 	 -117111.39128027116
147.32.84.207 	 -112457.60935793286
147.32.84.208 	 -115551.65592885671
147.32.84.209 	 -103493.50234663485
Normal Host Log Probability
147.32.84.170 	 -190701.17750430567
147.32.84.134 	 -53400.23570647441
147.32.84.164 	 -88629.37698843011
147.32.87.36 	 -6883.867308610395
147.32.80.9 	 -9790.909446149177


In [None]:
X,lengths = prepare_hmm_input_test(sequences_by_hosts[infected_hosts[1]])
remodel.predict_proba(X,lengths)

In [34]:
seq_prob_by_host = {}
for host in infected_hosts + normal_hosts:
    X,lengths = prepare_hmm_input_test(sequences_by_hosts[host])
    prev_length = 0
    prob_list = []
    for length in lengths:
        prob_list.append(np.exp(remodel.score(X[prev_length:prev_length+length],[length])))
        prev_length = length
        
    seq_prob_by_host[host] = prob_list

In [57]:
seq_prob_by_host[infected_hosts[0]]

[0.012378453345061994,
 0.0013785945714121686,
 0.012389872229433928,
 0.0013785945714121686,
 0.012389872229433928,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.012389658880243437,
 0.0013785945714121686,
 0.012389872229433928,
 0.0013785945714121686,
 0.012389872229433928,
 0.012389658880243437,
 0.0013785945714121686,
 0.012389872229433928,
 0.012389658880243437,
 0.0013785945714121686,
 0.012389872229433928,
 0.012389658880243437,
 3.997689893299416e-57,
 1.4236143455608767e-47,
 6.40626986149118e-47,
 2.882823825583265e-46,
 1.2972717960738647e-45,
 5.837727917862676e-45,
 2.6269797390286344e-44,
 1.1821418617593904e-43,
 5.3196427843051785e-43,
 2.3938412358135374e-42,
 1.0772294484111242e-41,
 4.84753653318103e-41,
 2.1813932468318976e-40,
 9.816277741802227e-40,
 4.417328642790442e-39,
 1.987799535797811e-38,
 8.945105320535791e-38,
 4.02