# Filtering the Honeypot data

The data are not publicly available, but the publicly available repository https://github.com/microsoft/msticpy
contains data similar in structure.

Import data:

In [1]:
import json
import codecs
# Read the raw honeypot dump into `data` (whatever structure the JSON holds).
# The "utf-8-sig" codec transparently drops a leading byte-order mark if the
# file has one. The built-in open() accepts the encoding directly, so the
# legacy codecs.open() wrapper is not needed.
with open("../../../../home/hpms/Microsoft.IoT-Dump1.json", "r", encoding="utf-8-sig") as f:
    data = json.load(f)

Store data in a dataframe, add the commands to the dataframe, and remove duplicates:

In [2]:
import pandas as pd
# Wrap the parsed JSON in a DataFrame, turn every session's command sequence
# into a tuple (hashable, so it can serve as a de-duplication key), keep the
# first row for each distinct command tuple and renumber the rows from 0.
data = (
    pd.DataFrame(data)
    .assign(Commands=lambda frame: frame["Commands"].map(tuple))
    .drop_duplicates(subset="Commands")
    .reset_index(drop=True)
)

In [3]:
# Preview the first ten rows left after de-duplicating on "Commands".
data.head(10)

Unnamed: 0,Protocol,Commands,ID,TimesSeen,FirstSeen,LastSeen
0,Telnet,"(sh, /bin/busybox Uirusu, /bin/busybox ps; /bi...",61130d46117b4caf96eec5b4a5b6e4a1141aaad3e9076d...,692977,2019-07-09T15:48:24.45,2019-11-06T11:22:03.877
1,Telnet,"(sh, >/tmp/.ptmx && cd /tmp/, >/var/.ptmx && c...",285f4b2fc2ba5b1e737bc9002992a082008fe1361b334e...,43810830,2019-07-09T11:26:41.202,2019-11-06T11:20:48.611
2,Telnet,"(enable, system, shell, sh, >/tmp/.ptmx && cd ...",c6a071d3d8c39c321b0313f103e736105093bf90324719...,29292349,2019-07-09T11:26:42.295,2019-11-06T11:20:46.809
3,Telnet,"(enable, system, shell, sh, >/tmp/.ptmx && cd ...",d86e8e9f64827476f6f68292aa813561a9c764496b225b...,948780,2019-07-24T07:42:51.304,2019-11-06T11:21:47.753
4,Telnet,"(sh, >/tmp/.ptmx && cd /tmp/, >/var/.ptmx && c...",5541980fb93d397260790b84a92bc38fed061cda4950f4...,1472177,2019-07-24T07:42:51.304,2019-11-06T11:21:54.94
5,Telnet,"(sh, /bin/busybox Uirusu, /bin/busybox ps; /bi...",f226f536cff422996a9088f53ce6d72a10dbee2d588782...,1150187,2019-07-09T15:48:22.458,2019-11-06T11:21:57.666
6,Telnet,"(enable, shell, sh, /bin/busybox Uirusu, /bin/...",025a1138007357965a26c7e37efa3f6f3f4ccc23b19689...,391946,2019-07-09T15:48:23.482,2019-11-06T11:21:39.712
7,Telnet,"(enable, shell, sh, /bin/busybox Uirusu, /bin/...",85fc66c96f273c088b8fc75cb90b1a730df16f16040861...,670236,2019-07-09T15:48:22.418,2019-11-06T11:21:39.295
8,Telnet,"(shell, sh, /bin/busybox SYLVEON)",768606ec3ac1dc459ef0011774026f34a03fd85a760629...,6409,2019-07-09T11:25:07.735,2019-11-06T11:19:03.974
9,Telnet,"(sh, /bin/busybox Uirusu, /bin/busybox ps; /bi...",a64e084d790a44c4eb0905543359b41be107166f4727a9...,266,2019-11-02T10:28:28.417,2019-11-05T14:26:56.55


Use the function `clean_commands` from `lda_clust`; the first element of its return value is a list of lists (one inner list per session) whose elements are command strings. The function also scraps the hostname from each URL observed in the data. 

In [4]:
import lda_clust
# Run the project's cleaning helper on the de-duplicated frame.
# NOTE(review): no_below=1 / no_above=1.1 look like frequency-filter bounds
# chosen so that nothing is filtered at this stage (a fraction above 1 can
# never be exceeded) -- confirm against lda_clust.clean_commands.
data_clean = lda_clust.clean_commands(data, no_below=1, no_above=1.1)
# Only the first returned element is used below.
sessions_list = data_clean[0] # list of lists with elements command strings

Remove empty commands and commands containing only a dot `.`. 

In [5]:
import tqdm
# Drop commands that are the empty string '' or a lone dot '.'.
# Each session is rebuilt in a single pass and written back to its slot.
for idx, session_commands in enumerate(tqdm.tqdm(sessions_list)):
    sessions_list[idx] = [cmd for cmd in session_commands
                          if cmd != '' and cmd != '.']

100%|██████████| 124883/124883 [00:08<00:00, 14520.24it/s]


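Replace tags followed by a random four-digit number (e.g. `AKEMI_1234`) with a fixed placeholder string (e.g. `AKEMI_num`), so that all numbered variants of a tag collapse to the same token. 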

In [6]:
# Regex -> placeholder table. Every key matches a fixed tag followed by an
# underscore or a whitespace character and four digits; the value is the
# constant text substituted for each match. Insertion order is the order in
# which the substitutions are applied.
replace_dic = dict([
    (r"AKEMI\_[0-9]{4}", "AKEMI_num"),
    (r"BOT_ID\s[0-9]{4}", "BOT_ID num"),
    (r"Ex0\_[0-9]{4}", "Ex0_num"),
    (r"HORIZON\_[0-9]{4}", "HORIZON_num"),
    (r"Hades\_[0-9]{4}", "Hades_num"),
    (r"Hikari\_[0-9]{4}", "Hikari_num"),
    (r"Kill\_[0-9]{4}", "Kill_num"),
    (r"Mewski\_[0-9]{4}", "Mewski_num"),
    (r"SEFA\_ID\s[0-9]{4}", "SEFA_ID num"),
    (r"UNSTABLE\_[0-9]{4}", "UNSTABLE_num"),
    (r"WOLF\_[0-9]{4}", "WOLF_num"),
    (r"dstrtn\_[0-9]{4}", "dstrtn_num"),
    (r"hhh\_lol\s[0-9]{4}", "hhh_lol num"),
    (r"orphic\_[0-9]{4}", "orphic_num"),
    (r"shibui\_[0-9]{4}", "shibui_num"),
    (r"slumpp\_[0-9]{4}", "slumpp_num"),
])

In [7]:
import re
# Apply every placeholder substitution from replace_dic, in dictionary order,
# to every command of every session. The patterns are compiled once up front;
# each command is rewritten in place inside its session list.
numbered_tag_rules = [(re.compile(regex), placeholder)
                      for regex, placeholder in replace_dic.items()]
for session_commands in tqdm.tqdm(sessions_list):
    for position, command in enumerate(session_commands):
        for compiled_regex, placeholder in numbered_tag_rules:
            command = compiled_regex.sub(placeholder, command)
        session_commands[position] = command

100%|██████████| 124883/124883 [13:42<00:00, 151.77it/s]


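Complete commands that appear to have been abruptly interrupted: when the last command of a session ends with a truncated fragment, that fragment is replaced with its full form. 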

In [8]:
# Patterns to be replaced.
# Each patt_listN holds regex fragments for truncated versions of one piece of
# text; patt_all groups them so that patt_all[k] lines up with replace_list[k],
# the completed text. Fragments containing a regex escape are written as raw
# strings: "\." in a normal string literal is an invalid escape sequence and
# triggers a warning on current Python versions.
patt_list = [r"\.nippe"]
patt_list1 = [r"\.nippo"]
patt_list2 = ["bin b","bin bu","bin bus","bin busy","bin busyb","bin busybo"]
patt_list3 = ["Ui","Uir","Uiru"]
patt_list4 = ['bin busybox cat bin busybox w','bin busybox cat bin busybox wh','bin busybox cat bin busybox whi',
              'bin busybox cat bin busybox whil','bin busybox cat bin busybox while','bin busybox cat bin busybox while r',
              'bin busybox cat bin busybox while rea', 'bin busybox cat bin busybox while read']
patt_list5 = ['bin busybox rm proc sys sy'.replace('sys sy', 'sy'),'bin busybox rm proc sys','bin busybox rm proc sys f',
              'bin busybox rm proc sys fs','bin busybox rm proc sys fs b','bin busybox rm proc sys fs bi',
              'bin busybox rm proc sys fs bin', 'bin busybox rm proc sys fs binfm','bin busybox rm proc sys fs binfmt',
              'bin busybox rm proc sys fs binfmt_','bin busybox rm proc sys fs binfmt_m','bin busybox rm proc sys fs binfmt_mi',
              'bin busybox rm proc sys fs binfmt_mis','bin busybox rm proc sys fs binfmt_misc']
patt_list6 = ['bin busybox cat proc sys fs b','bin busybox cat proc sys fs bi','bin busybox cat proc sys fs binf',
              'bin busybox cat proc sys fs binfm','bin busybox cat proc sys fs binfmt','bin busybox cat proc sys fs binfmt_',
              'bin busybox cat proc sys fs binfmt_m','bin busybox cat proc sys fs binfmt_mi','bin busybox cat proc sys fs binfmt_misc']
patt_list7 = ['sys fs c',"sys fs cg","sys fs cgro","sys fs cgrou","sys fs cgroup b","sys fs cgroup blki"]
patt_list8 = ['sys fs f','sys fs fu','sys fs fus','sys fs fuse','sys fs fuse c','sys fs fuse co',
              'sys fs fuse con','sys fs fuse conn','sys fs fuse conne','sys fs fuse connec','sys fs fuse connect',
              'sys fs fuse connecti','sys fs fuse connectio','sys fs fuse connection']
patt_list9 = ['cgroup p','cgroup pe','cgroup perf','cgroup perf_','cgroup perf_e',
              'cgroup perf_eve','cgroup perf_even']
patt_list10 = [r'\.hum',r'\.huma']

patt_all = [patt_list,patt_list1,patt_list2,patt_list3,patt_list4,patt_list5,patt_list6,patt_list7,patt_list8,patt_list9,patt_list10]

In [9]:
# Completed text for each group of truncated patterns: replace_list[k] is what
# a match of any pattern in patt_all[k] is rewritten to.
replace_list = [
    '.nipped',                                    # patt_list
    '.nippon',                                    # patt_list1
    'bin busybox',                                # patt_list2
    'Uirusu',                                     # patt_list3
    'bin busybox cat bin busybox while read i',   # patt_list4
    'bin busybox rm proc sys fs binfmt_misc .',   # patt_list5
    'bin busybox cat proc sys fs binfmt_misc .',  # patt_list6
    'sys fs cgroup blkio',                        # patt_list7
    'sys fs fuse connections',                    # patt_list8
    'cgroup perf_event',                          # patt_list9
    '.human',                                     # patt_list10
]

In [10]:
# Replace cut commands with replace_list.
# Only the LAST command of each session is examined, and a fragment only
# matches when it sits at the very end of that command (the regex is anchored
# with `$`) and is not glued to a neighbouring word character.
# The rules are compiled once, outside the session loop, in the same order in
# which the patterns appear in patt_all. Order matters: a later rule sees the
# text produced by earlier rules. Raw strings are used for the regex pieces so
# that `\w` is not an invalid string escape.
truncation_rules = [
    (re.compile(r"(?<!\w)" + fragment + r"(?!\w)$"), replace_list[k])
    for k, fragments in enumerate(patt_all)
    for fragment in fragments
]
for session_commands in tqdm.tqdm(sessions_list):
    if not session_commands:
        # A session with no commands has no last command to repair; indexing
        # it with [-1] would raise IndexError.
        continue
    for pattern, completion in truncation_rules:
        # sub() returns the input unchanged when the pattern does not match.
        session_commands[-1] = pattern.sub(completion, session_commands[-1])

100%|██████████| 124883/124883 [00:55<00:00, 2260.09it/s]


Replace random HEX strings.

In [11]:
# A standalone token made of "x" plus two hex digits, not adjacent to a dot,
# is rewritten to the placeholder " HEX " (padded so it stays a separate word).
replacements = {r"(?<!\.)\bx[a-fA-F0-9]{2}\b(?!\.)": " HEX "}
hex_rules = [(re.compile(regex), token) for regex, token in replacements.items()]
multiple_spaces = re.compile(' +')
# iterate through corpus
for session_commands in tqdm.tqdm(sessions_list):
    for position, command in enumerate(session_commands):
        text_test = command
        # iterate through replacement patterns; each one works on the result of
        # the previous one, so adding more patterns to `replacements` keeps
        # every substitution instead of only the last one
        for pattern, token in hex_rules:
            text_test = pattern.sub(token, text_test)
            # drop the padding space when the command starts/ends with " HEX "
            while text_test.startswith(" HEX "):
                text_test = text_test[1:]
            while text_test.endswith(" HEX "):
                text_test = text_test[:-1]
        # collapse runs of spaces (e.g. between adjacent placeholders) into one
        text_test = multiple_spaces.sub(' ', text_test)
        session_commands[position] = text_test

100%|██████████| 124883/124883 [04:23<00:00, 474.74it/s]


Obtain the filtered corpus (divided into sessions and commands). 

In [12]:
# corpus keeps the session structure: one list per session, holding one word
# list per command (each command is split on single spaces).
corpus = [[command.split(' ') for command in session_commands]
          for session_commands in sessions_list]
# commands_list is the flat view of the same word lists, in the same order,
# with the session boundaries dropped.
commands_list = [words for session_words in corpus for words in session_words]

Obtain the dictionary from the list of commands. 

In [13]:
from gensim.corpora import Dictionary
# Build the gensim vocabulary from the flat list of commands: each command
# (a list of words) is passed as one document.
dictionary = Dictionary(commands_list) 

Filter uncommon strings or very common strings:

In [14]:
# Prune the vocabulary in place. Per the gensim documentation, no_below=20
# drops words found in fewer than 20 documents and no_above=0.1 drops words
# found in more than 10% of documents; a "document" here is a single command,
# because the dictionary was built from commands_list.
dictionary.filter_extremes(no_below=20, no_above=0.1)

Obtain the mapping from words to numbers and vice-versa:

In [15]:
# Two-way lookup stored in a single dict: integer id -> word and word -> id,
# for every id from 0 to len(dictionary) - 1.
word_map = {}
for token_id in range(len(dictionary)):
    token = dictionary[token_id]
    word_map.update({token_id: token, token: token_id})

Transform the words into integers (as required by `gensim` and `lda_clust`):

In [16]:
# Encode the corpus as nested dicts of word ids:
#   W[session index][command index] -> list of integer word ids.
# Words missing from word_map (removed from the dictionary by the frequency
# filter) are skipped. A command that ends up with no known word is dropped,
# and so is a session that ends up with no command; indices are consecutive
# from 0 at both levels. Entries are only stored once they are known to be
# non-empty, so a trailing empty command or session cannot be left behind.
W = {}
i = 0
for session in corpus:
    encoded_session = {}
    for command in session:
        # Explicit membership test instead of a bare `except:` so that
        # unrelated errors are not silently swallowed.
        word_ids = [word_map[word] for word in command if word in word_map]
        if word_ids:
            encoded_session[len(encoded_session)] = word_ids
    if encoded_session:
        W[i] = encoded_session
        i += 1

Only retain *unique* sessions:

In [17]:
from collections import Counter
# Find repeated sessions.
#   sessions        - the text key of every session, in iteration order
#   session_counter - key -> number of sessions sharing that key
#   rm_list         - indices in W of every repeat (all but the first
#                     session seen with a given key)
# The key covers the WHOLE session: one line per command, word ids separated
# by spaces. Keying on the last command alone would flag sessions that merely
# end the same way as duplicates. The newline keeps command boundaries, so
# [[1, 2], [3]] and [[1], [2, 3]] get different keys. Commands with no word
# ids are left out of the key, and a session without commands gets the empty
# key instead of raising IndexError.
session_counter = {}
rm_list = []
sessions = []
for s in W:
    session = '\n'.join(
        ' '.join(str(x) for x in W[s][c])
        for c in W[s]
        if W[s][c]
    )
    sessions += [session]
    if session in session_counter:
        session_counter[session] += 1
        rm_list += [s]
    else:
        session_counter[session] = 1

In [18]:
# Remove every session index recorded in rm_list from W
# (raises KeyError if an index is not present).
for duplicate_index in rm_list:
    W.pop(duplicate_index)

Adjust the indices in the dictionary `W` after removing the redundant sessions.

In [19]:
# Renumber the surviving sessions 0, 1, 2, ... in their current order,
# then release the old dictionary.
W_filter = dict(enumerate(W.values()))

del W

Save the output.

In [20]:
import pickle
# Serialise the re-indexed sessions to disk.
with open('../data/W.pkl', mode='wb') as sessions_file:
    pickle.dump(W_filter, sessions_file)

In [21]:
# Serialise the two-way word/id lookup next to the sessions file.
with open('../data/word_map.pkl', mode='wb') as word_map_file:
    pickle.dump(word_map, word_map_file)