In [54]:
import numpy as np
import json
import pandas as pd
import codecs
from collections import Counter
import itertools
from itertools import chain
from nltk.tokenize import RegexpTokenizer
from gensim.corpora import Dictionary
import re
import tqdm

In [2]:
with codecs.open("/Users/amantzio/Desktop/imperial/jupyter_notebook/Microsoft.IoT-Dump1.json", "r",'utf-8-sig') as f:
    data = json.load(f)


In [3]:
data_pd = pd.DataFrame(data)
data_pd["Commands"]=[tuple(session) for session in data_pd["Commands"]]
data_pd=data_pd.drop_duplicates(subset="Commands").reset_index(drop=True)

### Hanyang and Philip's cleaning:

In [None]:
def clean_commands(dat, no_below=2, no_above=1.1):
    """
    This function 
    1.splits multiple commands in the same line
    2. tokenize the commands
    3. replace rare commands by rarecommand

    :param dat: dataset
    :param no_below: Keep tokens which are contained in at least no_below documents.
    :param no_above: Keep tokens which are contained in no more than no_above documents 
    (fraction of total corpus size, not an absolute number).

    :return sessins_token_list: tokenized list of sessions of commands
    :return dictionary: dictionary generated
    """
    # for commands splitted by ;
    sessions = []
    for session in dat['Commands']:
        sessions.append([]) # to make list of lists
        for command in session:
            sessions[-1] += command.split('; ')
    # tokenizer
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9_\.\-\*]+')
    sessions_list = []
    commands_list = []
    for session in sessions:
        sessions_list.append([])
        commands_list.append([])
        for command in session:
            command_token = tokenizer.tokenize(command)
            sessions_list[-1] += [command_token]
            commands_list[-1] += command_token
    dictionary = Dictionary(commands_list) 
    dictionary.filter_extremes(no_below, no_above)
    # repleace rare commands by rarecommand
    dictionary.id2token[-1] = 'rarecommand' ###!!! add a 'rarecommand' token to the corpus and assign a unique value to it -1
    ''' -1 value indicates rareword, so when a token appears in sessions
    that is not in final dictionary (after filtering) it substitutes the unique value -1 to this token'''
    dictionary.token2id['rarecommand'] = -1 
    sessions_token_list = []
    for session in sessions_list:
        sessions_token_list.append([])
        commands_token_list = []
        for command in session:
            idxs = dictionary.doc2idx(command) ###!!! gives to tokens of command the unique values/ids assigned
            commands_token_list.append(' '.join([dictionary[idx] for idx in idxs]))
        sessions_token_list[-1] += commands_token_list

    return sessions_token_list, dictionary

In [None]:
data_pd_clean=clean_commands(data_pd)
sessions_list_cleaned=data_pd_clean[0] # list of lists with elements command strings

### Remove empty strings and '.' from sessions list of lists


In [68]:
# Detect position of patterns. patt2 found only in two sessions, remove manually
patt=re.compile(r"^$")# empty string
patt2=re.compile(r"^\.$") # onlyn . in string

for i in tqdm.tqdm(range(len(sessions_list_cleaned))):
    for j in range(len(sessions_list_cleaned[i])):
        if re.search(patt,sessions_list_cleaned[i][j]):
            print((i,j))

100%|██████████| 124883/124883 [00:22<00:00, 5645.00it/s]


In [64]:
# Remove "." from list of commands
sessions_list_cleaned[77974].remove(".")
sessions_list_cleaned[44135].remove(".")

In [67]:
# Remove empty strings for sessions list of lists
for i in range(len(sessions_list_cleaned)):
    sessions_list_cleaned[i]=list(filter(None,sessions_list_cleaned[i]))

### Wildcard number for hostnames

In [69]:
replace_dic={r"AKEMI\_[0-9]{4}":"AKEMI_num",
           r"BOT_ID\s[0-9]{4}":"BOT_ID num",
           r"Ex0\_[0-9]{4}":"Ex0_num",
           r"HORIZON\_[0-9]{4}":"HORIZON_num",
           r"Hades\_[0-9]{4}":"Hades_num",
           r"Hikari\_[0-9]{4}":"Hikari_num",
           r"Kill\_[0-9]{4}":"Kill_num",
           r"Mewski\_[0-9]{4}":"Mewski_num",
           r"SEFA\_ID\s[0-9]{4}":"SEFA_ID num",
           r"UNSTABLE\_[0-9]{4}":"UNSTABLE_num",
           r"WOLF\_[0-9]{4}":"WOLF_num",
           r"dstrtn\_[0-9]{4}":"dstrtn_num",
           r"hhh\_lol\s[0-9]{4}":"hhh_lol num",
           r"orphic\_[0-9]{4}":"orphic_num",
           r"shibui\_[0-9]{4}":"shibui_num",
           r"slumpp\_[0-9]{4}":"slumpp_num"}

In [70]:
for i in tqdm.tqdm(range(len(sessions_list_cleaned))):
    for j in range(len(sessions_list_cleaned[i])):
        for key, value in replace_dic.items():
            sessions_list_cleaned[i][j] = re.sub(key, value, sessions_list_cleaned[i][j])

100%|██████████| 124883/124883 [04:18<00:00, 482.61it/s]


### Fill-in cut commands

In [88]:
#patterns to be replaced
patt_list=["\.nippe"]
patt_list1=["\.nippo"]
patt_list2=["bin b","bin bu","bin bus","bin busy","bin busyb","bin busybo"]
patt_list3=["Ui","Uir","Uiru"]
patt_list4=['bin busybox cat bin busybox w','bin busybox cat bin busybox wh','bin busybox cat bin busybox whi',
            'bin busybox cat bin busybox whil','bin busybox cat bin busybox while','bin busybox cat bin busybox while r',
            'bin busybox cat bin busybox while rea', 'bin busybox cat bin busybox while read']
patt_list5=['bin busybox rm proc sy','bin busybox rm proc sys','bin busybox rm proc sys f',
            'bin busybox rm proc sys fs','bin busybox rm proc sys fs b','bin busybox rm proc sys fs bi',
            'bin busybox rm proc sys fs bin', 'bin busybox rm proc sys fs binfm','bin busybox rm proc sys fs binfmt', 
            'bin busybox rm proc sys fs binfmt_','bin busybox rm proc sys fs binfmt_m','bin busybox rm proc sys fs binfmt_mi',
            'bin busybox rm proc sys fs binfmt_mis','bin busybox rm proc sys fs binfmt_misc']
patt_list6=['bin busybox cat proc sys fs b','bin busybox cat proc sys fs bi','bin busybox cat proc sys fs binf',
           'bin busybox cat proc sys fs binfm','bin busybox cat proc sys fs binfmt','bin busybox cat proc sys fs binfmt_',
           'bin busybox cat proc sys fs binfmt_m','bin busybox cat proc sys fs binfmt_mi','bin busybox cat proc sys fs binfmt_misc']
patt_list7=['sys fs c',"sys fs cg","sys fs cgro","sys fs cgrou","sys fs cgroup b","sys fs cgroup blki"]
patt_list8=['sys fs f','sys fs fu','sys fs fus','sys fs fuse','sys fs fuse c','sys fs fuse co',
            'sys fs fuse con','sys fs fuse conn','sys fs fuse conne','sys fs fuse connec','sys fs fuse connect',
           'sys fs fuse connecti','sys fs fuse connectio','sys fs fuse connection']
patt_list9=["cgroup p",'cgroup pe','cgroup perf','cgroup perf_','cgroup perf_e',
            'cgroup perf_eve','cgroup perf_even']
patt_list10=["\.hum","\.huma"]

In [91]:
patt_all=[patt_list,patt_list1,patt_list2,patt_list3,patt_list4,patt_list5,patt_list6,patt_list7,patt_list8,patt_list9,patt_list10]

In [92]:
#list of strings corresponding to replacements of patterns in patt_all (1st element in replace list corresponds to replacement of patterns in 1st list of patt_all etc..
replace_list=['.nipped','.nippon','bin busybox','Uirusu',
              'bin busybox cat bin busybox while read i',
              'bin busybox rm proc sys fs binfmt_misc .',
              'bin busybox cat proc sys fs binfmt_misc .',
              'sys fs cgroup blkio',
               'sys fs fuse connections','cgroup perf_event',
               '.human']

In [95]:
# Replace cut commands with replace_list
for i in tqdm.tqdm(range(len(sessions_list_cleaned))):
    for j in range(len(sessions_list_cleaned[i])):
        for k in range(len(patt_all)):
            for l in range(len(patt_all[k])):
                patt=re.compile("(?<!\w)"+patt_all[k][l]+"(?!\w)$")
                if re.search(patt,sessions_list_cleaned[i][j]):
                    if j==len(sessions_list_cleaned[i])-1: #commands closing a session (that's why commands in patt_all considered as cut commands)
                        sessions_list_cleaned[i][j] = re.sub(patt, replace_list[k], sessions_list_cleaned[i][j])

100%|██████████| 124883/124883 [53:22<00:00, 39.00it/s] 


### HEX patt replace


In [96]:
replacements = {r"(?<!\.)\bx[a-fA-F0-9]{2}\b(?!\.)": " HEX "}
#iterate throught corpus
for sess in tqdm.tqdm(range(len(sessions_list_cleaned))):
    for comm in range(len(sessions_list_cleaned[sess])):
        #iterate through replacement patters
        for key, value in replacements.items():
            text_test = re.sub(key, value, sessions_list_cleaned[sess][comm])
            if text_test.startswith(" HEX "): 
                text_test = text_test[1:] 
            if text_test.endswith(" HEX "):
                text_test = text_test[:-1]
            if text_test.endswith(" HEX  "):
                text_test = text_test[:-2]    
        text_test = re.sub(' +', ' ', text_test) #detect double white spaces and substitute with single space
        sessions_list_cleaned[sess][comm] = text_test

100%|██████████| 124883/124883 [01:30<00:00, 1377.45it/s]


### Format data

In [97]:
# For each command, Create list of tokens(words) from string command

sessions_list_new=[]
for sess in tqdm.tqdm(sessions_list_cleaned):
    sess_ls=[]
    for comm in sess:
        sess_ls.append(comm.split(' '))
    sessions_list_new.append(sess_ls)

100%|██████████| 124883/124883 [02:24<00:00, 865.10it/s] 


In [98]:
# Make list of list of lists a single list (to then find the unique tokens)
flat_sessions_list_new=[k for i in sessions_list_new for j in i for k in j]


In [99]:
uniq_tokens=set(flat_sessions_list_new)

In [100]:
# Create a dictionary that maps the unique tokens to unique numbers (Vocabulary)

mapping={key:val for val,key in enumerate(uniq_tokens)}

In [102]:
# Obtain the final form of the data to be used as input in algo
# Use mapping to assign the unique values to the tokens in list of sessions

final_sessions_list=[]
for i in tqdm.tqdm(sessions_list_new):
    final_ls=[]
    for j in i:
        final_ls.append(np.vectorize(mapping.__getitem__)(j))
    final_sessions_list.append(final_ls)

100%|██████████| 124883/124883 [04:29<00:00, 463.43it/s]


In [112]:
# Three empty string tokens '' identified, drop them from final_sessions_list and from mapping dictionary

for i in tqdm.tqdm(range(len(final_sessions_list))):
    for j in range(len(final_sessions_list[i])):
        if 0 in final_sessions_list[i][j]:
            #print(i,j)
            '''Exclude empty string '' from list of tokens'''
            mask=final_sessions_list[i][j]!=0 
            final_sessions_list[i][j]=final_sessions_list[i][j][mask]

100%|██████████| 124883/124883 [00:45<00:00, 2727.50it/s]


In [118]:
del mapping['']