In [82]:
import orjson
import time
import logging

import os
import sys
sys.path.extend([".", "../.."])
from nebula.misc import get_path
from nebula import PEDynamicFeatureExtractor, JSONTokenizer

SCRIPT_PATH = get_path(type="notebook")

In [83]:
# Dotted report field names handed to PEDynamicFeatureExtractor as `speakeasyRecordFields`.
# NOTE(review): presumably these are the only report fields the extractor keeps —
# confirm against PEDynamicFeatureExtractor.
SPEAKEASY_RECORD_FIELDS = [
    'file_access.event',
    'file_access.path',
    'network_events.traffic.server',
    'network_events.traffic.port',
    'registry_access.event',
    'registry_access.path',
    'apis.api_name',
    'apis.args',
    'apis.ret_val',
]
# Handed to the extractor as `recordLimits`.
# NOTE(review): presumably a cap on the number of records kept per field — confirm.
SPEAKEASY_RECORD_LIMITS = {"network_events.traffic": 256}
# Characters given to the tokenizers as `patternCleanup`;
# JSONTokenizerBPE.clear_json_event replaces each of them with a space.
JSON_CLEANUP_SYMBOLS = ['"', "'", ":", ",", "[", "]", "{", "}", "\\", "/"]
# Sub-folder basenames that readAndFilterFolders labels 0; every other folder is labelled 1.
BENIGN_FOLDERS = ["report_clean", "report_windows_syswow64"]

# Data locations, resolved two directory levels above the notebook's own path.
REPO_ROOT = os.path.join(SCRIPT_PATH, "..", "..")
EMULATION_TRAINSET_PATH = os.path.join(REPO_ROOT, "data", "data_raw", "windows_emulation_trainset")
EMULATION_TESTSET_PATH = os.path.join(REPO_ROOT, "data", "data_raw", "windows_emulation_testset")

# Its `filter_and_normalize_report` method is used below as the per-report parser.
extractor = PEDynamicFeatureExtractor(
    speakeasyRecordFields=SPEAKEASY_RECORD_FIELDS,
    recordLimits=SPEAKEASY_RECORD_LIMITS
)

def readAndFilterFolders(subFolders, parserFunction, limit=None):
    """
    Reads and filters the JSON reports found directly inside each of subFolders.

    Args:
        subFolders (list): Paths of folders that contain "*.json" report files.
        parserFunction (callable): Called with the decoded JSON of each report.
            Reports for which it returns a falsy value are skipped.
        limit (int, optional): Maximum number of JSON reports read per folder.
            Defaults to None (no limit).

    Returns:
        tuple: (events, y, yHashes) as three parallel lists: the parsed
        records, their labels (0 if the folder's basename is listed in
        BENIGN_FOLDERS, 1 otherwise) and the report file names without the
        ".json" extension.
    """
    events = []
    y = []
    yHashes = []
    for subFolder in subFolders:
        # The label depends only on the folder, so compute it once per folder.
        label = 0 if os.path.basename(subFolder) in BENIGN_FOLDERS else 1

        # Filter first, then apply the limit, so non-JSON entries do not use up
        # the budget. Sorting makes the limited subset reproducible, because
        # os.listdir returns entries in arbitrary order.
        reportNames = sorted(x for x in os.listdir(subFolder) if x.endswith(".json"))[:limit]
        for reportName in reportNames:
            file = os.path.join(subFolder, reportName)
            # Read raw bytes: orjson decodes UTF-8 itself, whereas text mode
            # would decode with the platform's locale encoding.
            with open(file, "rb") as f:
                entryPoints = orjson.loads(f.read())

            jsonEventRecord = parserFunction(entryPoints)
            if not jsonEventRecord:
                continue

            events.append(jsonEventRecord)
            # splitext removes exactly the extension. str.rstrip('.json') would
            # strip any trailing '.', 'j', 's', 'o', 'n' characters instead.
            yHashes.append(os.path.splitext(reportName)[0])
            y.append(label)
    return events, y, yHashes

# Every "report_*" entry of the train-set directory is treated as one source folder.
subFoldersTrain = [os.path.join(EMULATION_TRAINSET_PATH, x) for x in os.listdir(EMULATION_TRAINSET_PATH) if x.startswith("report_")]

# limit=100 bounds how many entries of each sub-folder are considered, keeping this cell quick to re-run.
eventsTrain, yTrain, yHashesTrain = readAndFilterFolders(subFoldersTrain, extractor.filter_and_normalize_report, limit=100)

In [84]:
# Preview the first parsed training record to see the structure the extractor produces.
eventsTrain[0:1]

[{'file_access': [{'event': 'create',
    'path': '<drive>\\windows\\temp\\golfinfo.ini'},
   {'event': 'create', 'path': '<drive>\\windows\\temp\\golfset.ini'},
   {'event': 'create', 'path': '<drive>\\windows\\temp\\golfinfo.ini'},
   {'event': 'write', 'path': '<drive>\\windows\\temp\\golfinfo.ini'},
   {'event': 'open', 'path': '<drive>\\windows\\system32\\<sha256>'},
   {'event': 'read', 'path': '<drive>\\windows\\system32\\<sha256>'},
   {'event': 'create', 'path': 'cd.exedows\\temp\\hgdraw.dll'},
   {'event': 'write', 'path': 'cd.exedows\\temp\\hgdraw.dll'}],
  'apis': [{'api_name': 'KERNEL32.GetSystemTimeAsFileTime',
    'args': ['0x1211fd8'],
    'ret_val': None},
   {'api_name': 'KERNEL32.GetCurrentProcessId',
    'args': [],
    'ret_val': '0x7e148'},
   {'api_name': 'KERNEL32.GetCurrentThreadId',
    'args': [],
    'ret_val': '0x7e15c'},
   {'api_name': 'KERNEL32.GetTickCount', 'args': [], 'ret_val': '0x5265c14'},
   {'api_name': 'KERNEL32.QueryPerformanceCounter',
    'ar

In [85]:
# Baseline tokenizer from the nebula package, configured with the same cleanup symbols
# that JSONTokenizerBPE uses by default.
tokenizer = JSONTokenizer(
    patternCleanup=JSON_CLEANUP_SYMBOLS
)
#tokenizer.clearJsonEvent(eventsTrain)

In [86]:
import sentencepiece as spm
# Example of a raw SentencePiece training call, kept for reference only.
# spm.SentencePieceTrainer.Train(rf'''--input=auditd_total.cm --model_prefix=auditd --vocab_size=1024 --model_type=bpe --split_by_number=false''')
# Processor created without a model file; below it is only used to print its class.
sp = spm.SentencePieceProcessor()

In [87]:
# Compare the class of the raw SentencePiece processor with the class of the
# tokenizer that JSONTokenizer wraps internally.
print(sp.__class__)
print(tokenizer.tokenizer.__class__)

<class 'sentencepiece.SentencePieceProcessor'>
<class 'nltk.tokenize.regexp.WhitespaceTokenizer'>


In [279]:
from nebula.constants import *
import string 
import numpy as np

# ASCII letters, digits and punctuation. Built once at definition time instead
# of being re-concatenated and linearly scanned for every input character.
_KEPT_ASCII_CHARS = frozenset(string.ascii_letters + string.digits + string.punctuation)


def get_alphanum_chars(s):
    """Return `s` without any character outside ASCII letters, digits and punctuation.

    Despite the name, ASCII punctuation is kept. Whitespace, control
    characters and non-ASCII characters are removed.

    Args:
        s (str): Input string.

    Returns:
        str: The kept characters of `s`, in their original order.
    """
    return ''.join(ch for ch in s if ch in _KEPT_ASCII_CHARS)


class JSONTokenizerBPE:
    """SentencePiece-backed tokenizer for JSON-like event records.

    Records are flattened to lower-case text, stripped of the configured
    cleanup symbols and stopwords, and then encoded with a SentencePiece model
    that is either loaded from disk (`model_path`) or produced by `train`.
    """

    _MODEL_SUFFIX = ".model"

    def __init__(self,
                model_path=None,
                patternCleanup=JSON_CLEANUP_SYMBOLS,
                stopwords=SPEAKEASY_TOKEN_STOPWORDS,
                specialTokens = ["<unk>", "<pad>", "<mask>"]):
        """
        Args:
            model_path (str, optional): Path of a trained SentencePiece model,
                with or without the ".model" extension. When omitted, the
                instance has to be trained with `train` before use.
            patternCleanup (list): Substrings replaced with a space during cleanup.
            stopwords: Container of words dropped during cleanup.
            specialTokens (list): Special tokens; each token's id is its
                position in this list. The list is only read, never mutated.
        """
        self.patternCleanup = patternCleanup
        self.stopwords = stopwords

        # NOTE(review): these ids are positions in `specialTokens`, not ids
        # queried from the SentencePiece model. Confirm they agree with the ids
        # the trained model reserves.
        self.specialTokens = dict(zip(specialTokens, range(len(specialTokens))))
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.mask_token = "<mask>"
        self.pad_token_id = self.specialTokens[self.pad_token]
        self.unk_token_id = self.specialTokens[self.unk_token]
        self.mask_token_id = self.specialTokens[self.mask_token]

        # Default target length, so pad_sequence also works before
        # pad_sequence_list has been called.
        self.sequenceLength = 512
        self.vocab = None
        self.reverse_vocab = None
        # Model path without the ".model" extension; load_vocab derives the
        # ".vocab" file name from it.
        self.model_path = None

        if model_path:
            model_path = os.fspath(model_path)
            # Remove the extension as a suffix. str.rstrip(".model") strips a
            # character set and would also eat the tail of names such as
            # "my_model".
            if model_path.endswith(self._MODEL_SUFFIX):
                model_path = model_path[:-len(self._MODEL_SUFFIX)]
            self.model_path = model_path
            self.tokenizer = spm.SentencePieceProcessor(model_file=model_path + self._MODEL_SUFFIX)
            # The vocab file is optional: train() deletes it by default.
            if os.path.isfile(model_path + ".vocab"):
                self.load_vocab()
        else:
            self.tokenizer = spm.SentencePieceTrainer
    
    def split_string_to_chunks(self, s, chunkSize=4192):
        """Splits a long string into chunks of at most chunkSize characters
        without cutting a word in the middle.

        Words are delimited by single spaces and every word is followed by one
        space in the output. A single word of chunkSize characters or more
        cannot be split and forms a chunk of its own that exceeds the limit.

        Args:
            s (str): Long string.
            chunkSize (int, optional): Maximum chunk length. Defaults to 4192.

        Returns:
            list: List of smaller strings.
        """
        chunks = []
        currentChunk = ""
        for word in s.split(" "):
            if len(currentChunk) + len(word) < chunkSize:
                currentChunk += word + " "
            else:
                # Do not emit an empty chunk when the very first word is
                # already too long for one chunk.
                if currentChunk:
                    chunks.append(currentChunk)
                currentChunk = word + " "
        chunks.append(currentChunk)
        return chunks

    def clear_json_event(self, jsonData):
        """
        Flattens a json event to lower-case text without cleanup symbols and stopwords.

        The input is converted with str(), so bytes input keeps its "b" repr
        prefix. Every cleanup pattern is replaced by a space, stopwords are
        dropped, and the remaining words are reduced to ASCII letters, digits
        and punctuation via get_alphanum_chars.

        Args:
            jsonData (str | bytes | list | dict): Event(s) to clean.

        Returns:
            str: Space-joined cleaned words. Repeated spaces are preserved.
        """
        assert isinstance(jsonData, (str, bytes, list, dict))
        jsonData = str(jsonData).lower()
        for pattern in self.patternCleanup:
            jsonData = jsonData.replace(pattern, " ")
        words = [get_alphanum_chars(x) for x in jsonData.split(" ") if x not in self.stopwords]
        return ' '.join(words)

    def load_vocab(self):
        """
        Loads "<model_path>.vocab" into `vocab` (piece -> index) and
        `reverse_vocab` (index -> piece).

        Raises:
            ValueError: If no model path is known yet, i.e. the instance was
                neither created with `model_path` nor trained.
        """
        model_path = getattr(self, "model_path", None)
        if not model_path:
            raise ValueError("No model path set: pass model_path to the constructor or call train() first.")
        with open(model_path+".vocab", encoding="utf-8") as f:
            lines = f.read().splitlines()
        # Each line is "<piece>\t<score>"; the line number is the piece's index.
        pieces = [x.split("\t")[0] for x in lines]
        self.vocab = {k:i for i,k in enumerate(pieces)}
        self.reverse_vocab = {v:k for k,v in self.vocab.items()}
        

    def train(self, jsonData, model_prefix="speakeasy", vocab_size=1024, model_type="bpe", split_by_number=False, spLength=4192, removeTrainFiles=True):
        """
        Trains the tokenizer on the given json data.

        Writes the cleaned data to a temporary text file in the working
        directory, runs the SentencePiece trainer on it, then loads the
        resulting "<model_prefix>.model" and its vocabulary.

        Args:
            jsonData (str | bytes | list | dict): Training data.
            model_prefix (str): Prefix of the produced ".model" / ".vocab" files.
            vocab_size (int | float): Vocabulary size. Integral floats such as
                2e3 are accepted.
            model_type (str): SentencePiece model type.
            split_by_number (bool): Forwarded as --split_by_number.
            spLength (int): Maximum line length written to the training file,
                also forwarded as --max_sentence_length.
            removeTrainFiles (bool): Delete the temporary training file and the
                ".vocab" file once they have been consumed.
        """
        jsonDataClean = self.clear_json_event(jsonData)
        # SentencePiece limits sentence length, so split the text into lines
        # of at most spLength characters.
        jsonDataChunks = self.split_string_to_chunks(jsonDataClean, chunkSize=spLength)
        trainFile = f"{model_prefix}_tokenizer_train_{int(time.time())}.txt"

        trainCmd = " ".join([
            f"--input={trainFile}",
            f"--model_prefix={model_prefix}",
            # int() so that a float such as 2e3 is rendered as 2000, not 2000.0.
            f"--vocab_size={int(vocab_size)}",
            f"--model_type={model_type}",
            # Lower-case spelling, as SentencePiece command lines write booleans.
            f"--split_by_number={str(bool(split_by_number)).lower()}",
            f"--max_sentence_length={spLength}",
            f"--max_sentencepiece_length=64"
        ])
        try:
            with open(trainFile, "w", encoding="utf-8") as f:
                f.write("\n".join(jsonDataChunks))
            print(f"Training tokenizer with command: {trainCmd}")
            # Call the trainer class directly: self.tokenizer is a processor
            # (without Train) when the instance was created from a model file.
            spm.SentencePieceTrainer.Train(trainCmd)
        finally:
            # Do not leave the temporary file behind if training fails.
            if removeTrainFiles and os.path.exists(trainFile):
                os.remove(trainFile)
        self.tokenizer = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
        
        self.model_path = model_prefix
        self.load_vocab()

        if removeTrainFiles:
            os.remove(f"{model_prefix}.vocab")

    def _clean_batch(self, jsonData):
        """Wraps a single event in a list and cleans every event of the batch."""
        if isinstance(jsonData, (str, bytes, dict)):
            jsonData = [jsonData]
        return [self.clear_json_event(x) for x in jsonData]
    
    def tokenize(self, jsonData):
        """
        Tokenizes the given json data.

        Args:
            jsonData: A single event (str, bytes or dict) or an iterable of events.

        Returns:
            list: One list of SentencePiece pieces per event.
        """
        return [self.tokenizer.encode_as_pieces(x) for x in self._clean_batch(jsonData)]
    
    def encode(self, jsonData):
        """
        Encodes the given json data.

        Args:
            jsonData: A single event (str, bytes or dict) or an iterable of events.

        Returns:
            list: One list of SentencePiece ids per event.
        """
        return [self.tokenizer.encode_as_ids(x) for x in self._clean_batch(jsonData)]

    def pad_sequence(self, encodedSequence, sequenceLength=None):
        """
        Truncates or right-pads one encoded sequence to a fixed length.

        Args:
            encodedSequence (list): Token ids.
            sequenceLength (int, optional): Target length. Defaults to
                `self.sequenceLength`, which is 512 until pad_sequence_list
                changes it.

        Returns:
            list: Exactly `sequenceLength` ids, padded with `pad_token_id`.
        """
        if sequenceLength is None:
            sequenceLength = self.sequenceLength
        if len(encodedSequence) > sequenceLength:
            return encodedSequence[:sequenceLength]
        return encodedSequence + [self.pad_token_id] * (sequenceLength - len(encodedSequence))
    
    def pad_sequence_list(self, encodedSequenceList, sequenceLength=512):
        """
        Pads or truncates every sequence to `sequenceLength`.

        Also stores `sequenceLength` on the instance as the new default for
        pad_sequence.

        Returns:
            np.ndarray: int32 array with one row per sequence.
        """
        self.sequenceLength = sequenceLength
        return np.array([self.pad_sequence(x, sequenceLength) for x in encodedSequenceList], dtype=np.int32)

    def pad_sequences(self, encodedSequences, sequenceLength=512):
        """Alias of pad_sequence_list."""
        return self.pad_sequence_list(encodedSequences, sequenceLength=sequenceLength)

In [280]:
# Load an already trained model from disk; the commented lines show the training
# call that writes a model under the "speakeasy_2k" prefix.
t = JSONTokenizerBPE(model_path="speakeasy_2k.model")
t.tokenizer
#prefix=r"speakeasy_2k"
#t.train(eventsTrain, vocab_size=2e3, model_prefix=prefix)

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x0000015C3AE6BCF0> >

In [281]:
# First ten pieces of the first of three tokenized training records.
print(t.tokenize(eventsTrain[0:3])[0][0:10])

['▁file', '_', 'access', '▁create', '▁<', 'drive', '>', '▁windows', '▁temp', '▁golfinfo']


In [282]:
# A single record (dict) is wrapped into a one-element batch by encode(); show its first ten ids.
t.encode(eventsTrain[0])[0][0:10]

[1303, 1983, 946, 346, 724, 771, 1988, 286, 483, 1312]

In [283]:
# Round trip: decode the first ten ids of the first record back to text.
t.tokenizer.decode(t.encode(eventsTrain[0:10])[0][0:10])

'file_access create <drive> windows temp golfinfo'

In [284]:
# Encode ten records, then check that padding yields one fixed-length row per record.
encoded = t.encode(eventsTrain[0:10])

# Default sequenceLength is 512; the second call overrides it with 1024.
print(t.pad_sequences(encoded).shape)
print(t.pad_sequences(encoded, sequenceLength=1024).shape)

(10, 512)
(10, 1024)
