In [32]:
import os
import sys

# REPO_ROOT is the parent of the current working directory with one more
# "up" step appended (left unnormalised, exactly as joined). It is added to
# sys.path so that imports further down can be resolved from that directory.
basepath = os.path.dirname(os.path.abspath(os.curdir))
REPO_ROOT = os.path.join(basepath, os.pardir)
sys.path.append(REPO_ROOT)

import json
from tqdm import tqdm
from nebula.preprocessing import JSONTokenizerBPE, JSONTokenizerWhiteSpace
from nebula.constants import JSON_CLEANUP_SYMBOLS
from pandas import read_csv, to_datetime

# Number of report files to load (the first LIMIT entries of the sorted listing).
LIMIT = 30

# Path components are passed individually so os.path.join inserts the host
# OS separator; the former raw strings with backslashes only resolved on Windows.
DATASET_DIR = os.path.join(
    REPO_ROOT, "data", "data_raw", "Avast", "Public_Avast_CTU_CAPEv2_Dataset_Small"
)

folder = os.path.join(DATASET_DIR, "public_small_reports")
# os.listdir returns entries in arbitrary order; sorting first makes the
# LIMIT-sized subset identical on every run and every machine.
EXAMPLE_PATHS = [os.path.join(folder, x) for x in sorted(os.listdir(folder))[:LIMIT]]

label_file = os.path.join(DATASET_DIR, "public_labels.csv")
LABEL_FIELD = 'classification_family'
LABEL_TABLE = read_csv(label_file)
# Map each distinct label value (in sorted order) to a consecutive integer id.
LABEL_MAP = {
    family: idx
    for idx, family in enumerate(sorted(LABEL_TABLE[LABEL_FIELD].unique()))
}

In [33]:
# Per-field normalisers: each callable is applied to every entry of the
# same-named list in a report's behavior summary. Only the fields listed
# here are carried over into the normalised sample.
capa_normalizer = {
    "resolved_apis": lambda x: x.lower(),
    "mutexes": lambda x: x.lower()
}

X_raw_train = []
X_raw_test = []
y_train = []
y_test = []
train_test_split_date = '2019-08-01'
# Parse the cutoff once instead of on every loop iteration.
split_timestamp = to_datetime(train_test_split_date)
# Index the label table by hash once so each lookup avoids a full-table scan.
# drop_duplicates keeps the first row per hash (same row the former
# boolean-mask + .iloc[0] selected); drop=False keeps the 'sha256' column.
labels_by_hash = LABEL_TABLE.drop_duplicates(subset='sha256').set_index('sha256', drop=False)

for example in tqdm(EXAMPLE_PATHS):
    # The report file name (without ".json") is the sample's sha256.
    hhash = os.path.basename(example).replace(".json", "")
    # Raises KeyError naming the hash if the report has no label row.
    sample_data = labels_by_hash.loc[hhash]
    family = sample_data[LABEL_FIELD]

    with open(example, encoding='utf-8') as f:
        sample = json.load(f)
    sample = sample["behavior"]['summary']
    normalized_sample = {
        field: [normalize(x) for x in sample[field]]
        for field, normalize in capa_normalizer.items()
    }

    # Samples dated strictly before the cutoff go to train, the rest to test.
    if to_datetime(sample_data['date']) < split_timestamp:
        X_target, y_target = X_raw_train, y_train
    else:
        X_target, y_target = X_raw_test, y_test
    X_target.append(normalized_sample)
    y_target.append(LABEL_MAP[family])

100%|██████████| 30/30 [00:00<00:00, 56.34it/s]


In [35]:
# Build a JSONTokenizerWhiteSpace, train it on the training split, then
# encode that same split and report the shape of the encoded result.
whitespace_tokenizer_params = dict(
    vocab_size=300,
    seq_len=512,
    cleanup_symbols=JSON_CLEANUP_SYMBOLS,
    stopwords=[],
)
tokenizer = JSONTokenizerWhiteSpace(**whitespace_tokenizer_params)
tokenizer.train(X_raw_train)

encoded = tokenizer.encode(X_raw_train)
print(encoded.shape)

100%|██████████| 21/21 [00:00<00:00, 2333.01it/s]


(21, 512)


In [36]:
# Build a JSONTokenizerBPE with the same settings as the whitespace
# tokenizer and train it on the training split.
tokenizer_bpe = JSONTokenizerBPE(
    vocab_size=300,
    seq_len=512,
    cleanup_symbols=JSON_CLEANUP_SYMBOLS,
    stopwords=[]
)
tokenizer_bpe.train(X_raw_train)
# Encode with the BPE tokenizer itself. The cell previously called
# `tokenizer.encode`, so it reported the whitespace tokenizer's output.
# The result gets its own name so `encoded` (whitespace encoding) is not clobbered.
encoded_bpe = tokenizer_bpe.encode(X_raw_train)
print(encoded_bpe.shape)

	You need to train tokenizer with .train() or specify 'model_path=' during initialization!


(21, 512)
