# Full fasttext pipeline

This notebook can be used to test different preprocessing scenarios, e.g. a different separator.


- (A) Take raw events and produce a string with the event sequence within a time window
- (B) Train a FastText model on this data
- (C) Take a host with Red Teaming actions and preprocess its events into the FastText format
- (D) Evaluate by means of anomaly detection and plot the results in 3D

In [None]:
import fasttext
import os
import numpy as np
import sys
sys.path.append("../../")
from utils.preprocessing import auditdPreprocessTable, auditdReadAndFilterFile
from utils.misc import getNotebookPath, splitDataFrameTimeStampToChunks, isolationForestAnomalyDetctions
from utils.plots import plot3D, tSneReductionTo3D

SCRIPT_PATH = getNotebookPath()

# input: folder with the raw auditd files, resolved relative to this notebook.
# The trailing "" makes os.path.join end the path with a separator, so the
# value can still be used as a plain string prefix (DATA_FOLDER + name).
DATA_FOLDER = os.path.join(SCRIPT_PATH, "..", "..", "data", "auditd_raw", "")
LIMIT = None  # set to an int to process only the first LIMIT directory entries
files = os.listdir(DATA_FOLDER)[:LIMIT]

# output: text file that receives the preprocessed lines of every input file
OUT_FILE = os.path.join(SCRIPT_PATH, "auditd_parsed.out")
#OUT_FILE = os.path.join(SCRIPT_PATH, "auditd_parsed_noLengthLimit.out")


print('[*] preprocessing ... ')
# Opening once in "w" mode clears any previous content and keeps a single
# handle for the whole run instead of reopening the file for every input.
with open(OUT_FILE, "w", encoding='utf-8') as f:
    for file in files:
        fileFullPath = os.path.join(DATA_FOLDER, file)
        df = auditdReadAndFilterFile(fileFullPath)

        # run preprocessing
        out = auditdPreprocessTable(df)
        #out = auditdPreprocessTable(df, lengthLimit=None)
        f.writelines(out)

print('[*] model training ... ')
EPOCHS = 5
# Embedding size, defined once so the model file name cannot drift away from
# the dimension the model is actually trained with.
DIM = 100
modelName = f'auditd_model_epoch{EPOCHS}_dim{DIM}.bin'
# Train on the file the preprocessing stage wrote (OUT_FILE). A bare
# 'auditd_parsed.out' is resolved against the current working directory,
# which is not necessarily the notebook directory.
model = fasttext.train_unsupervised(input=OUT_FILE,
                                    model='cbow',
                                    dim=DIM,
                                    epoch=EPOCHS)
model.save_model(modelName)
# To skip training and reuse a previously saved model instead:
#model = fasttext.load_model(modelName)


# intrinsic evaluation on red team host
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
testpath = r"C:\Users\dtrizna\Code\ReverseShellModel\data\auditd_redTeam_Vanquish\hn0-mcvq-h.mcvq-hdi-1\\"
files = os.listdir(testpath)

# Take the embedding width from the model instead of hard-coding it, so this
# stage keeps working when the model is trained with a different dimension.
dim = model.get_dimension()

# Vectors are collected in a list and stacked once after the loop; calling
# np.vstack for every vector would copy the whole matrix each time.
vectors = []
# arrRaw[k] is the events table the k-th vector was computed from, so an
# index into the labels maps straight back to the raw events.
arrRaw = []
for i, testfile in enumerate(files):
    df = auditdReadAndFilterFile(os.path.join(testpath, testfile))
    if df.empty:
        continue

    # ground timestamps to 5 min intervals
    df = splitDataFrameTimeStampToChunks(df.copy(), chunkSize="5min")

    # get grounded timestamps
    chunks = df.TimeStamp.unique()

    # parse each interval separately
    for chunk in chunks:
        newdf = df[df.TimeStamp == chunk].copy()
        auditdHostActivities = auditdPreprocessTable(newdf)

        for hostActivity in auditdHostActivities:
            vectors.append(model.get_sentence_vector(hostActivity.strip()))
            arrRaw.append(newdf)

    print(f"{i+1}/{len(files)} File: {testfile}; Chunks: {len(chunks)}; Array: {(len(vectors), dim)}")

# One (n_vectors, dim) float64 matrix; the empty fallback keeps the shape
# well-defined when no file produced any vector.
arr = np.vstack(vectors).astype(np.float64) if vectors else np.empty((0, dim))

labels = isolationForestAnomalyDetctions(arr)
arrnew = tSneReductionTo3D(arr)
plot3D(arrnew, labels, "Isolation Forest")

# Every row whose label is not 1 is treated as anomalous.
# NOTE(review): presumably the helper follows scikit-learn's IsolationForest
# convention (1 = inlier, -1 = outlier) -- confirm in utils.misc.
anomalousActivityTables = [arrRaw[x] for x in np.where(labels != 1)[0]]