In [33]:
import os
import yaml
import torch
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, roc_curve, auc

# Upper bound on how many samples load_data() keeps per split (applied as a
# slice `[:LIMIT]`); None keeps every sample.
LIMIT = None
# Random seed handed to sklearn's shuffle() inside load_data().
SEED = 33
# Note that '__file__' is a quoted string literal here, not the module
# attribute: abspath() resolves it against the current working directory, so
# ROOT ends up being the directory the kernel was started in.
ROOT = os.path.dirname(os.path.abspath('__file__'))
# YAML file read by apply_sigma(), which uses its ['detection']['keywords'] list.
sigma_yml = os.path.join(ROOT, "data", "rvrs_sigma.yml")

def load_data():
    """Load the baseline / malicious parquet splits and assemble labelled sets.

    For each of the four directories under ``ROOT/data`` the first entry whose
    name ends in ``.parquet`` is read into a DataFrame, and its ``cmd`` column
    is used as the sample text.  Baseline commands get label 0, malicious
    ("rvrs") commands get label 1.

    NOTE(review): the malicious splits are crossed -- the training set is
    built from ``test_rvrs`` and the test set from ``train_rvrs``.  The code
    this was rewritten from kept the un-crossed variant only as commented-out
    lines, so this looks intentional; confirm before "fixing" it.

    Returns:
        tuple: ``(X_train_cmds, y_train, X_test_cmds, y_test)`` where the X
        values are shuffled lists of commands and the y values are shuffled
        ``np.int8`` label arrays, each truncated to the first ``LIMIT``
        items (``LIMIT = None`` keeps everything).
    """
    split_dirs = {
        'train_baseline': 'data/train_baseline.parquet/',
        'test_baseline': 'data/test_baseline.parquet/',
        'train_rvrs': 'data/train_rvrs.parquet/',
        'test_rvrs': 'data/test_rvrs.parquet/',
    }

    # Pass 1: locate the first *.parquet entry inside every split directory.
    part_files = {}
    for split, rel_dir in split_dirs.items():
        candidates = [
            entry
            for entry in os.listdir(os.path.join(ROOT, rel_dir))
            if entry.endswith('.parquet')
        ]
        part_files[split] = candidates[0]

    # Pass 2: load each located file as a DataFrame.
    frames = {
        split: pd.read_parquet(os.path.join(ROOT, rel_dir, part_files[split]))
        for split, rel_dir in split_dirs.items()
    }

    def commands(split):
        # Plain Python list of the 'cmd' column for one split.
        return frames[split]['cmd'].values.tolist()

    def labels(benign_split, malicious_split):
        # Zeros for every baseline row followed by ones for every malicious row.
        counts = [len(frames[benign_split]), len(frames[malicious_split])]
        return np.repeat(np.array([0, 1], dtype=np.int8), counts)

    # Train: baseline train + malicious *test* (see NOTE in the docstring).
    X_train_cmds, y_train = shuffle(
        commands('train_baseline') + commands('test_rvrs'),
        labels('train_baseline', 'test_rvrs'),
        random_state=SEED,
    )

    # Test: baseline test + malicious *train* (see NOTE in the docstring).
    X_test_cmds, y_test = shuffle(
        commands('test_baseline') + commands('train_rvrs'),
        labels('test_baseline', 'train_rvrs'),
        random_state=SEED,
    )

    # Optional truncation for quick experiments; a None bound is a no-op slice.
    return (
        X_train_cmds[:LIMIT],
        y_train[:LIMIT],
        X_test_cmds[:LIMIT],
        y_test[:LIMIT],
    )

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy.

    Args:
        x: a scalar or array-like accepted by ``np.exp``.

    Returns:
        The element-wise logistic of ``x``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def get_tpr_at_fpr(predicted_probs, true_labels, fprNeeded=1e-4):
    """Return the true-positive rate reachable at a false-positive rate cap.

    The ROC curve is computed with ``sklearn.metrics.roc_curve`` and the TPR
    of the last ROC point whose FPR is ``<= fprNeeded`` is returned.

    Args:
        predicted_probs: per-sample scores (array-like or ``torch.Tensor``).
            No sigmoid is applied here; callers holding logits must convert
            them first.
        true_labels: per-sample binary ground truth (array-like or
            ``torch.Tensor``).
        fprNeeded: maximum false-positive rate allowed.

    Returns:
        The TPR at the requested FPR, or ``np.nan`` when every FPR value
        returned by ``roc_curve`` is NaN.
    """
    # Both inputs get the same tensor handling: a CUDA or grad-tracking tensor
    # cannot be consumed by scikit-learn directly.
    if isinstance(predicted_probs, torch.Tensor):
        predicted_probs = predicted_probs.cpu().detach().numpy()
    if isinstance(true_labels, torch.Tensor):
        true_labels = true_labels.cpu().detach().numpy()

    fpr, tpr, _ = roc_curve(true_labels, predicted_probs)

    # NOTE(review): an all-NaN FPR presumably means the labels contain no
    # negative samples -- confirm against the scikit-learn documentation.
    if np.isnan(fpr).all():
        return np.nan

    # roc_curve documents its FPR values as increasing, so the last point that
    # satisfies the cap is the best operating point within the FPR budget.
    return tpr[fpr <= fprNeeded][-1]

# Materialise the train/test splits once as notebook-level globals and report
# how many commands each split contains.
X_train_cmds, y_train, X_test_cmds, y_test = load_data()
print('X_train_cmds: ', len(X_train_cmds))
print('X_test_cmds: ', len(X_test_cmds))

X_train_cmds:  501573
X_test_cmds:  501570


In [29]:
def apply_sigma(X_cmds, y, sigma_yml, verbose=False):
    """Label commands by substring-matching the keywords of a YAML rule file.

    The file at ``sigma_yml`` is parsed and its ``['detection']['keywords']``
    list is used as the pattern set.  A command is predicted 1 as soon as any
    keyword occurs in it as a plain substring (no regex), otherwise 0.

    Args:
        X_cmds: iterable of command strings.
        y: true labels, indexable by position; only read when ``verbose`` is
            set, to show the label next to each hit.
        sigma_yml: path to the YAML rule file.
        verbose: when True, print every match together with its true label.

    Returns:
        list[int]: one 0/1 prediction per command, in input order.
    """
    # NOTE(review): yaml.FullLoader can build more than plain data types; if
    # the rule file is only scalars/lists/dicts, yaml.safe_load would be the
    # safer choice -- confirm before switching.
    with open(sigma_yml, "r") as handle:
        rule = yaml.load(handle, Loader=yaml.FullLoader)
    keywords = rule['detection']['keywords']

    # Sentinel distinguishes "nothing matched" from any real keyword value.
    no_match = object()

    predictions = []
    for i, cmd in enumerate(X_cmds):
        # First keyword contained in the command, scanning in file order.
        pattern = next((kw for kw in keywords if kw in cmd), no_match)
        if pattern is no_match:
            predictions.append(0)
            continue
        if verbose:
            print(f"[!] True label: {y[i]} | Found '{pattern}' in '{cmd}'")
        predictions.append(1)

    return predictions

In [34]:
# Keyword-rule predictions (list of 0/1) for every command in the test split.
y_test_pred = apply_sigma(X_test_cmds, y_test, sigma_yml)

In [46]:
# Class balance of the test split: labels are 0/1, so the sum is the number of
# positive (label 1) samples and the remainder is the number of negatives.
print(np.sum(y_test))
print(len(y_test) - np.sum(y_test))

266501
235069


In [45]:
# Boolean selector for the samples whose true label is 1.
mask = y_test == 1
# Despite the names, these arrays hold the 0/1 *predictions* for the truly
# positive and truly negative samples respectively; summing them counts the
# predicted 1s in each group, i.e. the true-positive and false-positive counts.
true_positives = np.array(y_test_pred)[mask]
false_positives = np.array(y_test_pred)[~mask]
print("TP", np.sum(true_positives))
print("FP", np.sum(false_positives))

TP 8974
FP 0


In [36]:
# Summary metrics for the keyword-rule baseline on the test split.  The
# predictions are hard 0/1 labels, not scores, so the ROC-based numbers (TPR at
# fixed FPR, AUC) are computed from a ROC curve built on binary predictions.
tpr = get_tpr_at_fpr(y_test_pred, y_test, fprNeeded=1e-4)
acc = accuracy_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
# Stored as `roc_auc` rather than `auc`: the latter would rebind (and break)
# the `auc` function imported from sklearn.metrics at the top of the notebook.
roc_auc = roc_auc_score(y_test, y_test_pred)

print('TPR at FPR=1e-4: {:.4f}'.format(tpr))
print('Accuracy: {:.4f}'.format(acc))
print('F1 score: {:.4f}'.format(f1))
print('AUC: {:.4f}'.format(roc_auc))

TPR at FPR=1e-4: 0.0337
Accuracy: 0.4866
F1 score: 0.0652
AUC: 0.5168
