In [8]:
import os
import yaml
import torch
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, roc_curve, auc

# Optional cap on the number of loaded samples.
# NOTE(review): None presumably means "no limit" — confirm against load_data.
LIMIT = None
# Random seed for the experiment (presumably forwarded to load_data — verify).
SEED = 33
# Path of the repository root, relative to the notebook's working directory.
ROOT = ".."

import sys
# Put ROOT on the module search path before importing from the `src` package below.
sys.path.append(ROOT)
from src.data_utils import load_data

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise via NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def get_tpr_at_fpr(predicted_probs, true_labels, fprNeeded=1e-5):
    """Return the true-positive rate at a false-positive-rate budget.

    The ROC curve is computed with ``roc_curve(true_labels, predicted_probs)``
    and the TPR of the last returned point whose FPR is ``<= fprNeeded`` is
    reported.

    Args:
        predicted_probs: Scores passed to ``roc_curve`` as ``y_score``.
            May be array-like or a ``torch.Tensor``.
        true_labels: Ground-truth labels passed to ``roc_curve`` as
            ``y_true``. May be array-like or a ``torch.Tensor``.
        fprNeeded: Maximum false-positive rate allowed for the selected point.

    Returns:
        The selected TPR value, or ``np.nan`` when every FPR value returned by
        ``roc_curve`` is NaN.
    """
    # Tensors are moved to host memory and detached before being handed to
    # roc_curve. Both inputs are treated alike: previously only the labels
    # were converted, so a tensor of scores was passed through unchanged.
    if isinstance(predicted_probs, torch.Tensor):
        predicted_probs = predicted_probs.cpu().detach().numpy()
    if isinstance(true_labels, torch.Tensor):
        true_labels = true_labels.cpu().detach().numpy()

    fpr, tpr, _thresholds = roc_curve(true_labels, predicted_probs)
    if np.isnan(fpr).all():
        return np.nan

    # Last ROC point, in the order returned by roc_curve, that fits the budget.
    return tpr[fpr <= fprNeeded][-1]



In [9]:
# sigma_yml_path = os.path.join(ROOT, "data", "rvrs_sigma.yml")
sigma_rule_folder = os.path.join(ROOT, "data", "signatures")
# os.listdir returns entries in arbitrary order; sort so that index-based
# access such as sigma_rule_yamls[0] picks the same file on every run.
sigma_rule_yamls = sorted(
    os.path.join(sigma_rule_folder, f)
    for f in os.listdir(sigma_rule_folder)
    if f.endswith(".yaml")
)

# Only two of the values returned by load_data are kept (4th-from-last and
# 2nd-from-last); the remaining return values are discarded.
# NOTE(review): limit=100 is hard-coded although LIMIT is defined as None in
# the config cell — confirm which one is intended.
*_, X_train_malicious_cmd, _, X_test_malicious_cmd, _ = load_data(ROOT, seed=SEED, limit=100)

print('X_train_cmds: ', len(X_train_malicious_cmd))


X_train_cmds:  50


In [10]:
def apply_sigma(X_cmds, y, sigma_yml, verbose=False):
    """Flag each command that contains any keyword of a Sigma rule file.

    Matching is a plain, case-sensitive substring test of every entry of the
    rule's ``detection.keywords`` against each command.

    Args:
        X_cmds: Iterable of command strings to scan.
        y: Labels aligned with ``X_cmds``; only read (``y[i]``) to build the
            verbose message.
        sigma_yml: Path to a YAML rule file.
        verbose: When true, print every match together with its label.

    Returns:
        A list with one int per command: 1 if any keyword occurs in the
        command, otherwise 0.

    Raises:
        KeyError: If the rule has no ``detection`` section, or the section has
            no ``keywords`` entry (only keyword-style rules are supported).
    """
    with open(sigma_yml, "r") as f:
        # NOTE(review): yaml.load with FullLoader is kept as-is; if rule files
        # can come from an untrusted source, consider yaml.safe_load instead.
        sigma = yaml.load(f, Loader=yaml.FullLoader)

    detection = sigma['detection']
    if 'keywords' not in detection:
        raise KeyError(
            f"'keywords' not found under 'detection' in {sigma_yml}; "
            f"available detection keys: {list(detection)}"
        )

    keywords = detection['keywords']
    if keywords is None:
        # An empty `keywords:` entry loads as None; treat it as "no patterns".
        keywords = []
    elif isinstance(keywords, str):
        # A scalar keyword must not be iterated character by character.
        keywords = [keywords]
    # YAML may load bare numbers as int/float; `in` on a str needs str operands.
    patterns = [str(keyword) for keyword in keywords]

    y_pred = []
    for i, cmd in enumerate(X_cmds):
        # First keyword found in the command, or None when nothing matches.
        hit = next((pattern for pattern in patterns if pattern in cmd), None)
        if hit is None:
            y_pred.append(0)
            continue
        if verbose:
            print(f"[!] True label: {y[i]} | Found '{hit}' in '{cmd}'")
        y_pred.append(1)

    return y_pred

In [11]:
# Score the training commands against the first rule file in sigma_rule_yamls;
# every command is given the label 1 for the (disabled) verbose output.
y_train_pred = apply_sigma(
    X_cmds=X_train_malicious_cmd,
    y=[1] * len(X_train_malicious_cmd),
    sigma_yml=sigma_rule_yamls[0],
    verbose=False,
)

KeyError: 'keywords'

In [4]:
# Count positive predictions separately for samples labelled 1 and for the rest.
# NOTE(review): y_train is not defined in any cell visible in this section —
# confirm it exists before this cell under Restart & Run All.
y_train_pred_arr = np.array(y_train_pred)
mask = y_train == 1
true_positives = y_train_pred_arr[mask]
false_positives = y_train_pred_arr[~mask]
print("TP", true_positives.sum())
print("FP", false_positives.sum())

TP 8974
FP 0


In [5]:
# Summary metrics of the rule-based predictions against y_train.
tpr = get_tpr_at_fpr(y_train_pred, y_train, fprNeeded=1e-5)
f1 = f1_score(y_train, y_train_pred)
acc = accuracy_score(y_train, y_train_pred)
# Stored as `roc_auc` rather than `auc` so the `auc` function imported from
# sklearn.metrics is not overwritten by a float when this cell runs.
roc_auc = roc_auc_score(y_train, y_train_pred)

print('TPR at FPR=1e-5: {:.4f}%'.format(tpr*100))
print('F1 score: {:.4f}%'.format(f1*100))
print('Accuracy: {:.4f}%'.format(acc*100))
print('AUC: {:.4f}%'.format(roc_auc*100))

TPR at FPR=1e-5: 3.3673%
F1 score: 6.5153%
Accuracy: 51.6848%
AUC: 51.6837%
