In [63]:
import os
import time
import torch
import pickle
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, roc_auc_score, roc_curve, auc

# importing one class models -- oc svm and isolation forest
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import SGDOneClassSVM

# tokenizers
from src.preprocessors import CommandTokenizer, OneHotCustomVectorizer
from nltk.tokenize import wordpunct_tokenize


LIMIT = None
SEED = 33
ROOT = os.path.dirname(os.path.abspath('__file__'))
VOCAB_SIZE = 4096
MAX_LEN = 128

LOGS_FOLDER = "logs_one_class"
os.makedirs(LOGS_FOLDER, exist_ok=True)

TOKENIZER = wordpunct_tokenize
tokenizer = CommandTokenizer(tokenizer_fn=TOKENIZER, vocab_size=VOCAB_SIZE)

def load_data():
    train_base_parquet_file = [x for x in os.listdir(os.path.join(ROOT,'data/train_baseline.parquet/')) if x.endswith('.parquet')][0]
    test_base_parquet_file = [x for x in os.listdir(os.path.join(ROOT,'data/test_baseline.parquet/')) if x.endswith('.parquet')][0]
    train_rvrs_parquet_file = [x for x in os.listdir(os.path.join(ROOT,'data/train_rvrs.parquet/')) if x.endswith('.parquet')][0]
    test_rvrs_parquet_file = [x for x in os.listdir(os.path.join(ROOT,'data/test_rvrs.parquet/')) if x.endswith('.parquet')][0]

    # load as dataframes
    train_baseline_df = pd.read_parquet(os.path.join(ROOT,'data/train_baseline.parquet/', train_base_parquet_file))
    test_baseline_df = pd.read_parquet(os.path.join(ROOT,'data/test_baseline.parquet/', test_base_parquet_file))
    train_malicious_df = pd.read_parquet(os.path.join(ROOT,'data/train_rvrs.parquet/', train_rvrs_parquet_file))
    test_malicious_df = pd.read_parquet(os.path.join(ROOT,'data/test_rvrs.parquet/', test_rvrs_parquet_file))

    X_train_non_shuffled = train_baseline_df['cmd'].values.tolist() + train_malicious_df['cmd'].values.tolist()
    y_train = np.array([0] * len(train_baseline_df) + [1] * len(train_malicious_df), dtype=np.int8)
    X_train_cmds, y_train = shuffle(X_train_non_shuffled, y_train, random_state=SEED)

    X_test_non_shuffled = test_baseline_df['cmd'].values.tolist() + test_malicious_df['cmd'].values.tolist()
    y_test = np.array([0] * len(test_baseline_df) + [1] * len(test_malicious_df), dtype=np.int8)
    X_test_cmds, y_test = shuffle(X_test_non_shuffled, y_test, random_state=SEED)

    # ===========================================
    # DATASET LIMITS FOR TESTING
    # ===========================================
    X_train_cmds = X_train_cmds[:LIMIT]
    y_train = y_train[:LIMIT]
    
    X_test_cmds = X_test_cmds[:LIMIT]
    y_test = y_test[:LIMIT]

    return X_train_cmds, y_train, X_test_cmds, y_test

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def get_tpr_at_fpr(predicted_probs, true_labels, fprNeeded=1e-4):
    # if isinstance(predicted_logits, torch.Tensor):
    #     predicted_probs = torch.sigmoid(predicted_logits).cpu().detach().numpy()
    # else:
    #     predicted_probs = sigmoid(predicted_logits)
    
    if isinstance(true_labels, torch.Tensor):
        true_labels = true_labels.cpu().detach().numpy()
    
    fpr, tpr, thresholds = roc_curve(true_labels, predicted_probs)
    if all(np.isnan(fpr)):
        return np.nan#, np.nan
    else:
        tpr_at_fpr = tpr[fpr <= fprNeeded][-1]
        #threshold_at_fpr = thresholds[fpr <= fprNeeded][-1]
        return tpr_at_fpr#, threshold_at_fpr

X_train_cmds, y_train, X_test_cmds, y_test = load_data()
if LIMIT is not None:
    X_train_cmds = X_train_cmds[:LIMIT]
    y_train = y_train[:LIMIT]
    X_test_cmds = X_test_cmds[:LIMIT]
    y_test = y_test[:LIMIT]
    
print('X_train_cmds: ', len(X_train_cmds))
print('X_test_cmds: ', len(X_test_cmds))

X_train_cmds:  533014
X_test_cmds:  470129


### Since these are one-class models -- preserving only malicious samples in the training set

In [64]:
train_malicious_mask = y_train == 1
X_train_cmds_malicous = np.array(X_train_cmds)[train_malicious_mask]
y_train_malicous = np.array(y_train)[train_malicious_mask]

In [65]:
def fit_one_hot(X):
    oh = OneHotCustomVectorizer(tokenizer=TOKENIZER, max_features=VOCAB_SIZE)
    print("[*] Fitting One-Hot encoder...")
    now = time.time()
    oh.fit(X)
    print(f"[!] Finished fitting One-Hot encoder in {time.time() - now:.2f} seconds")
    return oh

# building one-hot encoder
oh_vectorizer_file = os.path.join(LOGS_FOLDER, f"onehot_vectorizer_{VOCAB_SIZE}_lim_{LIMIT}.pkl")
if os.path.exists(oh_vectorizer_file):
    print("[*] Loading One-Hot vectorizer...")
    oh = pickle.load(open(oh_vectorizer_file, "rb"))
else:
    oh = fit_one_hot(X_train_cmds_malicous)
    with open(oh_vectorizer_file, "wb") as f:
        pickle.dump(oh, f)


print("[*] Transforming train and test sets...")

now = time.time()
X_train_onehot_malicious = oh.transform(X_train_cmds_malicous)
print(f"[!] Finished transforming train set in {time.time() - now:.2f} seconds")

now = time.time()
X_test_onehot = oh.transform(X_test_cmds)
print(f"[!] Finished transforming test set in {time.time() - now:.2f} seconds")

print(f"[!] Shapes: X_train_onehot_malicious: {X_train_onehot_malicious.shape}, X_test_onehot: {X_test_onehot.shape}")

[*] Fitting One-Hot encoder...
[!] Finished fitting One-Hot encoder in 8.27 seconds
[*] Transforming train and test sets...
[!] Finished transforming train set in 29.07 seconds
[!] Finished transforming test set in 31.56 seconds
[!] Shapes: X_train_onehot_malicious: (266501, 4096), X_test_onehot: (470129, 4096)


In [66]:
def train_and_predict(model, X_train, X_test, y_test, name=""):
    # training
    print("[*] Training model...")
    now = time.time()
    model.fit(X_train)
    print(f"[*] Training model took {time.time() - now:.2f} seconds.")
    
    # dump model
    model_file = os.path.join(LOGS_FOLDER, f"model_{name}_{VOCAB_SIZE}_lim_{LIMIT}.pkl")
    with open(model_file, "wb") as f:
        pickle.dump(model, f)

    # predicting
    model_preds = model.predict(X_test)
    model_preds = np.array([1 if x == -1 else 0 for x in model_preds])

    # calculating metrics
    model_tpr = get_tpr_at_fpr(model_preds, y_test, fprNeeded=1e-4)
    model_f1 = f1_score(y_test, model_preds)
    model_acc = accuracy_score(y_test, model_preds)
    model_auc = roc_auc_score(y_test, model_preds)

    print('TPR at FPR=1e-4: {:.4f}%'.format(model_tpr*100))
    print('F1 score: {:.4f}%'.format(model_f1*100))
    print('Accuracy: {:.4f}%'.format(model_acc*100))
    print('AUC: {:.4f}%'.format(model_auc*100))

    return model_tpr, model_f1, model_acc, model_auc

### OneClassSVM is really slow model:
  - takes 30 sec (optimal speed, nu='0.1') / 2 mins (best scores, nu=0.5) per 50k samples 
  - takes ~2h 30 min (optimal speed, nu='0.1') / ?? h (best scores, nu=0.5) per full dataset

In [44]:
# oc svm
# for nu in [0.1, 0.3, 0.5, 0.7, 0.9]:
#     print(f"\n[!] Nu: {nu}")
#     oc_svm = OneClassSVM(kernel='rbf', gamma='auto', nu=nu)
#     _ = train_and_predict(oc_svm, X_train_onehot_malicious, X_test_onehot, y_test, name=f"oc_svm_nu_{nu}")


[!] Nu: 0.1
[*] Training model...
[*] Training model took 27.14 seconds.
TPR at FPR=1e-4: 9.7907%
F1 score: 17.8353%
Accuracy: 54.6500%
AUC: 54.8954%

[!] Nu: 0.3
[*] Training model...
[*] Training model took 68.38 seconds.
TPR at FPR=1e-4: 9.7987%
F1 score: 17.8478%
Accuracy: 54.6520%
AUC: 54.8973%

[!] Nu: 0.5
[*] Training model...
[*] Training model took 89.08 seconds.
TPR at FPR=1e-4: 0.0000%
F1 score: 51.7451%
Accuracy: 67.1780%
AUC: 67.3540%

[!] Nu: 0.7
[*] Training model...
[*] Training model took 103.54 seconds.
TPR at FPR=1e-4: 0.0000%
F1 score: 82.0666%
Accuracy: 81.5200%
AUC: 81.5058%

[!] Nu: 0.9
[*] Training model...
[*] Training model took 106.18 seconds.
TPR at FPR=1e-4: 0.0000%
F1 score: 84.1966%
Accuracy: 82.1080%
AUC: 82.0385%


### SGD implementation of OneClassSVM is much faster

In [69]:
# oc svm sgd version
for nu in [0.5, 0.9]:
    for tol in [1e-3, 1e-4, 1e-5]:
        print(f"\n[!] Nu: {nu} | tol: {tol}")
        oc_svm_sgd = SGDOneClassSVM(nu=nu, random_state=SEED, learning_rate='constant', eta0=0.1, tol=tol, max_iter=1000)
        _ = train_and_predict(oc_svm_sgd, X_train_onehot_malicious, X_test_onehot, y_test, name=f"oc_svm_sgd_nu_{nu}")


[!] Nu: 0.5
[*] Training model...


In [68]:
# isolation forest
for contamination in ["auto", 0.1, 0.3, 0.5]:
    for max_samples in ['auto', 0.01, 0.05, 0.1, 0.3]:
        print(f"\n[!] Contamination: {contamination} | Max samples: {max_samples}")
        isolation_forest = IsolationForest(n_estimators=100, max_samples=0.1, contamination=contamination, 
                                    max_features=1.0, bootstrap=False, n_jobs=-1, random_state=SEED)
        _ = train_and_predict(isolation_forest, X_train_onehot_malicious, X_test_onehot, y_test, name=f"isolation_forest_cont_{contamination}")


[!] Contamination: auto
[*] Training model...
[*] Training model took 2.92 seconds.
TPR at FPR=1e-4: 0.0000%
F1 score: 0.0000%
Accuracy: 50.0010%
AUC: 50.0000%

[!] Contamination: 0.5
[*] Training model...
[*] Training model took 11.51 seconds.
TPR at FPR=1e-4: 0.0000%
F1 score: 51.6309%
Accuracy: 67.2241%
AUC: 67.2235%
