## Imports

In [None]:
from tensorflow.keras import layers
from keras.metrics import AUC
from collections import Counter
from tensorflow import keras
from sklearn import metrics
from pathlib import Path
import tensorflow as tf
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import os

## Config

In [None]:
# NOTE: some names below are misspelled (SEGMETN, MALISIOS); they are kept
# as-is because the rest of the notebook refers to them by these names.

# Input locations. Stored as real Path objects so the values match their
# annotations and the `path: Path` parameters of the loader functions.
DATA_FOLDER: Path = Path("FraudedRawData")
LABEL_FILE: Path = Path("challengeToFill.csv")

# Model hyper-parameters.
DROPOUT_P: float = 1e-1  # dropout probability inside the transformer block
NORMALIZATION_EPSILON: float = 1e-6  # epsilon given to LayerNormalization
EMBED_DIM: int = 32  # width of the token and position embeddings
SEGMENT_TOKENS_N: int = 100  # tokens per segment (= model input length)
TRANSFORMER_HEADS: int = 5  # number of attention heads
NN_UNITS: int = 64  # hidden units of the transformer feed-forward layer

# Per-user data layout: sizes counted in segments, then converted to tokens.
TRAIN_SEGMETN_SIZE: int = 50
TEST_SEGMETN_SIZE: int = 100
TRAIN_TOKEN_N: int = SEGMENT_TOKENS_N * TRAIN_SEGMETN_SIZE
TEST_TOKEN_N: int = SEGMENT_TOKENS_N * TEST_SEGMETN_SIZE

MALISIOS_RATIO: float = 0.1  # default `ratio` used when sampling negatives
USER_FOR_TRAIN: int = 10  # number of leading users whose labels are loaded
SEED: int = 42  # seed shared by all random number generators

In [None]:
# Seed TensorFlow, Python's `random` module and NumPy's global generator
# with the same SEED value. The three calls are independent of each other.
tf.random.set_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

## Loading Data

In [None]:
def load_segments(path: Path) -> tuple[dict, list, Counter]:
    """Read every ``User*`` file in *path* and cut its tokens into segments.

    Each file is split on whitespace. The first ``TRAIN_TOKEN_N`` tokens
    become overlapping training windows of ``SEGMENT_TOKENS_N`` tokens
    (advancing one token at a time); the remaining tokens become
    consecutive, non-overlapping windows of the same length (the last one
    may be shorter if the token count is not a multiple of the window).

    Args:
        path: Directory holding the per-user token files. Only entries
            whose name starts with ``"User"`` are read; the text after that
            prefix must parse as an integer user id.

    Returns:
        A ``(users, sequences, vocab)`` tuple where ``users`` maps user id
        to ``(train_segments, test_segments)``, ``sequences`` is the flat
        list of all segments of all users, and ``vocab`` counts every token
        seen in any file.
    """
    users = {}
    sequences = []
    vocab = Counter()

    # os.listdir() yields names in arbitrary order. Sorting makes the
    # insertion order of `vocab` (and hence any token ids derived from it)
    # and of `sequences` the same on every run and platform.
    for person_file in sorted(os.listdir(path)):
        if not person_file.startswith("User"):
            continue

        with open(os.path.join(path, person_file), "r") as file:
            tokens = file.read().split()
        vocab.update(tokens)

        # Training part: sliding windows with stride 1.
        train_tokens = tokens[:TRAIN_TOKEN_N]
        train_segments = [
            train_tokens[start:start + SEGMENT_TOKENS_N]
            for start in range(len(train_tokens) - SEGMENT_TOKENS_N + 1)
        ]
        sequences.extend(train_segments)

        # Remaining part: back-to-back windows with stride SEGMENT_TOKENS_N.
        test_tokens = tokens[TRAIN_TOKEN_N:]
        test_segments = [
            test_tokens[start:start + SEGMENT_TOKENS_N]
            for start in range(0, len(test_tokens), SEGMENT_TOKENS_N)
        ]
        sequences.extend(test_segments)

        user_id = int(person_file[len("User"):])
        users[user_id] = (train_segments, test_segments)

    return users, sequences, vocab


In [None]:
def load_labels(users_size: int, path: Path) -> np.ndarray:
    """Build the per-user, per-segment label matrix from the CSV at *path*.

    The matrix has one row per user and
    ``TRAIN_SEGMETN_SIZE + TEST_SEGMETN_SIZE`` columns. Rows of the first
    ``USER_FOR_TRAIN`` users are filled from the matching CSV row (skipping
    the CSV's first column); every other row stays all zeros.
    """
    segments_per_user = TRAIN_SEGMETN_SIZE + TEST_SEGMETN_SIZE
    labels = np.zeros((users_size, segments_per_user))

    with open(path, "r") as csvfile:
        frame = pd.read_csv(csvfile)
        # Only users below USER_FOR_TRAIN have labels copied from the file.
        labelled_rows = min(users_size, USER_FOR_TRAIN)
        for row in range(labelled_rows):
            labels[row, :] = frame.iloc[row, 1:].to_numpy(dtype=int)

    return labels

In [None]:
# Load the per-user segments, the flat list of all segments and the token
# counts from DATA_FOLDER, then build the label matrix with one row per
# loaded user from LABEL_FILE.
users, sequences, vocab = load_segments(DATA_FOLDER)
labels = load_labels(len(users), LABEL_FILE)

## Model

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention sub-layer followed by a two-layer feed-forward sub-layer.

    Each sub-layer's output goes through dropout, is added to the
    sub-layer's input and is then layer-normalised.

    Args:
        embed_dim: Size of the last axis of the inputs; also used as the
            attention ``key_dim`` and as the feed-forward output width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout probability applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=NORMALIZATION_EPSILON)
        self.layernorm2 = layers.LayerNormalization(epsilon=NORMALIZATION_EPSILON)
        # Honour the `rate` argument; it used to be accepted but ignored in
        # favour of a module-level constant.
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to *inputs*.

        ``training`` is forwarded to both dropout layers; leaving it as
        ``None`` lets Keras choose the mode from the calling context.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can embed.
        vocab_size: Number of distinct token ids the token table can embed.
        embed_dim: Output width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids *x* and add the embedding of each position."""
        # Positions are 0 .. (size of x's last axis - 1).
        sequence_length = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=sequence_length, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
def create_model(vocab_size, *, training: "bool | None" = None) -> keras.Model:
    """Build the per-user segment classifier.

    Architecture: input of ``SEGMENT_TOKENS_N`` token ids -> token+position
    embedding -> one transformer block -> global average pooling over the
    sequence axis -> 2-way softmax.

    Args:
        vocab_size: Number of distinct token ids the embedding must cover.
        training: Mode passed to the transformer block when the graph is
            built. ``None`` (default) lets Keras switch dropout on during
            ``fit`` and off during ``predict``/``evaluate``. Passing
            ``True``/``False`` fixes that mode into the graph; the former
            default of ``True`` left dropout active at prediction time.

    Returns:
        The uncompiled ``keras.Model``.
    """
    # `inputs` rather than `input`, to avoid shadowing the builtin.
    inputs = keras.Input(shape=(SEGMENT_TOKENS_N,))
    emb = TokenAndPositionEmbedding(SEGMENT_TOKENS_N, vocab_size, EMBED_DIM)(inputs)
    transformer = TransformerBlock(EMBED_DIM, TRANSFORMER_HEADS, NN_UNITS, DROPOUT_P)(
        emb, training=training
    )
    avg = layers.GlobalAveragePooling1D()(transformer)
    out = layers.Dense(2, activation="softmax")(avg)
    return keras.Model(inputs=inputs, outputs=out)

## Feature Extraction

In [None]:
# Give every distinct token an integer id, numbered from 0 in the order the
# tokens are yielded when iterating `vocab`.
word_index = dict(zip(vocab, range(len(vocab))))
vocab_size = len(word_index)

In [None]:
def features(segment):
    """Return the list of `word_index` ids for the tokens of *segment*.

    Raises ``KeyError`` for a token that is not in `word_index`.
    """
    return list(map(word_index.__getitem__, segment))

In [None]:
def generate_negative_samples(data: dict, _id: int, ratio: float = MALISIOS_RATIO, n_others: int = 10) -> tuple[list, list]:
    """Borrow leading samples of other users as negative examples for `_id`.

    Args:
        data: Mapping ``user_id -> {"X": samples, ...}``; the ids are
            expected to be ``0 .. len(data) - 1``.
        _id: User the negatives are generated for; never used as a donor.
        ratio: Samples taken from each donor, as a fraction of the number
            of samples user `_id` has.
        n_others: Number of distinct donor users, drawn at random without
            replacement. Capped at the number of other users available.

    Returns:
        ``(X, y)`` as two lists: the borrowed samples and one ``[0, 1]``
        label per borrowed sample.
    """
    # max(0, ...) keeps a negative ratio from turning the slice below into
    # "all but the last k" samples.
    n_fake_samples = max(0, round(len(data[_id]["X"]) * ratio))
    other_user_ids = np.hstack([np.arange(_id), np.arange(_id + 1, len(data))])
    # Sampling without replacement cannot draw more donors than exist.
    n_others = min(n_others, len(other_user_ids))
    donor_ids = np.random.choice(other_user_ids, n_others, replace=False)

    X: list = []
    for uid in donor_ids:
        # Always the donor's first samples, in their stored order.
        X.extend(data[uid]["X"][:n_fake_samples])
    y: list = [[0, 1] for _ in X]

    return X, y

In [None]:
def create_train_test_set(users: dict, labels: np.ndarray, ratio: float = MALISIOS_RATIO, n_others: int = 10):
    """Turn raw segments into per-user training and test arrays.

    For every user id ``0 .. len(users) - 1`` the result holds:

    * ``"X"`` / ``"y"``: the user's encoded training segments labelled
      ``[1, 0]``, followed by negative samples from
      ``generate_negative_samples`` labelled ``[0, 1]``;
    * ``"X_test"`` / ``"y_test"``: the encoded test segments and their
      labels, read from ``labels`` at column ``TRAIN_SEGMETN_SIZE + i``.

    All four entries are converted to NumPy arrays before returning.
    """
    users_data = {}

    # Pass 1: encode each user's own segments.
    for user_id in range(len(users)):
        train_segments, test_segments = users[user_id]
        entry = {"X": [], "y": [], "X_test": [], "y_test": []}
        users_data[user_id] = entry

        for segment in train_segments:
            entry["X"].append(features(segment))
            entry["y"].append([1, 0])

        for seg_id, segment in enumerate(test_segments):
            entry["X_test"].append(features(segment))
            entry["y_test"].append(labels[user_id][TRAIN_SEGMETN_SIZE + seg_id])

    # Pass 2: append negatives user by user, then freeze into arrays.
    for user_id in range(len(users_data)):
        entry = users_data[user_id]
        X_fake, y_fake = generate_negative_samples(users_data, user_id, ratio, n_others)
        entry["X"] += X_fake
        entry["y"] += y_fake
        for key in ("X", "y", "X_test", "y_test"):
            entry[key] = np.asarray(entry[key])

    return users_data

In [None]:
# Build the per-user train/test arrays using the default `ratio` and
# `n_others` of create_train_test_set.
users_data = create_train_test_set(users, labels)

## Train

In [None]:
# Train one independent model per user and store the model together with
# its train/test predictions back into that user's entry of `users_data`.
# Iterating the dict itself replaces the former hard-coded range(40).
for user_id, user in users_data.items():
    model = create_model(vocab_size)
    model.compile("adam", "CategoricalCrossentropy", metrics=["accuracy", AUC(name="auc")])
    user["Model"] = model
    # fit
    model.fit(user["X"], user["y"], batch_size=128, epochs=1)
    # predict probs
    user["pred_test_prob"] = model.predict(user["X_test"])
    user["pred_train_prob"] = model.predict(user["X"])
    # convert to classes
    user["pred_test"] = np.argmax(user["pred_test_prob"], axis=-1)
    user["pred_train"] = np.argmax(user["pred_train_prob"], axis=-1)
    # eval on all training samples for final metrics score
    y_true = user["y"][:, 1]
    acc = metrics.accuracy_score(y_true, user["pred_train"])
    # The ROC curve needs continuous scores; hard 0/1 predictions collapse
    # it to a single operating point, so use the class-1 probability.
    fpr, tpr, threshold = metrics.roc_curve(y_true, user["pred_train_prob"][:, 1])
    roc_auc = metrics.auc(fpr, tpr)
    print(f"metrics[Accuracy={round(acc,3)}, AUC={round(roc_auc,3)}]")


In [None]:
def smooth(arr, n=10):
    """Binarise *arr*: 1 for the `n` largest values, 0 for everything else.

    Despite its name the function does no smoothing; it thresholds at the
    n-th largest value, so values tied with that threshold are also set
    to 1.

    Args:
        arr: One-dimensional array of scores. It is not modified.
        n: How many top values to mark. If `n` exceeds the length of
            `arr`, every element is marked; if `n <= 0`, none is.

    Returns:
        A new array with the shape and dtype of `arr`, holding only 0 and 1.
        An empty input gives an empty output.
    """
    arr = np.asarray(arr)
    smoothed_arr = np.zeros_like(arr)
    # Guard the cases where indexing the sorted array would be wrong:
    # an empty array has no element, and `[-0]` would select the minimum.
    if arr.size == 0 or n <= 0:
        return smoothed_arr

    threshold_value = np.sort(arr)[-n] if n <= arr.size else arr.min()
    smoothed_arr[arr >= threshold_value] = 1
    return smoothed_arr

In [None]:
def _weighted_score(predictions, y_true):
    """Return ``(score, malicious_found, benign_found)`` for one user.

    A correct prediction on a segment labelled 1 is worth 9 points, a
    correct prediction on a segment labelled 0 is worth 1 point.
    """
    correct = predictions == y_true
    malicious_found = (correct & (y_true == 1)).sum()
    benign_found = (correct & (y_true == 0)).sum()
    return malicious_found * 9 + benign_found, malicious_found, benign_found


def test_and_validation(users_data):
    """Score the labelled users and collect predictions for the others.

    The first ``USER_FOR_TRAIN`` users have known test labels: their
    predictions are scored (raw and after ``smooth``) and per-user metrics
    are printed. For every remaining user the predicted classes are stored
    and the number of predicted 1s is printed.

    Args:
        users_data: Mapping ``user_id -> dict`` with ids
            ``0 .. len(users_data) - 1``. Every entry needs ``"pred_test"``;
            labelled users also need ``"pred_test_prob"`` and ``"y_test"``.

    Returns:
        Integer array of shape
        ``(len(users_data) - USER_FOR_TRAIN, TEST_SEGMETN_SIZE)`` with the
        predicted class of every test segment of the unlabelled users.
    """
    n_users = len(users_data)
    n_labelled = min(USER_FOR_TRAIN, n_users)

    # Best achievable score, assuming each labelled user has 90 segments
    # labelled 0 (1 point each) and 10 labelled 1 (9 points each).
    max_score_per_user = (
        90 * 1 + 10 * 9
    )
    max_score = max_score_per_user * n_labelled
    score_normalizer = 100
    # One column per test *segment* (TEST_SEGMETN_SIZE), not per token.
    result = np.zeros((n_users - n_labelled, TEST_SEGMETN_SIZE), dtype=int)
    res_test_set = np.zeros((n_labelled, TEST_SEGMETN_SIZE), dtype=int)
    test_score = 0
    test_score_smooth = 0

    progress_bar = tqdm(range(n_users), desc="Evaluating Users")

    for user_id in progress_bar:
        user = users_data[user_id]
        predictions = user["pred_test"]

        if user_id >= n_labelled:
            # No ground truth here: just record the predictions.
            n_fakes = predictions.sum()
            result[user_id - n_labelled,] = predictions.reshape(-1)
            data = {"User": user_id, "Fakes Found": n_fakes}
            progress_bar.set_postfix(data)
            print(data)
            continue

        y_test = user["y_test"]
        malicious_prob = user["pred_test_prob"][:, 1]

        user_score, malicious_found, benign_found = _weighted_score(predictions, y_test)
        user_score_smooth, _, _ = _weighted_score(smooth(malicious_prob), y_test)

        acc = metrics.accuracy_score(y_test, predictions)
        # ROC needs continuous scores; hard 0/1 predictions would reduce
        # the curve to a single operating point.
        fpr, tpr, _ = metrics.roc_curve(y_test, malicious_prob)
        roc_auc = metrics.auc(fpr, tpr)

        test_score += user_score
        test_score_smooth += user_score_smooth
        res_test_set[user_id,] = predictions.reshape(-1)
        # Update progress bar with current user metrics
        data = {
            "User": user_id,
            "Accuracy": round(acc, 3),
            "AUC": round(roc_auc, 3),
            "Score": user_score,
            "SmoothScore": user_score_smooth,
            "maliciousFound": f"{malicious_found * 10}%",
            "BenignFound": f"{benign_found}%",
        }
        progress_bar.set_postfix(data)
        print(data)

    # Summary print statements
    print(
        f"Train Score:{round(test_score / max_score, 3)}% | Normalized = { min(1, round(test_score / score_normalizer, 3))}%"
    )
    # The smoothed total used to be accumulated but never reported.
    print(f"Smoothed Train Score:{round(test_score_smooth / max_score, 3)}")
    print(
        f"Test Fakes:{round(res_test_set.sum() / 100, 3)}%)"
    )
    return np.asarray(result, dtype=int)

In [None]:
# Print the evaluation of the labelled users and keep the predicted classes
# returned for the remaining users; the Prediction cell below writes them
# into the label file.
result = test_and_validation(users_data)

## Prediction

In [None]:
label_df: pd.DataFrame = pd.read_csv(LABEL_FILE)
# Fill the test-segment cells of the users without labels. Column 0 is
# skipped throughout the notebook (presumably the user id - confirm), so
# test segments start at column 1 + TRAIN_SEGMETN_SIZE.
label_df.iloc[USER_FOR_TRAIN:, 1 + TRAIN_SEGMETN_SIZE:] = result
# Replace the label columns by integer copies. Assigning whole columns
# (rather than through .iloc) lets their dtype actually become int.
value_columns = label_df.columns[1:]
label_df[value_columns] = label_df[value_columns].astype(int)
# index=False: writing the index would add an extra leading column to
# LABEL_FILE on every run and shift the columns this notebook reads by
# position.
label_df.to_csv(LABEL_FILE, index=False)
label_df