# Bash Script coomand Classification

In [None]:
# Global Config
LABEL_PATH: str = "./label_data.csv"
USER_FILE_REGEX: str = "./FraudedRawData/User*"
ATTACK_SAMPLE_RATIO: float = 0.1
NORMALIZATION_EPSILON: float = 1e-6
DROPOUT: float = 0.1
SEGMENT_TOKEN: int = 100
EMBEDDING: int = 32
BATCH_SIZE: int = 128
EPOCHS: int = 5
HEADS: int = 5
NN_UNIT: int = 64
SEED: int = 42

In [None]:
from tensorflow.keras import layers
from tensorflow import keras
import tensorflow as tf
import numpy as np
from more_itertools import chunked
from keras.metrics import AUC
from sklearn.metrics import confusion_matrix
from functional import seq
from pathlib import Path
import pandas as pd
import collections
import glob
import re

In [None]:
np.random.seed(SEED)

### Helper Functions

In [None]:
def load_segment_data() -> pd.DataFrame:
    def get_file_conetent(file: Path):
        with open(file, "r") as f:
            return seq(f.readlines()).map(lambda line: line.strip()).to_list()

    def split_into_segments(content: list[str], *, segment_size: int = SEGMENT_TOKEN):
        return list(chunked(content, segment_size))

    def join_segments(content: list[list[str]]):
        return seq(content).map(lambda line: " ".join(line))

    files_paths: list[Path] = glob.glob(USER_FILE_REGEX)
    files: list[list[str]] = (
        seq(files_paths)
        .map(get_file_conetent)
        .map(split_into_segments)
        .map(join_segments)
    )
    files = seq(files_paths).map(lambda s: s.split("/")[-1]).zip(files).to_dict()
    segment_df: pd.DataFrame = pd.DataFrame.from_dict(files).transpose()
    segment_df.sort_index(inplace=True)
    segment_df = segment_df.reset_index()
    segment_df = segment_df.melt(
        id_vars="index", var_name="SegmentIndex", value_name="SegmentText"
    )
    segment_df.rename(columns={"index": "Id"}, inplace=True)
    return segment_df

In [None]:
def load_labels() -> pd.DataFrame:
    def extract_index(col_name):
        match = re.match(r"(\d+)-(\d+)", col_name)
        if match:
            return int(int(match.group(1)) / 100)
        return col_name

    label_df: pd.DataFrame = pd.read_csv(LABEL_PATH)
    label_df.set_index("Unnamed: 0", inplace=True)
    label_df.index.name = None
    label_df.rename(columns=lambda x: extract_index(x), inplace=True)
    label_df = label_df.astype(float)
    label_df.head()
    label_df = label_df.reset_index()
    label_df = label_df.melt(
        id_vars="index", var_name="SegmentIndex", value_name="Label"
    )
    label_df.rename(columns={"index": "Id"}, inplace=True)
    return label_df

In [None]:
def extract_features(vocab: dict[str, int], text: str):
    return np.array([vocab[word] for word in text.split(" ")])

In [None]:
def create_vocab(df: pd.DataFrame) -> dict:
    vocab_counter = collections.Counter()

    def count_words(text):
        tokens = [word for word in text.split(" ")]
        return collections.Counter(tokens)

    for segment in df["SegmentText"]:
        vocab_counter.update(count_words(segment))
    return dict(map(lambda x: (x[1], x[0]), enumerate(vocab_counter)))

In [None]:
def user_train_df(
    df: pd.DataFrame, *, id: int, ratio: float = ATTACK_SAMPLE_RATIO
) -> tuple[np.ndarray, np.ndarray]:
    user_data = df[df["Id"] == id]
    other_users_data = df[df["Id"] != id].sample(int(len(user_data) * ratio))
    other_users_data["Label"] = 1
    data = pd.concat([user_data, other_users_data], axis=0)
    X = np.vstack(data["Features"].values)
    labels = np.vstack(data["Label"]).astype(int)
    y = np.zeros((labels.shape[0], 2))
    y[np.arange(labels.shape[0]), labels.flatten()] = 1
    return X, y

In [None]:
def user_validate_df(df: pd.DataFrame, *, id: int) -> tuple[np.ndarray, np.ndarray]:
    data = df[df["Id"] == id]
    X = np.vstack(data["Features"].values)
    labels = np.vstack(data["Label"]).astype(int)
    y = np.zeros((labels.shape[0], 2))
    y[np.arange(labels.shape[0]), labels.flatten()] = 1
    return X, y

## Data - Process & Load

In [None]:
segments: pd.DataFrame = load_segment_data()
labels: pd.DataFrame = load_labels()
df = pd.merge(
    segments,
    labels,
    on=["Id", "SegmentIndex"],
    how="left",
)
df["Id"] = df["Id"].str.extract(r"User(\d+)").astype(int)
df = df.sort_values(by=["SegmentIndex", "Id"], ascending=[True, True])
df.head()

In [None]:
vocab_embeder: dict[str, int] = create_vocab(df)
df["Features"] = df["SegmentText"].apply(lambda x: extract_features(vocab_embeder, x))
df.head()

In [74]:
has_label: pd.Series = df["Label"].notna()
for_validation: pd.Series = df["SegmentIndex"] >= 50

validation_df = df[(has_label) & (for_validation)]
train_df = df[(has_label) & (~for_validation)]
test_df = df[~has_label]

train_df.shape, validation_df.shape, test_df.shape

((2000, 5), (1000, 5), (3000, 5))

## Model

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=DROPOUT):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=NORMALIZATION_EPSILON)
        self.layernorm2 = layers.LayerNormalization(epsilon=NORMALIZATION_EPSILON)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [None]:
def create_model(vocab_size, *, training: bool = True) -> keras.Model:
    input = keras.Input(shape=(SEGMENT_TOKEN,))
    emb = TokenAndPositionEmbedding(SEGMENT_TOKEN, vocab_size, EMBEDDING)(input)
    transformer = TransformerBlock(EMBEDDING, HEADS, NN_UNIT)(emb, training=training)
    avg = layers.GlobalAveragePooling1D()(transformer)
    out = layers.Dense(2, activation="softmax")(avg)
    model = keras.Model(inputs=input, outputs=out)
    return model

## Metrics

## Train

In [57]:
models = collections.defaultdict(lambda: create_model(len(vocab_embeder)))
for id in range(5): #df["Id"].unique():
    model = models[id]
    print(20*"=", f"User-{id}" ,20*"=")
    X, y = user_train_df(train_df, id=id)
    model.compile("adam", "CategoricalCrossentropy", metrics=["accuracy",AUC(name="auc")])
    model.fit(X, y, batch_size=BATCH_SIZE, epochs=1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.8364 - auc: 0.8298 - loss: 0.5515
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.3091 - auc: 0.1949 - loss: 0.8480
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.1273 - auc: 0.0605 - loss: 1.4451
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.2727 - auc: 0.3041 - loss: 0.7442
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.9091 - auc: 0.9045 - loss: 0.4314


## Evaluation

In [72]:
def smooth(arr, n=10):
    # Sort the input array indices based on values
    sorted_indices = np.argsort(arr)
    
    # Sort the array based on indices
    sorted_arr = arr[sorted_indices]
    
    # Get the threshold value for the top n segments
    threshold = sorted_arr[-n]
    
    # Create a result array initialized with zeros
    res_arr = np.zeros_like(arr)
    
    # Assign 1 to segments with values >= threshold
    res_arr[arr >= threshold] = 1
    
    return res_arr

In [73]:
for id in range(5):
    print(20*"=", f"User-{id}" ,20*"=")
    model = models[id]
    X_test,y_test = user_validate_df(validation_df, id=id)
    smooth_pred = smooth(model.predict(X_test)[:, 1])
    cm = confusion_matrix(y_test[:, 1], smooth_pred)
    tp = cm[1, 1] / y_test.shape[0]  # True positives (correct masqueraded segments)
    tn = cm[0, 0] / y_test.shape[0]  # True negatives (correct benign segments)
    fp = cm[0, 1] / y_test.shape[0]  # False positives (misclassified as masqueraded)
    fn = cm[1, 0] / y_test.shape[0] 
    print(f"{tp=}, {tn=}")

    # confusion_matrix
    # tp = ((smooth_pred == 1) == (y_test[:,1] == 1)).sum() / y_test.shape[0]
    # tn = ((smooth_pred == 0) == (y_test[:,1] == 0)).sum() / y_test.shape[0]
    # fp = ((smooth_pred == 0) == (y_test[:,1] == 0)).sum() / y_test.shape[0]
    # print(tp)
    # print(tn)
                         
    # score = 
    # print(f"Test Loss: {loss:.4f}")
    # print(f"Test Accuracy: {accuracy:.4f}")
    pass

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
tp=0.02, tn=0.82
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
tp=0.05, tn=0.85
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
tp=0.07, tn=0.87
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
tp=0.08, tn=0.88
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
tp=0.06, tn=0.86


User Data is saved inside FraudedRawData. Loading data happend into simple ways. First we load the data inself and create table for each user [Id, SegId ,SegmentText, Label]

In [60]:
SEED = 23
LABEL_PATH: str = "./label_data.csv"
USER_FILE_REGEX: str = "./FraudedRawData/User*"
ATTACK_SAMPLE_RATIO: float = 0.1

In [61]:
from more_itertools import chunked
from functional import seq
from pathlib import Path
import pandas as pd
import glob


def get_file_conetent(file: Path):
    with open(file, "r") as f:
        return seq(f.readlines()).map(lambda line: line.strip()).to_list()


def split_into_segments(content: list[str], *, segment_size: int = 100):
    return list(chunked(content, segment_size))


def join_segments(content: list[list[str]]):
    return seq(content).map(lambda line: " ".join(line))


# Find all user Files

files_paths: list[Path] = glob.glob(USER_FILE_REGEX)
files: list[list[str]] = (
    seq(files_paths).map(get_file_conetent).map(split_into_segments).map(join_segments)
)
files = seq(files_paths).map(lambda s: s.split("/")[-1]).zip(files).to_dict()
segment_df: pd.DataFrame = pd.DataFrame.from_dict(files).transpose()
segment_df.sort_index(inplace=True)
segment_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
User0,cat nawk nawk uname pwd echo echo ksh uname st...,xgvis ls ls sh sh xgvis sh sh xgvis Sqpe sendm...,uname pwd echo echo ksh ls sendmail movemail m...,mywsh mywsh xset cat nawk nawk uname pwd echo ...,led uname uname pwd echo echo ksh ls ksh ls ls...,ul sh man man col col neqn nroff xwsh ksh move...,sh ls sh sh sh xgvis sh sh xgvis rm sh ls sh s...,sh egrep sed sh sed sh sh sh sed sh sed sh sh ...,help sh less sh less sh less rm sh sh find cat...,rm sh sh find cat sed help sh less rm sh sh sh...,...,sendmail ksh cat more sendmail sendmail sendma...,true grep date lp find tail ls sed FIFO cat ge...,awk cat post rm generic ln ln generic lp sh ge...,sendmail sendmail sendmail sh MediaMai sendmai...,hostname id nawk getopt true true true grep da...,nawk getopt true true grep date lp find expr g...,generic gethost download enscript ksh hostname...,sed FIFO cat generic ls generic cat generic ls...,ls acroread acroread acroread expr cat acrorea...,ksh ksh nawk sendmail deroff sort spell spell ...
User1,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...,id nawk getopt true true grep date lp find mkd...,find mkdir expr generic cat file ppost awk ppo...,sh MediaMai sendmail emacs-20 ls hostname id n...,generic generic date generic gethost download ...,tcpostio tcpostio tcpostio cat generic ls gene...,id nawk getopt true grep date lp find mkdir ex...,netscape mkpts hostname stty .java_wr expr exp...,expr expr dirname basename egrep egrep egrep e...,egrep egrep egrep expr expr expr dirname java ...,...,ps ps grep ps grep grep ps grep grep ps grep g...,tcsh make tcsh hostname stty fec driver tcsh m...,MediaMai hostname stty hostname stty telnet te...,tcsh make tcsh hostname stty fec driver tcsh m...,make tcsh hostname stty fec be driver tcsh ld_...,tcsh xterm emacs-20 netscape netscape cat mail...,tail ls sed FIFO generic hostname id nawk geto...,netscape netscape hostname id nawk getopt true...,id nawk getopt true true grep date lp find tai...,LOCK true ls sed FIFO cat date generic generic...
User10,cpp sh xrdb cpp sh xrdb mkpts hostname env csh...,netscape netscape rlogin rlogin tput movemail ...,tset launchef sh launchef movemail movemail la...,UNLOCK rmdir generic tektroni sh LOCK hostname...,csh virtex virtex virtex virtex virtex virtex ...,awk cat post rm generic ln ln generic lp getpg...,ghostvie hostname id nawk getopt true true tru...,FIFO cat date generic generic date generic dow...,stty tset resize movemail movemail sendmail se...,gzip sh sh hostname id nawk getopt true grep d...,...,more rm ls more ex hostname id nawk getopt tru...,movemail movemail movemail movemail movemail m...,env csh csh csh userenv sh csh kill wait4wm xh...,movemail movemail movemail sendmail sendmail s...,tellwm xprop endsessi xdm 4Dwm toolches xclock...,xdm toolches 4Dwm cpp sh xrdb cpp sh xrdb mkpt...,hostname tset hostname date env tcsh tcsh tcsh...,getopt true true grep date lp find expr generi...,userenv wait4wm xhost xsetroot reaper cat mail...,hostname cat mail csh hostname stty tset rlogi...
User11,touch touch cat ls sed ln rm sed ln rm chmod s...,hostname tty arch hostname tset arch stty ksh ...,sh gettxt hostname gettxt gettxt gettxt xconfi...,more more ls ls more ls ls cat more col sh col...,hostname arch cat tset tty stty ksh hostname a...,tty hostname hostname hostname arch arch arch ...,launchef sh sh rm MediaMai launchef launchef s...,endsessi xclock xbiff xclock xclock 4Dwm xcloc...,launchef launchef sh faces launchef launchef s...,.xinitrc hostname tty hostname arch hostname w...,...,rm ksh cat tty hostname arch tset stty ksh lau...,endsessi .xinitrc hostname tty hostname arch h...,MediaMai telnet rm ksh xterm netscape netscape...,netscape netscape launchef launchef sh netstat...,file post awk cat post rm generic ln ln generi...,generic ln ln generic lp getpgrp LOCK true ls ...,whoami .xinitrc cpp sh xrdb cat tty hostname a...,emacs-20 ksh uname nawk cpp cc1 gcc gcc ls a.o...,cpp cc1 as ld_ nm ld gcc gcc a.out emacs-20 un...,cpp cc1 as ld_ nm ld gcc gcc a.out emacs-20 un...
User12,cpp sh xrdb mkpts test [ stty tset [ uname env...,hostname [ cat [ stty tset [ uname mail mail m...,date lp find mkdir expr generic cat file ppost...,generic generic date generic download gethost ...,xconfirm endsessi tellwm tellwm xprop endsessi...,cat date generic generic date generic download...,ln ln generic lp getpgrp LOCK LOCK generic tcp...,sed FIFO cat date generic generic date generic...,ls sed FIFO rm UNLOCK rmdir generic tcppost sh...,hostname id nawk getopt true grep date lp find...,...,LOCK true ls sed FIFO cat date generic generic...,tcpostio tcpostio tcpostio cat generic ls gene...,[ cat [ stty tset [ uname mail emacs-20 hostna...,netscape netscape netscape netscape netscape n...,rm generic ln ln generic lp sh getpgrp LOCK tr...,sh xrdb mkpts test [ stty tset [ uname env ech...,generic date generic rm ls sed FIFO rm UNLOCK ...,hostname tset hostname date launchef sh launch...,mp cat file post awk cat post rm generic ln ln...,ls sed FIFO cat date generic generic date gene...


Now we need to flattend the df for simple table.

In [62]:
segment_df = segment_df.reset_index()
segment_df = segment_df.melt(
    id_vars="index", var_name="SegmentIndex", value_name="SegmentText"
)
segment_df.rename(columns={"index": "Id"}, inplace=True)
segment_df.head()

Unnamed: 0,Id,SegmentIndex,SegmentText
0,User0,0,cat nawk nawk uname pwd echo echo ksh uname st...
1,User1,0,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...
2,User10,0,cpp sh xrdb cpp sh xrdb mkpts hostname env csh...
3,User11,0,touch touch cat ls sed ln rm sed ln rm chmod s...
4,User12,0,cpp sh xrdb mkpts test [ stty tset [ uname env...


### Load Label 
combine

In [63]:
import re


def extract_index(col_name):
    match = re.match(r"(\d+)-(\d+)", col_name)
    if match:
        return int(int(match.group(1)) / 100)
    return col_name


label_df: pd.DataFrame = pd.read_csv(LABEL_PATH)
label_df.set_index("Unnamed: 0", inplace=True)
label_df.index.name = None
label_df.rename(columns=lambda x: extract_index(x), inplace=True)
label_df = label_df.astype(float)
label_df.head()
label_df = label_df.reset_index()
label_df = label_df.melt(id_vars="index", var_name="SegmentIndex", value_name="Label")
label_df.rename(columns={"index": "Id"}, inplace=True)
label_df.head()

Unnamed: 0,Id,SegmentIndex,Label
0,User0,0,0.0
1,User1,0,0.0
2,User2,0,0.0
3,User3,0,0.0
4,User4,0,0.0


In [64]:
label_df = label_df.reset_index()
label_df = label_df.melt(id_vars="index", var_name="SegmentIndex", value_name="Label")
label_df.rename(columns={"index": "Id"}, inplace=True)
label_df.head()

ValueError: value_name (Label) cannot match an element in the DataFrame columns.

In [None]:
df = pd.merge(
    segment_df,
    label_df,
    on=["Id", "SegmentIndex"],
    how="left",
)
df["Id"] = df["Id"].str.extract(r"User(\d+)").astype(int)
df = df.sort_values(by=["SegmentIndex", "Id"], ascending=[True, True])
df.head()

In [None]:
import collections
import numpy as np

vocab_counter = collections.Counter()


def count_words(text):
    tokens = [word for word in text.split(" ")]
    return collections.Counter(tokens)


def convert_to_features(text: str):
    return np.array([vocab_indexer[word] for word in text.split(" ")])


for segment in df["SegmentText"]:
    vocab_counter.update(count_words(segment))
vocab_indexer = dict(map(lambda x: (x[1], x[0]), enumerate(vocab_counter)))
df["Features"] = df["SegmentText"].apply(convert_to_features)
df.head()

In [None]:
vocab_indexer

Divide into Train & Validation

In [None]:
has_label: pd.Series = df["Label"].notna()
for_validation: pd.Series = df["SegmentIndex"] >= 50

validation_df = df[(has_label) & (for_validation)]
train_df = df[(has_label) & (~for_validation)]
test_df = df[~has_label]

train_df.shape, validation_df.shape, test_df.shape

In [None]:
train_df.head()

## Model

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = nn.MultiheadAttention(embed_dim, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim), nn.ReLU(), nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, inputs, training=True):
        attn_output, _ = self.att(inputs, inputs, inputs)
        attn_output = self.dropout1(attn_output) if training else attn_output
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output) if training else ffn_output
        return self.layernorm2(out1 + ffn_output)


class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        maxlen = x.size(-1)
        positions = torch.arange(0, maxlen, device=x.device).unsqueeze(0).expand_as(x)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


class TransformerModel(nn.Module):
    def __init__(self, maxlen, vocab_size, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerModel, self).__init__()
        self.embedding = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
        self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim, rate)
        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(embed_dim, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.transformer_block(
            x.permute(1, 0, 2)
        )  # Permute to (seq_len, batch, embed_dim)
        x = x.permute(1, 2, 0)  # Permute to (batch, embed_dim, seq_len)
        x = self.global_avg_pool(x).squeeze(-1)  # Global average pooling
        x = self.fc(x)
        x = self.softmax(x)
        return x


class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [None]:
def user_train_df(train_df, *, id: int) -> tuple[np.ndarray, np.ndarray]:
    user_data = train_df[train_df["Id"] == id]
    other_users_data = train_df[train_df["Id"] != id].sample(int(len(user_data) * 0.1))
    other_users_data["Label"] = 1
    data = pd.concat([user_data, other_users_data], axis=0)
    X = torch.from_numpy(np.vstack(data["Features"].values))
    labels = np.vstack(data["Label"]).astype(int)
    y = np.zeros((labels.shape[0], 2))
    y[np.arange(labels.shape[0]), labels.flatten()] = 1
    return X, y

In [None]:
def train_model(id: int, n_epochs: int = 1, batch_size: int = 128):
    model = TransformerModel(
        maxlen=100, vocab_size=len(vocab_indexer), embed_dim=32, num_heads=2, ff_dim=64
    )
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
    # samplex
    user_data = train_df[train_df["Id"] == id]
    other_users_data = train_df[train_df["Id"] != id].sample(int(len(user_data) * 0.1))
    other_users_data["Label"] = 1
    data = pd.concat([user_data, other_users_data], axis=0)
    X = torch.from_numpy(np.vstack(data["Features"].values))
    labels = np.vstack(data["Label"]).astype(int)
    y = np.zeros((labels.shape[0], 2))
    y[np.arange(labels.shape[0]), labels.flatten()] = 1
    #
    dataset = MyDataset(X, y)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    #
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0.0
        pred_train = []
        y_true = []
        for batch in dataloader:
            inputs, labels = batch
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        total_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        pred_train.extend(predicted.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

        y_true = np.array(y_true)[:, 1]
        pred_train = np.array(pred_train)
        y_true = np.array(y_true).flatten()
        pred_train = np.array(pred_train)
    acc = metrics.accuracy_score(y_true, pred_train)
    fpr, tpr, _ = metrics.roc_curve(y_true, pred_train)
    roc_auc = metrics.auc(fpr, tpr)
    print(f"User: {id}, Accuracy: {acc}, AUC: {roc_auc}")
    return model

In [None]:
for user in range(40):
    train_model(user)

In [None]:
# from IPython.display import Image
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
for user in range(10):
    train_model(0)

In [None]:
def train_on_user(df: pd.DataFrame, *, id: int, ratio: float = ATTACK_SAMPLE_RATIO):
    model = TransformerModel(
        maxlen=100, vocab_size=len(vocab_indexer), embed_dim=32, num_heads=4, ff_dim=64
    )
    # samplex
    user_data = df[df["Id"] == id]
    other_users_data = df[df["Id"] != id].sample(int(len(user_data) * ratio))

    pass

In [None]:
train_on_user(train_df, id=0)

In [None]:
from more_itertools import chunked
from functional import seq
from pathlib import Path
import pandas as pd
import glob
import random
import numpy as np
from toolz import curry
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
)

In [None]:
SEED = 23
LABEL_PATH: Path = "./label_data.csv"
DATA_REGEX: str = "./FraudedRawData/User*"

In [None]:
random.seed(SEED)
np.random.seed(SEED)

## Utils

### Plot

In [None]:
def dict_to_bar_plot(data, title: str):
    """
    Takes a dictionary as input and creates a bar plot.

    Parameters:
    data (dict): A dictionary where keys are categories and values are numerical data.

    Returns:
    None
    """
    keys = list(data.keys())
    values = list(data.values())

    plt.figure(figsize=(10, 5))
    plt.bar(keys, values, color="skyblue")
    plt.xlabel("Bash Command")
    plt.ylabel("Frequence")
    plt.title(title)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

### Transforms

In [None]:
def split_into_segments(content: list[str], *, segment_size: int = 100):
    return list(chunked(content, segment_size))

In [None]:
def join_segments(content: list[list[str]]):
    return seq(content).map(lambda line: " ".join(line))

In [None]:
@curry
def binary_label_by_user(user_id: int, df: pd.DataFrame):
    cpy = df.copy()
    cpy["label"] = cpy["userId"].apply(lambda x: int(x != f"User{user_id}"))
    return cpy

In [None]:
@curry
def filter_by_user_id(user_id: int, df: pd.DataFrame) -> pd.DataFrame:
    return (df[df["userId"] == f"User{user_id}"]).copy()

In [None]:
def flatten_df(df: pd.DataFrame, value_name: str) -> pd.DataFrame:
    """organize as a simple table format"""
    df = df.reset_index()
    df = df.melt(id_vars="index", var_name="segment", value_name=value_name)
    df.rename(columns={"index": "userId"}, inplace=True)
    return df

### Loading

In [None]:
def get_file_conetent(file: Path):
    with open(file, "r") as f:
        return seq(f.readlines()).map(lambda line: line.strip()).to_list()

In [None]:
def load_label_df(path: Path) -> pd.DataFrame:
    label_df: pd.DataFrame = pd.read_csv(path)
    label_df.set_index("Unnamed: 0", inplace=True)
    label_df.index.name = None
    return label_df

In [None]:
def load_text_data(regex: str):
    files_paths: list[Path] = glob.glob(regex)
    files: list[list[str]] = (
        seq(files_paths)
        .map(get_file_conetent)
        .map(split_into_segments)
        .map(join_segments)
    )
    files = seq(files_paths).map(lambda s: s.split("/")[-1]).zip(files).to_dict()
    df: pd.DataFrame = pd.DataFrame.from_dict(files).transpose()
    new_column_names = {i: f"{i*100}-{(i+1)*100}" for i in df.columns}
    df.rename(columns=new_column_names, inplace=True)
    df.sort_index(inplace=True)
    return df

### Steps:

In [None]:
def train(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> dict[int, Pipeline]:
    for user in models.keys():
        tmp_df = binary_label_by_user(user)(df)
        models[user].fit(tmp_df["text"], tmp_df["label"])
    return models

In [None]:
def test(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> pd.DataFrame:
    results = defaultdict(list)
    for user in models.keys():
        tmp_df = binary_label_by_user(user)(df)
        y_pred = models[user].predict(tmp_df["text"])
        y_label = tmp_df["label"]
        results["preecision"].append(precision_score(y_label, y_pred, average="binary"))
        results["recall"].append(recall_score(y_label, y_pred, average="binary"))
        results["acc"].append(accuracy_score(y_label, y_pred))
    return pd.DataFrame.from_dict(results)

In [None]:
def prediction(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> pd.DataFrame:
    results = []
    for user in models.keys():
        filter_df = filter_by_user_id(user)(df)
        if not filter_df.empty:
            tmp_df = binary_label_by_user(user)(filter_df)
            tmp_df["label"] = models[user].predict(tmp_df["text"])
            results.append(tmp_df)
    return pd.concat(results, axis=0)

## Pipeline

In [None]:
def create_pipline() -> Pipeline:
    return Pipeline(
        [
            (
                "features",
                TfidfVectorizer(
                    sublinear_tf=True,
                    analyzer="word",
                    ngram_range=(2, 2),
                    token_pattern=r"\S+",
                    norm="l2",
                    min_df=0.0,
                    smooth_idf=False,
                    max_features=1000,
                ),
            ),
            ("model", RandomForestClassifier()),
        ]
    )

## Load Data

In [None]:
label_df = load_label_df(LABEL_PATH)
text_df = load_text_data(DATA_REGEX)

In [None]:
combined = pd.merge(
    flatten_df(text_df, value_name="text"),
    flatten_df(label_df, value_name="label"),
    on=["userId", "segment"],
    how="left",
)
combined["segmentIndex"] = combined["segment"].apply(
    lambda s: int(int(s.split("-")[0]) / 100)
)
combined

In [None]:
has_label: pd.Series = combined["label"].notna()
for_validation: pd.Series = combined["segmentIndex"] >= 50
#
validation_df = combined[(has_label) & (for_validation)]
train_df = combined[(has_label) & (~for_validation)]
test_df = combined[~has_label]

print(train_df.shape, validation_df.shape, test_df.shape)

In [None]:
atack_sample: pd.DataFrame = validation_df[
    (validation_df["label"] == 1.0) & (validation_df["userId"] == "User0")
]
benign_sample: pd.DataFrame = validation_df[
    (validation_df["label"] == 0) & (validation_df["userId"] == "User0")
]

atack_sample = atack_sample["text"].apply(lambda s: Counter(s.split(" "))).to_list()
benign_sample = benign_sample["text"].apply(lambda s: Counter(s.split(" "))).to_list()

for i in range(3):
    dict_to_bar_plot(atack_sample[i], title="Attack Segment")
    dict_to_bar_plot(benign_sample[i], title="Benign Segment")
    print("---" * 30)

## Validation

In [None]:
n_users: int = len(combined["userId"].unique())
models = {user: create_pipline() for user in range(n_users)}
models = train(train_df, models=models)
test(validation_df, models=models)

## Prediction

In [None]:
# train model with all of the data
models = train(pd.concat([validation_df, train_df], axis=0), models=models)
pred_df = prediction(test_df, models=models)
# restrcture for label excel
final_df = pd.concat([train_df, validation_df, pred_df], axis=0)[
    ["userId", "segment", "label"]
].pivot(index="userId", columns="segment", values="label")
# save it
final_df.to_csv("./challengeToFill.csv")
final_df.head(5)