## Divide Data into Train & Test

In [50]:
from more_itertools import chunked
from functional import seq
from pathlib import Path
import pandas as pd
import glob
import random
import numpy as np
from toolz import curry

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
)

In [52]:
# Seed value reused for the global RNGs and for the classifier's random_state.
SEED = 23
# CSV file with the per-user / per-segment labels; a real Path to match its annotation.
LABEL_PATH: Path = Path("./challengeToFill.csv")
# Glob pattern (not a regular expression, despite the name) matching the raw per-user files.
DATA_REGEX: str = "./FraudedRawData/User*"

In [53]:
# Pin the global NumPy and standard-library random generators to the same seed
# so that anything drawing from them behaves the same on every run.
np.random.seed(SEED)
random.seed(SEED)

## Utils

### Transforms

In [54]:
def split_into_segments(content: list[str], *, segment_size: int = 100):
    """Cut ``content`` into consecutive pieces of ``segment_size`` items.

    Args:
        content: Items to split, kept in their original order.
        segment_size: Number of items per piece (keyword-only, default 100).

    Returns:
        A list holding every piece yielded by ``chunked(content, segment_size)``.
    """
    pieces = chunked(content, segment_size)
    return [*pieces]

In [55]:
def join_segments(content: list[list[str]]):
    """Collapse every segment into a single space-separated string.

    Args:
        content: Segments, each one a collection of strings.

    Returns:
        The ``seq`` pipeline produced by mapping ``" ".join`` over ``content``
        (one joined string per segment), not a plain list.
    """
    join_with_space = " ".join
    return seq(content).map(join_with_space)

In [56]:
@curry
def binary_label_by_user(user_id: int, df: pd.DataFrame):
    """Return a copy of ``df`` relabelled as "this user" (0) versus "anyone else" (1).

    Args:
        user_id: Numeric id; rows are compared against the string ``f"User{user_id}"``.
        df: Frame with a ``userId`` column. It is not modified.

    Returns:
        A copy of ``df`` whose ``label`` column (created or overwritten) is 0 where
        ``userId`` equals ``f"User{user_id}"`` and 1 everywhere else.
    """
    owner = f"User{user_id}"

    def is_other_user(row_user) -> int:
        # 1 marks a row that does NOT belong to the requested user.
        return int(row_user != owner)

    relabelled = df.copy()
    relabelled["label"] = relabelled["userId"].apply(is_other_user)
    return relabelled

In [57]:
@curry
def filter_by_user_id(user_id: int, df: pd.DataFrame) -> pd.DataFrame:
    """Return an independent copy of the rows of ``df`` that belong to one user.

    Args:
        user_id: Numeric id; rows are matched against ``f"User{user_id}"``.
        df: Frame with a ``userId`` column.

    Returns:
        A copy of the matching rows (empty when the user has none).
    """
    belongs_to_user = df["userId"] == f"User{user_id}"
    return df.loc[belongs_to_user].copy()

In [58]:
def flatten_df(df: pd.DataFrame, value_name: str) -> pd.DataFrame:
    """Reshape a wide table (one row per user, one column per segment) into long form.

    Args:
        df: Wide frame whose index holds the user ids and whose columns are the
            segment names. It is not modified.
        value_name: Name given to the column that receives the cell values.

    Returns:
        A frame with the columns ``userId``, ``segment`` and ``value_name``,
        one row per (user, segment) cell.
    """
    return (
        # Name the index explicitly so reset_index() always yields a "userId"
        # column, whether or not the incoming index already carried a name.
        df.rename_axis("userId")
        .reset_index()
        .melt(id_vars="userId", var_name="segment", value_name=value_name)
    )

### Loading

In [59]:
def get_file_conetent(file: Path):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the file to read.

    Returns:
        A plain list with one stripped string per line, in file order
        (blank lines become empty strings).
    """
    # NOTE(review): no encoding is passed, so the platform default applies.
    with open(file, "r") as f:
        # Iterate the handle directly instead of materialising readlines() first.
        return [line.strip() for line in f]

In [60]:
def load_label_df(path: Path) -> pd.DataFrame:
    """Read the label CSV and index it by its first, header-less column.

    Args:
        path: Location of the CSV file. Its first column must be the one pandas
            names ``"Unnamed: 0"`` (i.e. it has an empty header).

    Returns:
        The CSV content indexed by that column, with the index name cleared.
    """
    return (
        pd.read_csv(path)
        .set_index("Unnamed: 0")
        # Drop the placeholder name so the index is anonymous.
        .rename_axis(None)
    )

In [61]:
def load_text_data(regex: str, *, segment_size: int = 100):
    """Load every matching file into a table of per-segment text.

    Args:
        regex: Glob pattern handed to ``glob.glob`` (despite its name it is not
            a regular expression).
        segment_size: Number of lines per segment (keyword-only, default 100).
            It drives both the splitting and the column names, so the two can
            never disagree.

    Returns:
        A DataFrame with one row per file (indexed by the file's base name and
        sorted by it) and one column per segment, named ``"<start>-<end>"``.
        Each cell holds that segment's lines joined by single spaces.
    """
    text_by_file: dict[str, list[str]] = {}
    for file_path in glob.glob(regex):
        segments = split_into_segments(
            get_file_conetent(file_path), segment_size=segment_size
        )
        # Path(...).name takes the last path component using the OS separator,
        # unlike splitting on "/" which only works with POSIX-style paths.
        text_by_file[Path(file_path).name] = list(join_segments(segments))

    # from_dict builds one column per file; transpose to get one row per file.
    df: pd.DataFrame = pd.DataFrame.from_dict(text_by_file).transpose()
    segment_names = {
        i: f"{i * segment_size}-{(i + 1) * segment_size}" for i in df.columns
    }
    return df.rename(columns=segment_names).sort_index()

### Steps:

In [62]:
def train(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> dict[int, Pipeline]:
    """Fit each per-user pipeline on ``df`` relabelled for that user.

    Args:
        df: Frame with ``userId`` and ``text`` columns.
        models: Pipelines keyed by numeric user id (keyword-only). They are
            fitted in place.

    Returns:
        The same ``models`` mapping that was passed in, now fitted.
    """
    for user_id, pipeline in models.items():
        # Target is 0 for this user's rows and 1 for everybody else's.
        relabelled = binary_label_by_user(user_id, df)
        pipeline.fit(relabelled["text"], relabelled["label"])
    return models

In [63]:
def test(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> pd.DataFrame:
    """Score every per-user pipeline on ``df`` relabelled for that user.

    Args:
        df: Frame with ``userId`` and ``text`` columns.
        models: Fitted pipelines keyed by numeric user id (keyword-only).

    Returns:
        A frame with one row per entry of ``models`` (in iteration order) and
        the columns ``preecision``, ``recall`` and ``acc``.
    """
    results: dict[str, list] = {}
    for user_id, pipeline in models.items():
        relabelled = binary_label_by_user(user_id, df)
        predicted = pipeline.predict(relabelled["text"])
        expected = relabelled["label"]
        # NOTE(review): "preecision" is misspelled; kept as-is because it is
        # the name of a returned column.
        scores = {
            "preecision": precision_score(expected, predicted, average="binary"),
            "recall": recall_score(expected, predicted, average="binary"),
            "acc": accuracy_score(expected, predicted),
        }
        for metric, value in scores.items():
            results.setdefault(metric, []).append(value)
    return pd.DataFrame.from_dict(results)

In [64]:
def prediction(df: pd.DataFrame, *, models: dict[int, Pipeline]) -> pd.DataFrame:
    """Predict a label for every row of ``df`` using that row's own user model.

    Args:
        df: Frame with ``userId`` and ``text`` columns. It is not modified.
        models: Fitted pipelines keyed by numeric user id (keyword-only).

    Returns:
        The rows of ``df`` that belong to a user present in ``models``, grouped
        in ``models`` iteration order, with ``label`` set to the pipeline's
        prediction. When no row matches any model, an empty frame with the
        columns of ``df`` plus ``label`` is returned instead of raising.
    """
    predicted_frames = []
    for user_id, pipeline in models.items():
        # filter_by_user_id already returns an independent copy, so the label
        # column can be written directly without relabelling the rows first.
        user_rows = filter_by_user_id(user_id, df)
        if user_rows.empty:
            continue
        user_rows["label"] = pipeline.predict(user_rows["text"])
        predicted_frames.append(user_rows)

    if not predicted_frames:
        # pd.concat([]) raises ValueError; hand back an empty, same-schema frame.
        empty = df.iloc[0:0].copy()
        if "label" not in empty.columns:
            empty["label"] = pd.Series(dtype="int64")
        return empty
    return pd.concat(predicted_frames, axis=0)

## Pipeline

In [65]:
def create_pipline() -> Pipeline:
    """Build a fresh, unfitted TF-IDF + random-forest pipeline.

    Returns:
        A two-step ``Pipeline``: ``"features"`` is a ``TfidfVectorizer`` over
        bigrams of whitespace-delimited tokens (``max_features=1000``), and
        ``"model"`` is a ``RandomForestClassifier`` seeded with ``SEED``.
    """
    vectorizer = TfidfVectorizer(
        # Tokens are runs of non-whitespace characters, combined into bigrams.
        analyzer="word",
        token_pattern=r"\S+",
        ngram_range=(2, 2),
        # Vocabulary selection.
        min_df=0.0,
        max_features=1000,
        # Weighting.
        sublinear_tf=True,
        smooth_idf=False,
        norm="l2",
    )
    classifier = RandomForestClassifier(random_state=SEED)
    return Pipeline([("features", vectorizer), ("model", classifier)])

## Load Data

In [66]:
# Wide tables indexed by user: raw text per segment, and the label sheet.
text_df = load_text_data(DATA_REGEX)
label_df = load_label_df(LABEL_PATH)

In [67]:
# Long-format text rows, left-joined with their labels; rows that have no
# matching label keep NaN in `label`.
combined = flatten_df(text_df, value_name="text").merge(
    flatten_df(label_df, value_name="label"),
    how="left",
    on=["userId", "segment"],
)
# Segment names look like "<start>-<end>"; the index is the start divided by 100.
combined["segmentIndex"] = combined["segment"].map(
    lambda name: int(name.split("-")[0]) // 100
)
combined

Unnamed: 0,userId,segment,text,label,segmentIndex
0,User0,0-100,cat nawk nawk uname pwd echo echo ksh uname st...,0.0,0
1,User1,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...,0.0,0
2,User10,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname env csh...,0.0,0
3,User11,0-100,touch touch cat ls sed ln rm sed ln rm chmod s...,0.0,0
4,User12,0-100,cpp sh xrdb mkpts test [ stty tset [ uname env...,0.0,0
...,...,...,...,...,...
5995,User5,14900-15000,ls mc lc sh ls sh ex sh netstat netscape netsc...,0.0,149
5996,User6,14900-15000,cc1 as gcc gcc uname nawk ld_ nm ld gcc gcc un...,0.0,149
5997,User7,14900-15000,sh ld64_ driver sh gmake netscape netscape net...,0.0,149
5998,User8,14900-15000,sh grep nawk sh grep nawk sh grep sh grep sh g...,0.0,149


In [68]:
# Row masks: is a label present, and does the row fall in segment 50 or later.
has_label: pd.Series = combined["label"].notna()
for_validation: pd.Series = combined["segmentIndex"].ge(50)

# Labelled rows are split by segment index; unlabelled rows are left to predict.
train_df = combined.loc[has_label & ~for_validation]
validation_df = combined.loc[has_label & for_validation]
test_df = combined.loc[~has_label]

print(train_df.shape, validation_df.shape, test_df.shape)

(2000, 5) (1000, 5) (3000, 5)


## Validation

In [69]:
# One pipeline per user id 0 .. n_users - 1.
# NOTE(review): this presumes the users are named User0..User{n_users - 1} — confirm.
n_users: int = combined["userId"].nunique(dropna=False)
models = train(
    train_df,
    models={user_id: create_pipline() for user_id in range(n_users)},
)
test(validation_df, models=models)

Unnamed: 0,preecision,recall,acc
0,0.974026,1.0,0.976
1,0.906344,1.0,0.907
2,0.969828,1.0,0.972
3,0.92688,1.0,0.929
4,0.965665,1.0,0.968
5,0.939457,1.0,0.942
6,0.949367,1.0,0.952
7,0.927835,1.0,0.93
8,0.932642,1.0,0.935
9,0.929752,1.0,0.932


## Prediction

In [70]:
# Refit every pipeline on all labelled rows, then label the remaining ones.
labelled_rows = [validation_df, train_df]
models = train(pd.concat(labelled_rows), models=models)
pred_df = prediction(test_df, models=models)

# Back to the wide layout: one row per user, one column per segment.
final_df = (
    pd.concat([train_df, validation_df, pred_df])
    .loc[:, ["userId", "segment", "label"]]
    .pivot(index="userId", columns="segment", values="label")
)
final_df.to_csv("./final_result.csv")
final_df.head(5)

Unnamed: 0,userId,segment,text,label,segmentIndex
2002,User10,5000-5100,tcsh rshd rdistd tcsh rshd rdistd tcsh rshd rd...,1,50
2042,User10,5100-5200,sh launchef launchef sh launchef sh launchef h...,0,51
2082,User10,5200-5300,rlogin rlogin csh tput launchef sh launchef ne...,0,52
2122,User10,5300-5400,rshd rdistd tcsh rshd rdistd tcsh rshd rdistd ...,1,53
2162,User10,5400-5500,true true grep date lp find mkdir expr generic...,1,54
...,...,...,...,...,...
5833,User39,14500-14600,cat nawk nawk tset uname xdpyinfo nawk xdpyinf...,1,145
5873,User39,14600-14700,grep sed tcsh cat grep sed tcsh cat grep sed t...,0,146
5913,User39,14700-14800,msort tcsh cat sort tcsh cat grep sed tcsh cat...,0,147
5953,User39,14800-14900,elm elm ls more vim sh sh elm frm ksh sendmail...,1,148


In [81]:
# `final_df` is already built and written to ./final_result.csv by the previous
# cell; rebuilding and rewriting it here was an exact duplicate, so only preview it.
final_df.head(5)

segment,0-100,100-200,1000-1100,10000-10100,10100-10200,10200-10300,10300-10400,10400-10500,10500-10600,10600-10700,...,9000-9100,9100-9200,9200-9300,9300-9400,9400-9500,9500-9600,9600-9700,9700-9800,9800-9900,9900-10000
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
User0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User10,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
User11,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
User12,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
