## Divide Data into Train & Test

In [1]:
from more_itertools import chunked
from functional import seq
from pathlib import Path
import pandas as pd
import glob

## Utils

In [2]:
def get_file_conetent(file: Path) -> list[str]:
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Location of the file to read. It is handed straight to
            ``open``, so a plain string path works as well as a ``Path``.

    Returns:
        One string per line, in file order, each passed through
        ``str.strip()``. Blank lines are kept as empty strings; an empty
        file yields an empty list.
    """
    # An explicit encoding keeps the result independent of the machine's
    # locale; without it ``open`` picks a platform-dependent default.
    with open(file, 'r', encoding='utf-8') as f:
        # Iterating the handle yields the same lines as ``readlines()``
        # without first building a second, unstripped list.
        return [line.strip() for line in f]

In [3]:
def split_into_segments(content: list[str], *, segment_size: int = 100):
    """Break ``content`` into consecutive segments of ``segment_size`` items.

    Args:
        content: Items to partition, kept in their original order.
        segment_size: Keyword-only number of items per segment
            (100 when omitted).

    Returns:
        A list holding every chunk produced by
        ``more_itertools.chunked(content, segment_size)``.
    """
    segments = [segment for segment in chunked(content, segment_size)]
    return segments

In [4]:
def join_segments(content: list[list[str]]):
    """Collapse every segment into a single space-separated string.

    Args:
        content: Segments, each one a list of string tokens.

    Returns:
        The object produced by ``seq(content).map(...)`` with one joined
        string per segment, in the same order. No ``to_list()`` call is made
        here, so the caller receives the ``seq`` result itself.
    """
    def _as_text(segment):
        # Tokens of one segment, glued together with single spaces.
        return ' '.join(segment)

    return seq(content).map(_as_text)

In [5]:
def split_train_test(content: list[str], train_size: int = 50):
    """Cut ``content`` into a leading train part and a trailing test part.

    Args:
        content: Ordered items to divide.
        train_size: Slice position of the cut (50 when omitted). Ordinary
            slice rules apply, so a value past the end leaves the test part
            empty.

    Returns:
        A ``(train, test)`` tuple: the items before the cut and the items
        from the cut onwards.
    """
    train_part = content[:train_size]
    test_part = content[train_size:]
    return train_part, test_part

## Organize data

In [6]:
# Load the label table. The column pandas reads as 'Unnamed: 0' (the user
# identifiers in the displayed output) becomes the row index, and the index
# name is cleared.
label_df: pd.DataFrame = (
    pd.read_csv('./challengeToFill.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)
label_df.head(3)

Unnamed: 0,0-100,100-200,200-300,300-400,400-500,500-600,600-700,700-800,800-900,900-1000,...,14000-14100,14100-14200,14200-14300,14300-14400,14400-14500,14500-14600,14600-14700,14700-14800,14800-14900,14900-15000
User0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
User1,0,0,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
User2,0,0,0,0,0,0,0,0,0,0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [7]:
# Number of lines grouped into one segment. It is passed to
# split_into_segments and reused for the column labels so the two cannot
# drift apart.
SEGMENT_SIZE = 100

# glob.glob returns plain strings, not Path objects.
files_paths: list[str] = glob.glob('./FraudedRawData/User*')
if not files_paths:
    # Fail loudly here instead of building an empty frame that only breaks
    # in a later cell.
    raise FileNotFoundError("No files matched './FraudedRawData/User*'")

# Map each file name (e.g. 'User0') to its per-segment texts. Path(...).name
# extracts the last path component whatever separator glob produced.
files = {
    Path(path).name: join_segments(
        split_into_segments(get_file_conetent(path), segment_size=SEGMENT_SIZE)
    )
    for path in files_paths
}

# from_dict yields one column per user; transpose so each user is a row and
# each positional segment index is a column.
text_df: pd.DataFrame = pd.DataFrame.from_dict(files).transpose()

# Relabel positional segment indices as '<start>-<end>' line ranges.
new_column_names = {
    i: f'{i * SEGMENT_SIZE}-{(i + 1) * SEGMENT_SIZE}' for i in text_df.columns
}
text_df = text_df.rename(columns=new_column_names).sort_index()
text_df.head(2)

Unnamed: 0,0-100,100-200,200-300,300-400,400-500,500-600,600-700,700-800,800-900,900-1000,...,14000-14100,14100-14200,14200-14300,14300-14400,14400-14500,14500-14600,14600-14700,14700-14800,14800-14900,14900-15000
User0,cat nawk nawk uname pwd echo echo ksh uname st...,xgvis ls ls sh sh xgvis sh sh xgvis Sqpe sendm...,uname pwd echo echo ksh ls sendmail movemail m...,mywsh mywsh xset cat nawk nawk uname pwd echo ...,led uname uname pwd echo echo ksh ls ksh ls ls...,ul sh man man col col neqn nroff xwsh ksh move...,sh ls sh sh sh xgvis sh sh xgvis rm sh ls sh s...,sh egrep sed sh sed sh sh sh sed sh sed sh sh ...,help sh less sh less sh less rm sh sh find cat...,rm sh sh find cat sed help sh less rm sh sh sh...,...,sendmail ksh cat more sendmail sendmail sendma...,true grep date lp find tail ls sed FIFO cat ge...,awk cat post rm generic ln ln generic lp sh ge...,sendmail sendmail sendmail sh MediaMai sendmai...,hostname id nawk getopt true true true grep da...,nawk getopt true true grep date lp find expr g...,generic gethost download enscript ksh hostname...,sed FIFO cat generic ls generic cat generic ls...,ls acroread acroread acroread expr cat acrorea...,ksh ksh nawk sendmail deroff sort spell spell ...
User1,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...,id nawk getopt true true grep date lp find mkd...,find mkdir expr generic cat file ppost awk ppo...,sh MediaMai sendmail emacs-20 ls hostname id n...,generic generic date generic gethost download ...,tcpostio tcpostio tcpostio cat generic ls gene...,id nawk getopt true grep date lp find mkdir ex...,netscape mkpts hostname stty .java_wr expr exp...,expr expr dirname basename egrep egrep egrep e...,egrep egrep egrep expr expr expr dirname java ...,...,ps ps grep ps grep grep ps grep grep ps grep g...,tcsh make tcsh hostname stty fec driver tcsh m...,MediaMai hostname stty hostname stty telnet te...,tcsh make tcsh hostname stty fec driver tcsh m...,make tcsh hostname stty fec be driver tcsh ld_...,tcsh xterm emacs-20 netscape netscape cat mail...,tail ls sed FIFO generic hostname id nawk geto...,netscape netscape hostname id nawk getopt true...,id nawk getopt true true grep date lp find tai...,LOCK true ls sed FIFO cat date generic generic...


In [8]:
# Reshape the wide text table (one row per user, one column per segment)
# into long form: one row per (user_id, segment) pair with its text.
text_df = (
    text_df
    .reset_index()
    .melt(id_vars='index', var_name='segment', value_name='text')
    .rename(columns={'index': 'user_id'})
)
text_df

Unnamed: 0,user_id,segment,text
0,User0,0-100,cat nawk nawk uname pwd echo echo ksh uname st...
1,User1,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...
2,User10,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname env csh...
3,User11,0-100,touch touch cat ls sed ln rm sed ln rm chmod s...
4,User12,0-100,cpp sh xrdb mkpts test [ stty tset [ uname env...
...,...,...,...
5995,User5,14900-15000,ls mc lc sh ls sh ex sh netstat netscape netsc...
5996,User6,14900-15000,cc1 as gcc gcc uname nawk ld_ nm ld gcc gcc un...
5997,User7,14900-15000,sh ld64_ driver sh gmake netscape netscape net...
5998,User8,14900-15000,sh grep nawk sh grep nawk sh grep sh grep sh g...


In [9]:
# Reshape the wide label table (one row per user, one column per segment)
# into long form: one row per (user_id, segment) pair with its label.
label_df = (
    label_df
    .reset_index()
    .melt(id_vars='index', var_name='segment', value_name='label')
    .rename(columns={'index': 'user_id'})
)
label_df

Unnamed: 0,user_id,segment,label
0,User0,0-100,0.0
1,User1,0-100,0.0
2,User2,0-100,0.0
3,User3,0-100,0.0
4,User4,0-100,0.0
...,...,...,...
5995,User35,14900-15000,
5996,User36,14900-15000,
5997,User37,14900-15000,
5998,User38,14900-15000,


In [11]:
# Cast both key columns to str so the join compares values of the same type.
text_df['user_id'] = text_df['user_id'].astype(str)
label_df['user_id'] = label_df['user_id'].astype(str)

# Left join: every text row is kept, and rows without a matching label row
# get NaN in 'label'. validate='one_to_one' makes pandas raise a MergeError
# if (user_id, segment) is duplicated on either side, instead of silently
# multiplying rows.
df = pd.merge(
    text_df,
    label_df,
    on=['user_id', 'segment'],
    how='left',
    validate='one_to_one',
)
df

Unnamed: 0,user_id,segment,text,label
0,User0,0-100,cat nawk nawk uname pwd echo echo ksh uname st...,0.0
1,User1,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname stty en...,0.0
2,User10,0-100,cpp sh xrdb cpp sh xrdb mkpts hostname env csh...,0.0
3,User11,0-100,touch touch cat ls sed ln rm sed ln rm chmod s...,0.0
4,User12,0-100,cpp sh xrdb mkpts test [ stty tset [ uname env...,0.0
...,...,...,...,...
5995,User5,14900-15000,ls mc lc sh ls sh ex sh netstat netscape netsc...,0.0
5996,User6,14900-15000,cc1 as gcc gcc uname nawk ld_ nm ld gcc gcc un...,0.0
5997,User7,14900-15000,sh ld64_ driver sh gmake netscape netscape net...,0.0
5998,User8,14900-15000,sh grep nawk sh grep nawk sh grep sh grep sh g...,0.0
