## Divide Data into Train & Test

In [None]:
from more_itertools import chunked
from functional import seq
from pathlib import Path
import pandas as pd
import glob

## Utils

In [None]:
def get_file_conetent(file: Path) -> list[str]:
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        file: Path of the file to read (opened in text mode).

    Returns:
        One string per line of the file, in file order, each stripped of
        leading and trailing whitespace (including the trailing newline).
        Blank lines are kept as empty strings; an empty file yields ``[]``.
    """
    with open(file, 'r') as f:
        # Iterate the handle directly: no intermediate readlines() list and no
        # third-party wrapper needed for a plain per-line map.
        return [line.strip() for line in f]

In [None]:
def split_into_segments(content: list[str], *, segment_size: int = 100):
    """Break ``content`` into consecutive segments of ``segment_size`` items.

    Args:
        content: The items (here: lines of one file) to partition, in order.
        segment_size: Keyword-only number of items per segment; defaults to 100.

    Returns:
        A list of segments produced by ``more_itertools.chunked``; the final
        segment is shorter when the input length is not a multiple of
        ``segment_size``.
    """
    segments = [*chunked(content, segment_size)]
    return segments

In [None]:
def join_segments(content: list[list[str]]):
    """Collapse every segment into a single space-separated string.

    Args:
        content: Segments, each a list of strings.

    Returns:
        The ``seq(...).map(...)`` result holding one joined string per segment,
        in the original order. Note this is the sequence object returned by
        ``seq``, not a plain ``list``.
    """
    joined = seq(content).map(' '.join)
    return joined

## Organize data

In [None]:
# Load the label table and use its first CSV column (parsed by pandas as
# 'Unnamed: 0') as the row index, then clear the index name.
label_df: pd.DataFrame = pd.read_csv('./challengeToFill.csv').set_index('Unnamed: 0')
label_df.index.name = None
label_df.head(3)

In [None]:
# glob returns plain path strings (not Path objects) for every raw user file.
files_paths: list[str] = glob.glob('./FraudedRawData/User*')

# Per file: read stripped lines -> split into fixed-size segments -> join each
# segment into one space-separated string.
segments_per_file = (
    seq(files_paths)
    .map(get_file_conetent)
    .map(split_into_segments)
    .map(join_segments)
)

# Key each file's segments by its bare file name. Path(...).name is used
# instead of splitting on '/' so the key is correct on platforms whose path
# separator is not '/'.
files: dict = (
    seq(files_paths)
    .map(lambda s: Path(s).name)
    .zip(segments_per_file)
    .to_dict()
)

# from_dict puts one file per column; transpose so each row is one file and
# each column is one segment position.
text_df: pd.DataFrame = pd.DataFrame.from_dict(files).transpose()

# Rename segment position i to its line range 'start-end'. The factor 100 must
# stay in sync with the default segment_size of split_into_segments.
new_column_names = {i: f'{i*100}-{(i+1)*100}' for i in text_df.columns}
text_df.rename(columns=new_column_names, inplace=True)
text_df.sort_index(inplace=True)
text_df.head(2)

In [None]:
# Reshape from wide (one row per file, one column per segment) to long form:
# one row per (user_id, segment) pair with the segment text in 'text'.
text_df = (
    text_df.reset_index()
    .melt(id_vars='index', var_name='segment', value_name='text')
    .rename(columns={'index': 'user_id'})
)
text_df

In [None]:
# Reshape the label table to long form so it lines up with text_df:
# one row per (user_id, segment) pair with the value in 'label'.
label_df = (
    label_df.reset_index()
    .melt(id_vars='index', var_name='segment', value_name='label')
    .rename(columns={'index': 'user_id'})
)
label_df

In [None]:
# Cast the join key to str on both sides so the merge compares like with like.
for frame in (label_df, text_df):
    frame['user_id'] = frame['user_id'].astype(str)

# Left join: keep every text segment, attaching its label where one exists.
df = text_df.merge(label_df, how='left', on=['user_id', 'segment'])
df

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Rows that carry a label form the training pool; rows whose label is missing
# are the ones still to be predicted.
has_label = df['label'].notna()
train: pd.DataFrame = df[has_label]
test: pd.DataFrame = df[~has_label]

In [None]:
x_train, y_train = train['text'], train['label']
# Fix the seed so the train/validation partition, and every metric computed
# from it, is reproducible across notebook runs.
x_train, x_validation, y_train, y_validation = train_test_split(
    x_train, y_train, random_state=42
)
# TODO: split by user instead of by row, so that all segments of a given user
# land on the same side of the split (avoids information leakage between
# train and validation).

## Models

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Both transformers are fitted on the training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Raw token counts first, then TF-IDF weighting of those counts.
x_train_counts = count_vect.fit_transform(x_train)
X_train_tf = tfidf_transformer.fit_transform(x_train_counts)
X_train_tf

In [None]:
# Reuse the CountVectorizer already fitted on the training split. Creating a
# fresh, unfitted instance here would make transform() fail and would discard
# the training vocabulary that the TF-IDF transformer and the classifier rely on.
x_validation_count = count_vect.transform(x_validation)
X_validation_tf = tfidf_transformer.transform(x_validation_count)
X_validation_tf

### TF-IDF

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
clf = MultinomialNB()
clf.fit(X_train_tf, y_train)

# Predict the validation split and show, element-wise, which predictions match.
y_pred = clf.predict(X_validation_tf)
y_pred == y_validation

### 