In [1]:
import os
import json
import regex as re
from tqdm.auto import tqdm
tqdm.pandas()
import math
import pandas as pd
import string
from pyvi.ViTokenizer import tokenize

In [2]:
# Pattern used by basic_tokenizer to break a whitespace-delimited fragment at
# any of the characters . , ! ? " / ' : ; ) ( -- the capturing group makes
# `.split()` return the matched character as its own list item.
_WORD_SPLIT = re.compile("([.,!?\"/':;)(])")
# Bytes pattern matching a single digit; not referenced in the visible code.
_DIGIT_RE = re.compile(br"\d")
# Ten punctuation strings obtained by splitting the literal on single spaces.
# Not referenced by any active code in the visible cells (only by a
# commented-out line inside basic_tokenizer, under the name `stop_words`).
STOP_WORDS = "\" \' [ ] . , ! : ; ?".split(" ")

def basic_tokenizer(sentence):
    """Turn a sentence into a list of lowercase tokens without punctuation.

    The sentence is split on whitespace, every fragment is split again with
    the module-level ``_WORD_SPLIT`` pattern, and the surviving pieces are
    lowercased.

    Args:
        sentence: The string to tokenize.

    Returns:
        A list of lowercase token strings, in their original order.
    """
    tokens = []
    for fragment in sentence.strip().split():
        for piece in _WORD_SPLIT.split(fragment):
            # `string.punctuation` is a str, so `in` is a substring test: it
            # drops single punctuation characters and also any multi-character
            # piece that happens to be a contiguous slice of that string
            # (e.g. "<=>"), while a piece such as "--" is kept.
            # NOTE(review): confirm this substring behaviour is intended.
            if piece == '' or piece == ' ' or piece in string.punctuation:
                continue
            tokens.append(piece.lower())
    return tokens

def create_sliding_window(text, size=256, overlap=32):
    """Cut a token sequence into overlapping windows, each joined into a string.

    Consecutive windows start ``size - overlap`` tokens apart, so every window
    after the first begins with the last ``overlap`` tokens of its predecessor.

    Args:
        text: Sequence of string tokens (for example the result of
            ``str.split``). It is sliced and joined, never modified.
        size: Maximum number of tokens in one window. Must be positive.
        overlap: Number of tokens shared by neighbouring windows. Must satisfy
            ``0 <= overlap < size``.

    Returns:
        A list of windows, each a single-space-joined string of at most
        ``size`` tokens. Empty input yields an empty list; any non-empty input
        yields at least one window.

    Raises:
        ValueError: If ``size`` is not positive or ``overlap`` is outside
            ``[0, size)`` (the stride between windows would be <= 0).
    """
    if size <= 0 or not 0 <= overlap < size:
        raise ValueError(
            f"need size > 0 and 0 <= overlap < size, got size={size}, overlap={overlap}"
        )
    stride = size - overlap
    n_tokens = len(text)
    if n_tokens == 0:
        return []
    # A further window is only needed while tokens remain beyond the `overlap`
    # tokens the previous window already covers. Counting ceil(n / stride)
    # instead would append a final window that merely repeats the tail of the
    # one before it whenever the leftover is <= overlap.
    n_windows = max(1, -(-(n_tokens - overlap) // stride))  # integer ceil
    return [
        " ".join(text[start:start + size])
        for start in range(0, n_windows * stride, stride)
    ]

In [None]:
# Folder holding the corpus files, given relative to the working directory.
data_path = "../../legal"
# Read the corpus table; the windowing cell uses its "text" and "cid" columns.
corpus_csv_path = os.path.join(data_path, "corpus.csv")
corpus_df = pd.read_csv(corpus_csv_path)

In [None]:
# Break every corpus document into overlapping word windows. The four lists
# are filled in lockstep, so position k in each list describes the same window:
# its raw text, its tokenized/lowercased text, the source document's `cid`,
# and the index label of the source row in `corpus_df`.
all_texts = []
all_bm25_texts = []
all_cids = []
all_index = []
# Walk the two needed columns directly instead of `iterrows()`, which would
# build a throwaway Series for every row of the corpus.
corpus_rows = zip(corpus_df.index, corpus_df["text"], corpus_df["cid"])
for i, raw_text, cid in tqdm(corpus_rows, total=corpus_df.shape[0]):
    # Split on single spaces only (not arbitrary whitespace), so runs of
    # spaces produce empty tokens that still count towards the window size.
    # NOTE(review): a missing (NaN) "text" value would raise here -- confirm
    # the corpus has none.
    text = raw_text.split(" ")
    sliding_windows = create_sliding_window(text, size=256)
    bm25_windows = [" ".join(basic_tokenizer(w)) for w in sliding_windows]
    n_windows = len(sliding_windows)
    all_texts.extend(sliding_windows)
    all_bm25_texts.extend(bm25_windows)
    all_cids.extend([cid] * n_windows)
    all_index.extend([i] * n_windows)

  0%|          | 0/261597 [00:00<?, ?it/s]

In [5]:
# Assemble one row per sliding window from the parallel lists built by the
# windowing cell. Column order: source cid, raw window text, tokenized window
# text, and index label of the originating corpus row.
df = pd.DataFrame(
    {
        "cid": all_cids,
        "text": all_texts,
        "bm25_text": all_bm25_texts,
        "i": all_index,
    }
)

In [None]:
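# Disabled alternative: compute the "bm25_text" column in parallel with
# pandarallel. Not needed as written, because the windowing loop already
# fills "bm25_text" for every window.
# from pandarallel import pandarallel

# pandarallel.initialize(progress_bar=True, use_memory_fs=False, nb_workers=4)
# df["bm25_text"] = df["text"].parallel_apply(lambda x: " ".join(basic_tokenizer(x)))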

In [6]:
# Persist the windowed corpus without the DataFrame index column.
# NOTE(review): "lagal" looks like a typo for "legal"; the name is kept as-is
# because other code may read this exact path -- confirm before renaming.
output_csv_path = "../processed/lagal_2024_10_31_cleaned_v2.csv"
df.to_csv(output_csv_path, index=False)