In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tiktoken import get_encoding
import glob, os


In [33]:
def count_token(text:str) -> int:
    """Return the number of ``cl100k_base`` tokens in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The length of the token sequence produced by the tiktoken
        ``cl100k_base`` encoding for ``text``.
    """
    encoder = get_encoding('cl100k_base')
    # By default tiktoken raises ValueError when the input contains the literal
    # text of a special token (e.g. "<|endoftext|>"). A length function must not
    # abort on arbitrary document content, so such substrings are encoded as
    # ordinary text instead.
    tokens = encoder.encode(text, disallowed_special=())
    return len(tokens)

def save_chunks(chunks:list, chunk_folder:str, file_name:str):
    """Write every chunk to its own UTF-8 text file inside ``chunk_folder``.

    Files are named ``{file_name}_{idx}.txt`` where ``idx`` is the chunk's
    position in ``chunks`` (starting at 0). Existing files with the same name
    are overwritten. A summary line with the number of chunks is printed.

    Args:
        chunks: Sequence of strings to write, one file per element.
        chunk_folder: Destination directory; created if it does not exist.
        file_name: Stem shared by all files written for this document.
    """
    # Create the destination on demand so the helper does not depend on the
    # caller having prepared the folder beforehand.
    os.makedirs(chunk_folder, exist_ok=True)

    for idx, chunk in enumerate(chunks):
        chunk_path = os.path.join(chunk_folder, f"{file_name}_{idx}.txt")
        with open(chunk_path, 'w', encoding='utf-8') as f:
            f.write(chunk)
    print(f"Saved {len(chunks)} chunks")

In [34]:
text_splitter = RecursiveCharacterTextSplitter(
    # Lengths are computed by count_token, so chunk_size / chunk_overlap are
    # expressed in whatever unit that function returns.
    length_function=count_token,
    chunk_size=512,
    chunk_overlap=20,
    # Separators are literal strings, not regular expressions.
    is_separator_regex=False,
    separators=[
        # Whitespace and ASCII punctuation
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
        # Unicode break characters and CJK punctuation
        "\u200b",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        # Empty string as the last entry
        "",
    ],
)

## Chunk Luật tài chính


In [None]:
text_folder = '../data/raw/'
chunk_folder = '../data/chunk/TaiChinh'
# Upper bound on how many source files are chunked in one run.
# Set to None to process every matched file.
max_files = 2

os.makedirs(chunk_folder, exist_ok=True)

# Match .txt files exactly one directory level below text_folder. The previous
# pattern "*/**.txt" matched the same set: without recursive=True, glob treats
# "**" like a plain "*".
# glob returns paths in arbitrary order, so sort them to make the subset
# selected by max_files reproducible across runs and machines.
text_files = sorted(glob.glob(os.path.join(text_folder, "*/*.txt")))

for file in text_files[:max_files]:
    # Chunk files are named after the source file's stem only.
    # NOTE(review): two source files with the same name in different
    # sub-folders would overwrite each other's chunks — confirm names are unique.
    file_name = os.path.splitext(os.path.basename(file))[0]

    # Read the raw document
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    # Split into chunks and write them to chunk_folder
    chunks = text_splitter.split_text(text)
    save_chunks(chunks, chunk_folder, file_name)
