# Chunking

In [3]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
import pickle
from langchain.document_loaders import PyPDFLoader

In [4]:
# Base data directory and the sub-directory where pickled chunks are stored.
path = "../data/Constitución Española/"
chunk_path = f"{path}chunks/"

# Load the Spanish Constitution PDF through LangChain's PDF loader.
loader = PyPDFLoader("../data/Constitución Española/Constitución española.pdf")
documents = loader.load()

# Expose each document's 'source' metadata entry under a 'filename' key too.
for doc in documents:
    meta = doc.metadata
    meta['filename'] = meta['source']

### 1. CharacterSplitter

In [29]:
# Chunk sizes to generate; sizes are measured with len() (see length_function).
chunk_sizes = [200, 300, 400]

for i in chunk_sizes:
    # One splitter per chunk size, using an empty, non-regex separator.
    text_splitter = CharacterTextSplitter(
        chunk_size=i,
        # Overlap is 10% of the chunk size. Integer division keeps the value
        # an int (i / 10 would pass a float such as 20.0 to the splitter).
        chunk_overlap=i // 10,
        length_function=len,
        separator='',
        is_separator_regex=False,
    )

    texts = text_splitter.split_documents(documents)

    # One pickle per chunk size, named after the splitter and the size.
    file_path = chunk_path + f"documents_charactersplitter_{i}.pkl"

    # Persist the chunked documents so later cells can reload them.
    with open(file_path, "wb") as file:
        pickle.dump(texts, file)


In [21]:
# Registry of chunked document lists, keyed by "<splitter name>_<chunk size>".
splitters = {}
chunk_sizes = [200, 300, 400]

for size in chunk_sizes:
    key = f"charactersplitter_{size}"

    # Pickle written by the CharacterTextSplitter cell for this chunk size.
    file_path = chunk_path + f"documents_{key}.pkl"

    # NOTE: pickle.load must only be used on trusted, locally produced files.
    with open(file_path, "rb") as fh:
        splitters[key] = pickle.load(fh)

### 2. RecursiveCharacterSplitter

In [47]:
# Chunk sizes to generate; sizes are measured with len() (see length_function).
chunk_sizes = [200, 300, 400]

for i in chunk_sizes:
    # One recursive splitter per chunk size, using the library's default
    # separators (none are passed here).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=i,
        # Overlap is 10% of the chunk size. Integer division keeps the value
        # an int (i / 10 would pass a float such as 20.0 to the splitter).
        chunk_overlap=i // 10,
        length_function=len,
        is_separator_regex=False,
    )

    texts = text_splitter.split_documents(documents)

    # One pickle per chunk size, named after the splitter and the size.
    file_path = chunk_path + f"documents_recursivecharactersplitter_{i}.pkl"

    # Persist the chunked documents so later cells can reload them.
    with open(file_path, "wb") as file:
        pickle.dump(texts, file)

In [22]:
# Reload the recursive-splitter chunks for every size in chunk_sizes and
# register them in the shared `splitters` dictionary.
for size in chunk_sizes:
    key = f"recursivecharactersplitter_{size}"
    file_path = chunk_path + f"documents_{key}.pkl"

    # NOTE: pickle.load must only be used on trusted, locally produced files.
    with open(file_path, "rb") as fh:
        splitters[key] = pickle.load(fh)

### 3. SpanishArticleSplitter

In [30]:
from bs4 import BeautifulSoup
import re
from langchain.docstore.document import Document

class SpanishArticleSplitter():
    """Build one document per article from an HTML file.

    An article is recognised by an ``<h5 class="articulo">`` heading; its
    text is taken from the ``<p>`` elements of the closest enclosing
    ``<div>`` of that heading.
    """

    def create_documents_from_html(self, file_path):
        """Parse ``file_path`` and return one ``Document`` per article.

        Args:
            file_path: Path of the UTF-8 encoded HTML file to parse.

        Returns:
            A list of ``Document`` objects. ``page_content`` holds the
            article paragraphs joined by newlines; ``metadata`` holds
            ``article_name`` (the heading text) plus ``filename`` and
            ``source`` (both set to ``file_path``).

        Raises:
            OSError: If the file cannot be opened.
            AttributeError: If an article heading has no enclosing ``<div>``.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        documents = []
        for heading in soup.find_all('h5', class_='articulo'):
            container = heading.find_parent('div')

            # Drop paragraphs mentioning 'Bloque' (presumably block markers
            # in the source HTML -- TODO confirm) instead of blanking them
            # and slicing off one character, which removed a real character
            # whenever the first paragraph was not such a marker.
            paragraphs = [
                p.text for p in container.find_all('p')
                if 'Bloque' not in p.text
            ]
            article_text = (
                '\n'.join(paragraphs).replace('\n\n', '\n').lstrip('\n')
            )

            metadata = {
                # Use the matched heading itself rather than the first <h5>
                # found in the container, which may be a different heading.
                'article_name': heading.text,
                'filename': file_path,
                'source': file_path,
            }
            documents.append(
                Document(page_content=article_text, metadata=metadata)
            )
        return documents

In [31]:
# HTML edition of the Constitution, split into one document per article.
file_path = "../data/Constitución Española/BOE-A-1978-31229 Constitución Española.html"
splitter = SpanishArticleSplitter()
texts = splitter.create_documents_from_html(file_path)

# Destination pickle (plain string: there are no placeholders to format).
file_path = chunk_path + "documents_spanisharticlesplitter.pkl"

# Persist the article documents so later cells can reload them.
with open(file_path, "wb") as file:
    pickle.dump(texts, file)

In [32]:
# Pickle written by the SpanishArticleSplitter cell (plain string: there are
# no placeholders to format).
file_path = chunk_path + "documents_spanisharticlesplitter.pkl"

# Reload the article documents into the shared `splitters` dictionary.
# NOTE: pickle.load must only be used on trusted, locally produced files.
with open(file_path, "rb") as file:
    splitters["spanisharticlesplitter"] = pickle.load(file)