In [None]:
# Import necessary libraries
from typing import Iterator
from bs4 import BeautifulSoup
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader

class CustomHTMLLoader(BaseLoader):
    """
    Custom document loader that parses an HTML file to extract text and embedded metadata.

    The loader is designed for the cleaned HTML (.htm) documents: it extracts the text of the
    <body> element (or of the whole document when no <body> is present) and collects metadata
    from <meta> tags that define both a 'name' and a 'content' attribute.
    """

    def __init__(self, file_path: str) -> None:
        """
        Initializes the document loader with a specific HTML file path.

        Args:
            file_path (str): The filesystem path to the HTML file that will be processed by this loader.
        """
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily loads content and metadata from the HTML file.

        The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser'. A single
        Document is yielded whose page content is the extracted text and whose metadata maps
        each <meta> tag's 'name' to its 'content'. If several <meta> tags share a name, the
        last one in document order wins.

        Yields:
            Document: An object containing the extracted content and metadata.

        Raises:
            OSError: If the file cannot be opened (e.g. it does not exist).
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(self.file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        soup = BeautifulSoup(html_content, 'html.parser')

        # Look up <body> once and reuse the result; fall back to the text of the whole
        # document when the file has no <body> element.
        body = soup.find('body')
        main_content = body.text if body is not None else soup.text

        # Collect metadata only from <meta> tags that carry both 'name' and 'content';
        # tags such as <meta charset="..."> are skipped.
        metadata = {
            meta['name']: meta['content']
            for meta in soup.find_all('meta')
            if 'name' in meta.attrs and 'content' in meta.attrs
        }

        # Optionally include the path to the HTML source file in metadata for reference in debugging or logging.
        # Leave the following line commented out in production to avoid exposing file paths.
        # metadata['source'] = self.file_path

        yield Document(page_content=main_content, metadata=metadata)


In [None]:
"""
This script processes HTML documents from a designated directory by first loading the documents using the above custom loader,
and then dividing the content of each document into smaller, overlapping chunks. 

The script sets each chunk to 1024 characters with a 128-character overlap between chunks to ensure continuity of context.
You can adjust the chunk size and overlap as needed based on the requirements of your LLM application.
"""

from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Define the directory path for cleaned HTML documents. Modify the path as per your local setup.
cleaned_filings_dir = "path/to/cleaned_filings_directory"

# Initialize the text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=128)

# List to collect all document objects (one per processed '.htm' file)
documents = []

# os.listdir() returns entries in arbitrary order; sorting keeps the order of the loaded
# documents (and therefore of the resulting chunks) reproducible across runs and platforms.
for filename in sorted(os.listdir(cleaned_filings_dir)):
    if not filename.endswith('.htm'):
        continue

    # Generate the full path for each HTML file
    file_path = os.path.join(cleaned_filings_dir, filename)

    # Skip directories or other non-file entries whose name happens to end in '.htm';
    # the loader would otherwise fail when it tries to open them.
    if not os.path.isfile(file_path):
        continue

    # Create an instance of the HTML loader for the current file
    loader = CustomHTMLLoader(file_path)

    # lazy_load() returns an iterator of documents; 'extend' consumes it and appends
    # every yielded document to the list.
    documents.extend(loader.lazy_load())

# After collecting all documents, split them into chunks using the defined text splitter
chunks = text_splitter.split_documents(documents)

# Report the number of documents and chunks processed
print(f"Processed {len(documents)} documents into {len(chunks)} chunks.")
