In [1]:
from pathlib import Path
from langchain_core.documents import Document
from loguru import logger
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_openai import OpenAIEmbeddings
from langchain_core.embeddings import Embeddings
from tqdm import tqdm
from typing import Any

In [9]:
# Collect the markdown sources. Path.glob does not guarantee any ordering of
# its results, so sort them to keep positions such as file_list[0] stable
# across runs and machines.
file_list = sorted(Path("data").glob("*.md"))
logger.info("found %d files" % len(file_list))

[32m2024-04-29 17:13:30.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mfound 869 files[0m


In [11]:
def load_markdown_files(files: list[Path]) -> list[list[Document]]:
    """Load each markdown file with UnstructuredMarkdownLoader in "elements" mode.

    Args:
        files: Paths of the markdown files to load.

    Returns:
        One entry per input file, in input order. Each entry is the value
        returned by ``loader.load()`` for that file, i.e. a list of
        ``Document`` objects (presumably one per parsed element), not a
        single ``Document``.
    """
    result: list[list[Document]] = []
    for file in tqdm(files):
        loader = UnstructuredMarkdownLoader(file_path=file.as_posix(), mode="elements")
        # append (not extend): keep each file's documents grouped together so
        # callers can address them per file as result[i].
        result.append(loader.load())
    logger.info(f"Loaded {len(result)} files")
    return result

In [4]:
from dataclasses import dataclass


@dataclass
class EmbeddedPage:
    embedding: list[float]
    metadata: dict[str, Any]

In [24]:
def embed_doc(doc: list[Document], embeddings: Embeddings) -> list[EmbeddedPage]:
    """Embed every Document of one loaded file.

    Args:
        doc: The ``Document`` objects belonging to a single file. Despite the
            singular name this is a list; it is iterated element by element.
        embeddings: Embedding backend. Its ``embed_documents`` method is
            called once with the texts of all elements.

    Returns:
        One ``EmbeddedPage`` per element, in input order. The ``metadata``
        field holds the string ``"<source>-<index>"``, where ``<source>`` is
        the element's ``metadata["source"]`` and ``<index>`` is the element
        position zero-padded to four digits. An empty ``doc`` yields ``[]``
        without calling the backend.

    Raises:
        KeyError: If an element's metadata has no ``"source"`` key.
        ValueError: If the backend returns a different number of vectors
            than texts were submitted.
    """
    if not doc:
        # Nothing to embed; skip the backend call entirely.
        return []

    texts = [element.page_content for element in doc]
    # Resolved before the backend call so a missing key fails before any
    # embedding request is made.
    sources = [element.metadata["source"] for element in doc]
    vectors = embeddings.embed_documents(texts)

    # strict=True turns a vector/text count mismatch into an error instead of
    # silently dropping or mis-pairing pages.
    return [
        # The metadata field receives a string identifier, not the element's
        # metadata dict.
        EmbeddedPage(vector, metadata=f"{source}-{index:04d}")  # type: ignore
        for index, (vector, source) in enumerate(zip(vectors, sources, strict=True))
    ]

In [25]:
# Embedding backend passed to embed_doc in the cells below.
# NOTE(review): presumably requires OpenAI credentials to be available in the
# environment (e.g. OPENAI_API_KEY) — confirm before a fresh run.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [22]:
# Load every file in file_list. load_markdown_files logs the number of loaded
# files itself; the message below reports the same count a second time.
markdown_files = load_markdown_files(file_list)
logger.info("Loaded %d markdown files" % len(markdown_files))

100%|██████████| 869/869 [00:07<00:00, 112.18it/s]
[32m2024-04-29 17:26:57.589[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_markdown_files[0m:[36m6[0m - [1mLoaded 869 files[0m
[32m2024-04-29 17:26:57.590[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoaded 869 markdown files[0m


In [26]:
#
# Smoke test: create embeddings for the first loaded file only.
#
first_file_docs = markdown_files[0]
pages = embed_doc(first_file_docs, embeddings)
# The log line reports the page count together with the metadata of the
# file's first element.
logger.info('found %d pages in %s' % (len(pages), first_file_docs[0].metadata))

[32m2024-04-29 17:27:38.938[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mfound 42 pages in {'source': 'data/demetra-20220420.md', 'last_modified': '2022-04-20T14:50:27', 'page_number': 1, 'languages': ['eng'], 'filetype': 'text/markdown', 'file_directory': 'data', 'filename': 'demetra-20220420.md', 'category': 'Title'}[0m


In [27]:
# Show what embed_doc stored in the metadata field of the first page.
pages[0].metadata

'data/demetra-20220420.md-0000'

In [28]:
# Same check for the second page; only the index suffix should differ.
pages[1].metadata

'data/demetra-20220420.md-0001'

In [30]:
# Show the metadata of the last page. Negative indexing avoids hard-coding
# the page count: index 41 is only valid when the embedded file yields
# exactly 42 pages.
pages[-1].metadata

'data/demetra-20220420.md-0041'