In [1]:
from pathlib import Path
from langchain_core.documents import Document
from loguru import logger
from langchain_community.document_loaders.markdown import UnstructuredMarkdownLoader
from langchain_openai import OpenAIEmbeddings
from langchain_core.embeddings import Embeddings
from tqdm import tqdm
from typing import Any

In [9]:
# Collect the markdown files directly under data/ (non-recursive).
# The paths are sorted because glob yields entries in filesystem-dependent
# order, and a later cell indexes the loaded files by position
# (markdown_files[0]); sorting keeps that choice stable across runs.
file_list = sorted(Path("data").glob("*.md"))
logger.info("found %d files" % len(file_list))

[32m2024-04-29 17:13:30.092[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mfound 869 files[0m


In [11]:
def load_markdown_files(files: list[Path]) -> list[list[Document]]:
    """Load each markdown file with ``UnstructuredMarkdownLoader``.

    Args:
        files: Paths of the markdown files to load.

    Returns:
        One entry per input file, in input order. Each entry is the list of
        ``Document`` objects that the loader produced for that file (the
        loader runs in ``mode="elements"``), so the result is a list of
        lists, not a flat list of documents.
    """
    result: list[list[Document]] = []
    for file in tqdm(files):
        loader = UnstructuredMarkdownLoader(file_path=file.as_posix(), mode="elements")
        # append (not extend): keep the documents grouped per source file.
        result.append(loader.load())
    # len(result) counts files, not individual documents.
    logger.info(f"Loaded {len(result)} files")
    return result

In [50]:
from dataclasses import dataclass


@dataclass
class EmbeddedPage:
    """A chunk of text paired with its embedding vector and an identifier."""

    # Embedding vector for `text`.
    embedding: list[float]
    # Identifier string for the chunk; embed_doc fills it with
    # "<source>-<index:04d>" and it is later used as the collection id.
    metadata: str
    # The raw text that was embedded.
    text: str

In [51]:
def embed_doc(doc: list[Document], embeddings: Embeddings) -> list[EmbeddedPage]:
    """Embed every element of one loaded markdown file.

    Args:
        doc: The ``Document`` elements of a single file, i.e. one entry of
            the list returned by ``load_markdown_files``. Every element's
            metadata must contain a ``"source"`` key.
        embeddings: Embedding backend. Its ``embed_documents`` method is
            called once with the texts of all elements.

    Returns:
        One ``EmbeddedPage`` per element, in input order. The ``metadata``
        field holds the identifier ``"<source>-<index:04d>"``, where index
        is the element's position within ``doc``. Returns an empty list for
        empty input without calling the embedding backend.

    Raises:
        KeyError: If an element's metadata has no ``"source"`` entry.
        AssertionError: If the backend returns a different number of
            vectors than texts supplied.
    """
    if not doc:
        # Nothing to embed; skip the backend call entirely.
        return []

    texts: list[str] = [element.page_content for element in doc]
    # Read the sources up front so a missing key fails before the
    # embedding request is made.
    sources: list[Any] = [element.metadata["source"] for element in doc]
    vectors: list[list[float]] = embeddings.embed_documents(texts)
    assert len(vectors) == len(texts), "embedding count does not match text count"

    # The id is built from plain local names so the f-string needs no nested
    # quotes (reusing the outer quote inside a replacement field only parses
    # on Python 3.12+).
    return [
        EmbeddedPage(embedding=vector, metadata=f"{source}-{index:04d}", text=text)
        for index, (text, source, vector) in enumerate(zip(texts, sources, vectors))
    ]

In [34]:
# Shared embedding backend: used to embed the document chunks (embed_doc)
# and the query text in the search cell, so both live in the same vector space.
# NOTE(review): presumably needs OpenAI credentials from the environment — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [35]:
# Load every file found in file_list; markdown_files[i] holds the loaded
# documents for file_list[i].
markdown_files = load_markdown_files(file_list)
# len(markdown_files) is the number of files (load_markdown_files logs the
# same count itself, so this line repeats that information).
logger.info("Loaded %d markdown files" % len(markdown_files))

100%|██████████| 869/869 [00:09<00:00, 95.48it/s] 
[32m2024-04-29 17:36:06.850[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_markdown_files[0m:[36m6[0m - [1mLoaded 869 files[0m
[32m2024-04-29 17:36:06.862[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mLoaded 869 markdown files[0m


In [53]:
#
# for testing, create embeddings for one file
#
# Only the first loaded file is embedded; `pages` is what gets inserted into
# the collection further down.
pages = embed_doc(markdown_files[0], embeddings)
# NOTE(review): displaying the whole EmbeddedPage prints its full embedding
# vector; consider showing pages[0].text / pages[0].metadata instead.
pages[0]

EmbeddedPage(embedding=[-0.018368314444937923, -0.0014883779975705216, -0.016793466641159424, -0.065584300139296, -0.015939810804880935, 0.04247672738674959, -0.022077300908090097, -0.013562822340295096, -0.011333014503405818, -0.021782937790051877, 0.03299820704624241, -0.0015665684627338909, 0.0025756851990760992, -0.00910320666651654, -0.008691097369940481, 0.005979269112356386, 0.023401939595874836, -0.0036685114180065974, 0.024594112552236, -0.006788770015267866, -0.007097852220530547, 0.016013402050051764, -0.016366639654342728, 0.013356767226345793, -0.01163473786354818, 0.005596595661922875, 0.013445076161757259, -0.019766544377893495, -0.007123609342604848, -0.014593096047396517, -0.00017247891334445388, -0.010633899536928034, 0.0055082867265114095, -0.02316644723879916, -0.005224961177661754, -0.009382853025636673, -0.0018222971782291663, -0.01430609130881734, -0.0017533056187121507, 0.0177648677246532, -0.004511128520943333, 0.0049305971283009845, 0.013342048604782606, -0.02

In [46]:
# NOTE(review): this import lives outside the notebook's top import cell.
import chromadb

# Vector-store client used to create and query the collection below.
# NOTE(review): presumably an in-memory (non-persistent) client — confirm
# against the chromadb documentation.
client = chromadb.Client()

In [None]:
# get_or_create_collection keeps this cell re-runnable: create_collection
# fails when a collection named "ragsc" already exists on the client, which
# happens whenever the cell is executed a second time in the same kernel.
collection = client.get_or_create_collection(name="ragsc")

In [54]:
# Insert all pages with a single batched add() instead of one call per page.
# Each page's identifier string (page.metadata) is used as the record id.
# The guard skips the call when there is nothing to insert.
if pages:
    collection.add(
        ids=[page.metadata for page in pages],
        documents=[page.text for page in pages],
        embeddings=[page.embedding for page in pages],
    )

In [60]:
# Embed the question with the same backend used for the stored pages, then
# ask the collection for the 5 nearest entries to that query vector.
results = collection.query(
    query_embeddings=[embeddings.embed_query("why did Martha reach out")],
    n_results=5
)

In [61]:
# Display the raw query response. No metadatas were passed when the pages
# were added, so only ids, distances and documents carry information here.
results

{'ids': [['data/demetra-20220420.md-0012',
   'data/demetra-20220420.md-0032',
   'data/demetra-20220420.md-0026',
   'data/demetra-20220420.md-0040',
   'data/demetra-20220420.md-0010']],
 'distances': [[0.19989287853240967,
   0.4622857868671417,
   0.46320241689682007,
   0.48250025510787964,
   0.4864859879016876]],
 'metadatas': [[None, None, None, None, None]],
 'embeddings': None,
 'documents': [['Martha reached out because Cecile has cold feet',
   'Patricia',
   'rationale for the rest?',
   'to act as mentors',
   'CMARC']],
 'uris': None,
 'data': None}