In [1]:
#
# test out cleaning up markdown files
#

In [2]:
#
# read the files
#
from pathlib import Path
from loguru import logger

data_path = Path("data")
md_files = list(data_path.glob("*.md"))
logger.info(
    "found %d markdown files"
    % len(
        md_files,
    )
)

[32m2024-05-22 17:28:22.444[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mfound 0 markdown files[0m


In [3]:
import markdown
from bs4 import BeautifulSoup


def markdown_to_text(markdown_str: str) -> str:
    html = markdown.markdown(markdown_str)
    soup = BeautifulSoup(html, features="html.parser")
    lines = soup.get_text().split("\n")
    lines = [line.strip().lower() for line in lines if len(line) > 0]
    return "\n".join(lines)

In [4]:
input_file = md_files[0]
with open(input_file, mode="rt", encoding="utf-8") as f:
    data = f.read()
txt = markdown_to_text(data)
txt

IndexError: list index out of range

In [5]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(separator="\n", chunk_size=256, chunk_overlap=20)
docs = text_splitter.create_documents([txt])

In [6]:
docs

[Document(page_content="cdfm-20221130.md\nla proposition du gouvernement\nnotre proposition\non aurait besoin des garanties plus solides\ninquiétude pour la qualité\non va au délà de ce qu'on a fait par le passé\ncontre-positions?\nudem\n- automne de 2024"),
 Document(page_content="- automne de 2024\n- conditions qui doivent être réalisées\n- l'offre est intéressant\nlaval\n- faut être prudent\n- une occasion dans la crise\n- est-ce qu'il y a un risque que l'uq s'embarque dans la même direction\n- il faut leur donner quelque chose"),
 Document(page_content='contact priviligé\nfmoq -- tient beaucoup à cette solution\nil faut travailler avec eux\nles gains sont intéressants\nconclusion')]

In [7]:
from langchain_openai import OpenAI
from dotenv import load_dotenv

load_dotenv()
openai = OpenAI()

In [8]:
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings()
txts = [txt.page_content for txt in docs]
embeddings = embedding_model.embed_documents(txts)

In [9]:
#
# try langchain's markdown splitter
#
from dataclasses import dataclass, field
from typing import Any
from datetime import datetime
from langchain_text_splitters.markdown import MarkdownTextSplitter
from langchain_core.documents import Document


@dataclass
class MarkdownPage:
    texts: list[str] = field(default_factory=list)
    chunks: list[Document] = field(default_factory=list)
    meta: dict[str, Any] = field(default_factory=dict)
    embeddings: list[list[float]] = field(default_factory=list)
    ids: list[str] = field(default_factory=list)

    def __str__(self):
        buffer = []
        if self.chunks:
            s = f"File:{self.meta["filename"]}"
            buffer.append(s)
            for i in range(len(self.chunks)):
                buffer.append(
                    f"   {self.chunks[i].page_content[0:10]}..{self.chunks[i].page_content[-10:]} [{self.embeddings[i][0]}..{self.embeddings[i][-1]}] {self.ids[i]}"
                )
        elif self.texts:
            s = f"File:{self.meta["filename"]}"
            buffer.append(s)
            for i in range(len(self.texts)):
                buffer.append(f"    {self.texts[i][0:10]}...{self.texts[i][-10:]}")
        else:
            buffer.append("Empty page!")
        return "\n".join(buffer)


md_files = list(Path("data").glob("*.md"))
input_files = md_files[1:10]
logger.debug("processing %d files" % len(input_files))
pages: list[MarkdownPage] = []
for input in input_files:
    page = MarkdownPage()
    modified: datetime = datetime.fromtimestamp(input.stat().st_mtime)
    page.meta = {"filename": input.as_posix(), "created": f"{modified}"}
    with open(input) as f:
        txt = f.read()
        page.texts.append(txt.lower())
    pages.append(page)
logger.debug("obtained %d pages" % len(pages))
splitter = MarkdownTextSplitter(chunk_size=256, chunk_overlap=25)
for i, page in enumerate(pages):
    page.chunks = splitter.create_documents(page.texts)
    logger.debug(f"Page {i}: {len(page.chunks)} chunks")

[32m2024-05-02 08:43:16.727[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [34m[1mprocessing 9 files[0m
[32m2024-05-02 08:43:16.730[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m47[0m - [34m[1mobtained 9 pages[0m
[32m2024-05-02 08:43:16.731[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 0: 7 chunks[0m
[32m2024-05-02 08:43:16.732[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 1: 3 chunks[0m
[32m2024-05-02 08:43:16.732[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 2: 24 chunks[0m
[32m2024-05-02 08:43:16.733[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 3: 4 chunks[0m
[32m2024-05-02 08:43:16.734[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 4: 25 chunks[0m
[32m2024-05-02 08:43:1

In [10]:
from langchain_openai.embeddings import OpenAIEmbeddings

model = OpenAIEmbeddings()

for page in pages:
    chunks = [chunk.page_content for chunk in page.chunks]
    page.embeddings = model.embed_documents(chunks)
    # logger.debug("%d chunks -> %d embeddings" % (len(chunks), len(page.embeddings)))
    prefix = f"{page.meta["filename"]}"
    for i in range(len(chunks)):
        page.ids.append(f"{prefix}({i})")

In [11]:
pages[0]

MarkdownPage(texts=['---\ntitle: tb rounds\nkeywords: tb\n---\n# tb rounds\n\n## case 1\n\n20 yi hiv- haiti\n\nhospitalized aug\n- severe fatigue, weakness, losss of weith\n- amonorhea, anemia,\n- ascitis\n- peritoneal granulomas\n- tb\n- rmp, inh, pyz, eth\n- sent for outpatient after 2 weeks \n\n- developed headaches and nausea\n- found unwell, readmitted \n- ct head, multiple lesions brain\n  - some midline shift\n\n- added levoflox, increased rif 600\n- dex for edema\n- pan-sensitive\n\n1 month later, ct improving\ndid not tolerate tapering of dex\nreadmitted\n  - recurrent swelling of brain\n  - added thalidomide high dose in hope of tapering steroids\n\n- unable to taper either dex, thalidomide\n\n- question about whether fluoroquinolones are appropriate for cns tb\n\n- question about dex and effect on drug levels of inh, rif\n\n- thalidomide is not supported by evidence \n- proposal to move to infleximab as an alternative to steroids \n\n- some concern about "low tnf" tb, assumi