In [1]:
#
# test out cleaning up markdown files
#

In [2]:
#
# read the files
#
from pathlib import Path 
from loguru import logger

data_path = Path('data')
md_files = list(data_path.glob("*.md"))
logger.info("found %d markdown files" % len(md_files,))

[32m2024-05-01 19:37:21.406[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m9[0m - [1mfound 869 markdown files[0m


In [3]:
import markdown
from bs4 import BeautifulSoup

def markdown_to_text(markdown_str:str)->str:
    html = markdown.markdown(markdown_str)
    soup = BeautifulSoup(html, features='html.parser')
    lines = soup.get_text().split("\n")
    lines = [line.strip().lower() for line in lines if len(line) > 0]
    return "\n".join(lines)

In [4]:
input_file = md_files[0]
with open(input_file,mode="rt", encoding="utf-8") as f:
    data = f.read()
txt = markdown_to_text(data)
txt

"demetra\ncovid\ntaking a big toll, especially if they can't work from home\nmunaca\nno acute crises\nnothing critical this week\nbut two tcp courses next week that will be hard to replace\nmm is working on alternate plans\nmay is the worst possible timing\ncritical accreditation visits\ncmarc\nanimal services approved enough to keep working\nmartha reached out because cecile has cold feet\nmay 2,3 coming in over a weekend\ngoing to meet with a lot of people\ndirector of operations\njarrod doing a great job\ntwo other candidates\npeople are engaged and hopeful that there can be real change\nhr\n- em needs to leave\nwell\nfinance\nother than 10m..\ncatherine wants $750k\nhuge bill related to it systems\nthis element of the budget needs to be rediscussed\naccreditation needed\nrationale for the rest?\nco\ndirector search underway: marie-eve\nother positions\nihpp\nsace\npatricia\ncreate aqi --> both education and systems that support the faculty\nspgh\nabsolutely no movement on the campu

In [5]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=256,
    chunk_overlap=20
)
docs = text_splitter.create_documents([txt])

In [6]:
docs

[Document(page_content="demetra\ncovid\ntaking a big toll, especially if they can't work from home\nmunaca\nno acute crises\nnothing critical this week\nbut two tcp courses next week that will be hard to replace\nmm is working on alternate plans\nmay is the worst possible timing"),
 Document(page_content='critical accreditation visits\ncmarc\nanimal services approved enough to keep working\nmartha reached out because cecile has cold feet\nmay 2,3 coming in over a weekend\ngoing to meet with a lot of people\ndirector of operations\njarrod doing a great job'),
 Document(page_content='two other candidates\npeople are engaged and hopeful that there can be real change\nhr\n- em needs to leave\nwell\nfinance\nother than 10m..\ncatherine wants $750k\nhuge bill related to it systems\nthis element of the budget needs to be rediscussed'),
 Document(page_content='accreditation needed\nrationale for the rest?\nco\ndirector search underway: marie-eve\nother positions\nihpp\nsace\npatricia\ncreate a

In [7]:
from langchain_openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
openai = OpenAI()

In [8]:
from langchain_openai import OpenAIEmbeddings

embedding_model = OpenAIEmbeddings()
txts = [txt.page_content for txt in docs]
embeddings = embedding_model.embed_documents(txts)


In [37]:
#
# try langchain's markdown splitter
#
from dataclasses import dataclass, field
from typing import Any
from datetime import datetime
from langchain_text_splitters.markdown import MarkdownTextSplitter
from langchain_core.documents import Document

@dataclass
class MarkdownPage:
    texts:list[str] = field(default_factory=list)
    chunks:list[Document] = field(default_factory=list)
    meta: dict[str,Any] = field(default_factory=dict)
    embeddings: list[list[float]] = field(default_factory=list)
    ids: list[str] = field(default_factory=list)

    def __str__(self):
        buffer = []
        if self.chunks:
            s = f"File:{self.meta["filename"]}"
            buffer.append(s)
            for i in range(len(self.chunks)):
                buffer.append(f"   {self.chunks[i].page_content[0:10]}..{self.chunks[i].page_content[-10:]} [{self.embeddings[i][0]}..{self.embeddings[i][-1]}] {self.ids[i]}")
        elif self.texts:
            s = f"File:{self.meta["filename"]}"
            buffer.append(s)
            for i in range(len(self.texts)):
                buffer.append(f"    {self.texts[i][0:10]}...{self.texts[i][-10:]}")
        else:
            buffer.append("Empty page!")
        return "\n".join(buffer)
    
splitter = MarkdownTextSplitter()
md_files = list(Path('data').glob("*.md"))
input_files = md_files[1:10]
logger.debug("processing %d files" % len(input_files))
pages: list[MarkdownPage] = []
for input in input_files:
    page = MarkdownPage()
    modified: datetime = datetime.fromtimestamp(input.stat().st_mtime)
    page.meta = {"filename":input.as_posix(),"created":f"{modified}"}
    with open(input) as f:
        txt = f.read()
        page.texts.append(txt.lower())
    pages.append(page)
logger.debug("obtained %d pages" % len(pages))
splitter = MarkdownTextSplitter(chunk_size=256, chunk_overlap=25)
for i, page in enumerate(pages):
    page.chunks = splitter.create_documents(page.texts)
    logger.debug(f"Page {i}: {len(page.chunks)} chunks")
    
    


[32m2024-05-01 20:36:40.739[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m37[0m - [1mprocessing 9 files[0m
[32m2024-05-01 20:36:40.742[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m47[0m - [1mobtained 9 pages[0m
[32m2024-05-01 20:36:40.743[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 0: 5 chunks[0m
[32m2024-05-01 20:36:40.744[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 1: 2 chunks[0m
[32m2024-05-01 20:36:40.745[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 2: 20 chunks[0m
[32m2024-05-01 20:36:40.745[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 3: 0 chunks[0m
[32m2024-05-01 20:36:40.746[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36m<module>[0m:[36m51[0m - [34m[1mPage 4: 3 chunks[0m
[32m2024-05-01 20:36:40.747[0m | [34m[1m

In [38]:
from langchain_openai.embeddings import OpenAIEmbeddings
model = OpenAIEmbeddings()

for page in pages:
    chunks = [chunk.page_content for chunk in page.chunks]
    page.embeddings = model.embed_documents(chunks)
    # logger.debug("%d chunks -> %d embeddings" % (len(chunks), len(page.embeddings)))
    prefix = f"{page.meta["filename"]}"
    for i in range(len(chunks)):
        page.ids.append(f"{prefix}({i})")

File:data/arsenault-20220524-discipline.md
   # discipli..l'étudiant [-0.0003534394492312683..-0.0135494694164122] data/arsenault-20220524-discipline.md(0)
   désagréabl.. réligieux [-0.00593647298521481..-0.012724847262800624] data/arsenault-20220524-discipline.md(1)
   ## je fais..amilariser [0.009180410078334486..-0.017850069804628554] data/arsenault-20220524-discipline.md(2)
   j'ai reçu ..e taquines [-0.02339526939282548..0.004376598034155303] data/arsenault-20220524-discipline.md(3)
   les vieux ..t présente [-0.014342402217993483..-0.0009903166807309937] data/arsenault-20220524-discipline.md(4)
