# Note Extraction 🦜

In [None]:
import dotenv

dotenv.load_dotenv(dotenv.find_dotenv())

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.documents import Document
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

from pathlib import Path
import yaml
import random
from tqdm import tqdm
from typing import Dict, Optional

random.seed(1337)

## Load and parse notes

In [None]:
# Location of the Obsidian vault, resolved to an absolute path.
_VAULT_DIR = "../obsidian"
VAULT_PATH = Path(_VAULT_DIR).resolve()

In [None]:
# Fail loudly with a real exception: an `assert` is stripped when Python runs
# with -O and would let the notebook continue with a missing vault.
if not VAULT_PATH.is_dir():
    raise FileNotFoundError(f"Vault directory not found: {VAULT_PATH}")

# Collect every Markdown file under the vault (recursively). Sorting makes the
# processing order independent of the filesystem's directory listing order.
note_paths = sorted(VAULT_PATH.rglob("*.md"))
print(f"Found {len(note_paths)} notes in the vault")

# Chat model shared by the structured-output chains built further down.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
)

In [None]:
class Note(BaseModel):
    """A Markdown note read from the vault, split into frontmatter and body."""

    # File name without its extension (``path.stem``).
    name: str
    # Parsed YAML frontmatter mapping, or None when the note has none.
    frontmatter: Optional[Dict]
    # Note text with the frontmatter block (if any) removed.
    body: str
    # POSIX-style path of the note relative to VAULT_PATH.
    relative_path: str

    @classmethod
    def from_path(cls, path: Path) -> "Note":
        """Build a Note from a Markdown file.

        A leading block delimited by two ``---`` lines is parsed as YAML
        frontmatter when it is empty or parses to a mapping. In every other
        case (no leading ``---``, no closing ``---``, malformed YAML, or YAML
        that is not a mapping) the file is treated as having no frontmatter
        and its full text becomes the body.

        Args:
            path: Location of the Markdown file; must be inside VAULT_PATH.

        Returns:
            The parsed note.

        Raises:
            ValueError: If ``path`` is not located under VAULT_PATH.
        """
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with path.open("r", encoding="utf-8") as f:
            lines = f.readlines()

        # Defaults cover every "no usable frontmatter" case, so `frontmatter`
        # is always bound when the model is constructed below.
        frontmatter = None
        body_lines = lines

        if lines and lines[0].strip() == "---":
            # Index of the closing delimiter, or None if the block never ends
            # (e.g. the leading `---` is just a horizontal rule).
            closing_index = next(
                (
                    index
                    for index, line in enumerate(lines[1:], start=1)
                    if line.strip() == "---"
                ),
                None,
            )

            if closing_index is not None:
                block = "".join(lines[1:closing_index])
                try:
                    parsed = yaml.safe_load(block)
                    # An empty block loads as None; anything other than a
                    # mapping would not fit the `frontmatter` field.
                    is_frontmatter = parsed is None or isinstance(parsed, dict)
                except yaml.YAMLError:
                    is_frontmatter = False

                if is_frontmatter:
                    frontmatter = parsed
                    body_lines = lines[closing_index + 1:]

        return cls(
            name=path.stem,
            frontmatter=frontmatter,
            body="".join(body_lines),
            relative_path=path.relative_to(VAULT_PATH).as_posix(),
        )

In [None]:
# Parse every discovered Markdown file into a Note, keeping the order of note_paths.
notes = list(map(Note.from_path, note_paths))

## Extract categories and summaries

In [None]:
# Directory that holds the incrementally saved extraction results.
persist_path = Path("notes_with_categories")

# Category name -> description shown to the model when it classifies a note.
note_categories = {
    "Article Draft": "Draft of an article I've been writing for my newsletter. Contains some original writing.",
    "Call Note": "Notes taken from the meeting I've had. Has mentions of agenda and-or feedback.",
    "Diary Entry": "Note that contains musings about my feelings, emotions, events that happened in my life, reflections etc.",
    "General Note": "Any note that doesn't fit in other categories.",
    "Paper Highlights": "Highlights from a scientific paper I read. Usually refers to AI or computer vision.",
}

# One "- <name>: <description>" bullet per category, in dictionary order.
category_prompt = "\n".join(
    f"- {label}: {meaning}" for label, meaning in note_categories.items()
)


# Schema handed to `llm.with_structured_output`; the field descriptions below
# are part of what the model is asked to fill in, so their wording matters.
class Info(BaseModel):
    """Information about a note."""

    # One of the category names listed in `category_prompt`, or None.
    category: Optional[str] = Field(
        description=f"Can be one of the following:\n {category_prompt}"
    )
    # Short summary of the note, or None.
    summary: Optional[str] = Field(
        description="Concise one or two sentence summary of note's contents to be used for similarity search."
    )
    # Key concepts of the note, or None. (Description typo "withing" fixed.)
    extract: Optional[str] = Field(
        description="The essential concepts within the note, used to link it to semantically similar ones."
    )


# System instruction for the extraction chain; unknown attributes must come
# back as null rather than being invented.
_extraction_system_message = (
    "You are an expert extraction algorithm. "
    "Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked "
    "to extract, return null for the attribute's value."
)

# Two-message chat prompt: the fixed system instruction followed by the note
# text supplied through the `text` variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _extraction_system_message),
        ("human", "{text}"),
    ]
)


class NoteWithInfo(Note):
    """A Note together with the category, summary and extract produced for it."""

    # All three values come from `Info`, where each one is Optional, so they
    # must be allowed to be None here as well. A required `summary` would make
    # validation fail whenever the model returns null for it.
    category: Optional[str]
    summary: Optional[str]
    extract: Optional[str]

    @classmethod
    def from_note(
        cls,
        note: Note,
        category: Optional[str],
        summary: Optional[str],
        extract: Optional[str],
    ) -> "NoteWithInfo":
        """Combine a parsed note with its extracted information.

        Args:
            note: The note whose name, frontmatter, body and path are copied.
            category: Extracted category, or None.
            summary: Extracted summary, or None.
            extract: Extracted key concepts, or None.

        Returns:
            A new NoteWithInfo carrying the note's fields plus the extra ones.
        """
        return cls(
            name=note.name,
            frontmatter=note.frontmatter,
            body=note.body,
            relative_path=note.relative_path,
            category=category,
            summary=summary,
            extract=extract,
        )


# Extraction chain: the prompt feeds a model constrained to return `Info`.
_info_llm = llm.with_structured_output(schema=Info)
runnable = prompt | _info_llm

In [None]:
persist_path.mkdir(exist_ok=True, parents=True)
all_notes_path = persist_path.joinpath("all_notes.jsonl")

# 1. Load existing progress
# Maps a note's relative path to its already-processed NoteWithInfo, so an
# interrupted run can resume without re-querying the model.
notes_with_infos = {}

if all_notes_path.exists():
    with all_notes_path.open("r", encoding="utf-8") as f:
        # Stream the file line by line; each non-blank line is one JSON record.
        for line in f:
            if not line.strip():
                continue
            saved_note = NoteWithInfo.parse_raw(line)
            notes_with_infos[saved_note.relative_path] = saved_note
else:
    # Create an empty progress file.
    all_notes_path.touch()

# 2. Process notes
for note in tqdm(notes):
    # Skip if already exists. The lookup key must be `relative_path`, the same
    # attribute used to store entries; `Note` has no `location` attribute.
    if note.relative_path in notes_with_infos:
        continue

    # Process and convert
    response = runnable.invoke({"text": note.body})

    note_with_info = NoteWithInfo.from_note(
        note,
        category=response.category,
        summary=response.summary,
        extract=response.extract,
    )
    notes_with_infos[note_with_info.relative_path] = note_with_info

    # Save progress after every note so a crash loses at most the current one.
    with all_notes_path.open("a", encoding="utf-8") as f:
        f.write(f"{note_with_info.json()}\n")

## Find related notes

In [None]:
# Only flat scalar values are copied into the document metadata; other fields
# (for example the frontmatter dict or None values) are left out.
_METADATA_TYPES = (str, int, float, bool)

docs = []
doc_ids = []

for note in notes_with_infos.values():
    # Notes without any extract text have nothing to embed.
    if not note.extract:
        continue
    doc = Document(
        page_content=note.extract,
        metadata={
            k: v for k, v in note.dict().items() if isinstance(v, _METADATA_TYPES)
        },
    )
    docs.append(doc)
    # `relative_path` is the key of `notes_with_infos`, so it is unique.
    doc_ids.append(note.relative_path)

# Stable ids keep re-runs against the persisted directory from adding a second
# copy of every note. Entries stored earlier under auto-generated ids are not
# removed by this. NOTE(review): relies on Chroma upserting on matching ids.
db = Chroma.from_documents(
    docs,
    OpenAIEmbeddings(model="text-embedding-ada-002"),
    ids=doc_ids,
    persist_directory="notes_vectorstore_2",
)

In [None]:
# Spot-check the vector store with one document used as its own query.
# The index is clamped so vaults with fewer than 78 documents fall back to the
# last one instead of raising IndexError (an empty `docs` list still raises).
sample_index = min(77, len(docs) - 1)
doc = docs[sample_index]
print(doc.metadata["name"])
query = doc.page_content
# Left as the final expression so the notebook displays the search results.
db.similarity_search(query)

In [None]:
# Schema handed to `llm.with_structured_output` for the relevance chain.
# NOTE(review): the class docstring is presumably forwarded to the model as the
# schema description, so it should describe a relevance judgement rather than
# repeat the text used for note information.
class RelevanceScore(BaseModel):
    """Judgement of whether two pieces of text are related."""

    # True/False verdict, or None if the model gives no verdict.
    is_relevant: Optional[bool] = Field(description="Whether two pieces of text are related or not.")
    # Free-text justification for the verdict, or None.
    reason: Optional[str] = Field(description="Reasoning behind the given score.")


# System instruction for the relevance chain.
_relevance_system_message = (
    "You are an expert scoring algorithm. "
    "Rate whether two pieces of text provided to you are related. "
)

# NOTE: this rebinds `prompt` and `runnable`, which earlier cells bound to the
# note-extraction prompt and chain. The single `text` variable has to carry
# both pieces of text being compared.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _relevance_system_message),
        ("human", "{text}"),
    ]
)

# Relevance chain: the prompt feeds a model constrained to return `RelevanceScore`.
_relevance_llm = llm.with_structured_output(schema=RelevanceScore)
runnable = prompt | _relevance_llm

In [None]:
def find_related(note, db, k=None):
    """Return the names of the documents most similar to a note's extract.

    Args:
        note: Object with an ``extract`` attribute used as the search query.
        db: Vector store exposing ``similarity_search(query, ...)`` whose
            results carry a ``"name"`` entry in their ``metadata``.
        k: Optional number of results to request. When None, the store's own
            default is used.

    Returns:
        A list of names in the order returned by the search. The list is empty
        when the note has no extract (None or an empty string), since there is
        nothing to search for.
    """
    query = note.extract
    if not query:
        return []

    # Only forward `k` when the caller asked for it, so the default call is
    # exactly `db.similarity_search(query)`.
    search_kwargs = {} if k is None else {"k": k}
    related_docs = db.similarity_search(query, **search_kwargs)
    return [doc.metadata["name"] for doc in related_docs]