In [13]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install scikit-learn
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m

In [6]:
from dataclasses import dataclass
import subprocess
import markdown
import json
import os
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple, Set, Callable
import shutil
import logging
from datetime import datetime, timedelta
from urllib.parse import quote
import re
from collections import defaultdict

In [14]:
@dataclass
class Page:
    """One markdown page together with the metadata recorded for it.

    Attributes:
        title: Display title of the page.
        path: Location of the source file (MetadataIndex stores it relative
            to its input directory).
        content: Raw text of the file.
        modified_date: Last-modified timestamp of the file.
        category: Category name, or None when the page has none.
        tags: Tag names attached to the page.
        description: Short description, or None when the page has none.
        is_index: True for index pages (MetadataIndex sets this for files
            whose stem is "index").
        css_classes: CSS class names for rendering, or None when unset.
    """

    title: str
    path: Path
    content: str
    modified_date: datetime
    category: Optional[str]
    tags: List[str]
    description: Optional[str]
    is_index: bool = False
    # The default really is None, so the annotation must be Optional.
    css_classes: Optional[List[str]] = None


class MetadataIndex:
    """Index of the markdown pages found under a directory.

    Pages are stored by file path in ``pages`` and cross-referenced by
    category in ``categories`` and by tag in ``tags``.
    """

    MARKDOWN_EXTENSIONS = [
        "meta",
        "toc",
        "fenced_code",
        "tables",
        "attr_list",
        "footnotes",
        "def_list",
        "admonition",
        "mdx_truly_sane_lists",
    ]
    SUPPORTED_CONTENT = {".md", ".markdown"}
    IGNORED_DIRECTORIES = {
        ".git",
        "__pycache__",
        "node_modules",
        ".github",
        "nlp.venv",
        "site",
        "venv",
        ".venv",
    }
    DEFAULT_CSS_CLASSES = ["markdown-content", "content"]
    DEFAULT_PERMISSIONS = {"directory": 0o755, "file": 0o644}

    def __init__(self, input_dir: str):
        """Create an empty index rooted at ``input_dir``; nothing is read yet."""
        self.input_dir = Path(input_dir)
        self.pages: Dict[Path, Page] = {}
        self.categories: Dict[str, List[Page]] = defaultdict(list)
        self.tags: Dict[str, List[Page]] = defaultdict(list)
        self.markdown_converter = markdown.Markdown(
            extensions=self.MARKDOWN_EXTENSIONS,
            output_format="html5",
            tab_length=4,
        )

    def _walk_directory(self, directory: Path) -> List[Path]:
        """Return every path (files and directories) below ``directory`` that
        is not inside one of ``IGNORED_DIRECTORIES``.

        Only the part of each path *below* ``directory`` is tested, so a root
        that itself lives under a folder called e.g. ``site`` or ``venv`` is
        still walked.
        """
        return [
            item
            for item in directory.rglob("*")
            if self.IGNORED_DIRECTORIES.isdisjoint(item.relative_to(directory).parts)
        ]

    def _extract_metadata(self, file_path: Path, content: str) -> dict:
        """Parse the metadata header of ``content``.

        Returns a dict with the keys ``title``, ``category``, ``tags`` and
        ``description``. The title falls back to a name derived from the file
        stem; ``category`` and ``description`` fall back to None and ``tags``
        to an empty list.
        """
        # A fresh converter is built per call, so no parser state is shared
        # between files.
        md = markdown.Markdown(extensions=self.MARKDOWN_EXTENSIONS)
        md.convert(content)

        default_title = file_path.stem.replace("-", " ").title()
        meta = getattr(md, "Meta", None) or {}

        def first_value(key, default=None):
            values = meta.get(key)
            return values[0] if values else default

        # The meta extension stores each value as a list of lines; accept tags
        # on every line, comma-separated or one per line.
        tags = [
            tag.strip()
            for line in meta.get("tags", [])
            for tag in line.split(",")
            if tag.strip()
        ]

        return {
            "title": first_value("title", default_title),
            "category": first_value("category"),
            "tags": tags,
            "description": first_value("description"),
        }

    def _process_markdown(self, file_path: Path) -> None:
        """Read one markdown file and register it in ``pages``, ``categories``
        and ``tags``.

        Failures are logged and the file is skipped, so one bad file does not
        abort a whole indexing run.
        """
        try:
            content = file_path.read_text(encoding="utf-8")
            metadata = self._extract_metadata(file_path, content)

            relative_path = file_path.relative_to(self.input_dir)
            is_index = file_path.stem.lower() == "index"
            tags = [tag.strip().lower() for tag in metadata["tags"] if tag.strip()]

            page = Page(
                title=metadata["title"],
                path=relative_path,
                content=content,
                modified_date=datetime.fromtimestamp(file_path.stat().st_mtime),
                category=metadata["category"],
                tags=tags,
                description=metadata["description"],
                is_index=is_index,
                # Copy, so editing one page's classes cannot alter the class-level default.
                css_classes=list(self.DEFAULT_CSS_CLASSES),
            )
        except Exception as e:
            # No module-level ``logger`` is defined in this notebook; fetch one here.
            logging.getLogger(__name__).error(
                f"Failed to process {file_path}: {str(e)}"
            )
            return

        self.pages[file_path] = page
        self.categories[page.category].append(page)
        # Index the page under each of its tags (once per distinct tag).
        for tag in dict.fromkeys(page.tags):
            self.tags[tag].append(page)

    def _index_all_markdown_files(self) -> None:
        """
        Walk through the input directory and process all markdown files
        """
        print(f"Indexing markdown files in {self.input_dir.resolve()}...")
        for file_path in self._walk_directory(self.input_dir):
            # Skip directories that merely carry a markdown-looking suffix.
            if (
                file_path.is_file()
                and file_path.suffix.lower() in self.SUPPORTED_CONTENT
            ):
                self._process_markdown(file_path)

    def _get_all_tags(self) -> Set[str]:
        """
        Get all unique tags from the indexed pages
        """
        return set().union(*(page.tags for page in self.pages.values()))

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string


# Fetch the NLTK resources used by preprocess_text: the stop-word corpus and WordNet.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)


# Metadata header keys that should not count as content words.
_METADATA_KEYWORDS = frozenset({"title", "category", "tags", "description"})
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Lazily filled cache so NLTK resources are loaded once, not once per document.
_PREPROCESS_DEFAULTS = {}


def _default_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if "stop_words" not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS["stop_words"] = frozenset(stopwords.words("english"))
    return _PREPROCESS_DEFAULTS["stop_words"]


def _default_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it on first use only."""
    if "lemmatizer" not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_DEFAULTS["lemmatizer"]


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalise raw page text into a space-separated string of lemmas.

    Steps: lowercase, strip ASCII punctuation, strip digits, split on
    whitespace, drop stop words, tokens of two characters or fewer and the
    metadata keys (title/category/tags/description), then lemmatize.

    Args:
        text: Raw document text. Empty or None yields "".
        stop_words: Optional container of words to drop; defaults to NLTK's
            English stop words.
        lemmatizer: Optional object with a ``lemmatize(token)`` method;
            defaults to a WordNetLemmatizer.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not text:
        return ""
    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()

    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    cleaned = re.sub(r"\d+", "", cleaned)

    # Metadata keys are removed as whole tokens; a substring replace would
    # corrupt words that merely contain them ("subtitle" -> "sub").
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if len(token) > 2
        and token not in stop_words
        and token not in _METADATA_KEYWORDS
    )

[nltk_data] Downloading package stopwords to /Users/emm12/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/emm12/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
# Index every markdown file below the parent directory, then gather the text
# of each page that has any content.
page_idx = MetadataIndex("..")
page_idx._index_all_markdown_files()
print(f"Indexed {len(page_idx.pages)} markdown files.")

documents = []
for page in page_idx.pages.values():
    if page.content:
        documents.append(page.content)

Indexing markdown files in /Users/emm12/repos/notes...
Indexed 171 markdown files.


In [66]:

# Vectoriser settings. A float max_df is a proportion of documents, so only
# terms found in at least MIN_DOC_FREQ documents and at most MAX_DOC_FREQ of
# the corpus are kept.
MAX_FEATURES = 1000
MIN_DOC_FREQ = 2
MAX_DOC_FREQ = 0.02
TOP_LINK_WORDS = 100  # how many of the most widely shared terms to keep

processed_docs = [preprocess_text(doc) for doc in documents]

vectorizer = TfidfVectorizer(
    max_features=MAX_FEATURES, min_df=MIN_DOC_FREQ, max_df=MAX_DOC_FREQ
)
tfidf_matrix = vectorizer.fit_transform(processed_docs)

feature_names = vectorizer.get_feature_names_out()

# Boolean term-presence matrix: the single source for both the per-term
# document count and the list of documents containing the term.
term_presence = tfidf_matrix > 0
doc_word_counts = term_presence.sum(axis=0)
term_doc_freq = np.asarray(doc_word_counts).ravel()

multi_doc_words = [
    (word, freq) for word, freq in zip(feature_names, term_doc_freq) if freq > 1
]

# Stable sort: ties keep the vectoriser's feature order.
multi_doc_words.sort(key=lambda pair: pair[1], reverse=True)

# Column-oriented copy makes per-term lookups cheap.
presence_by_term = term_presence.tocsc()

word_links = {}
for word, count in multi_doc_words[:TOP_LINK_WORDS]:
    # Read the documents straight from the matrix instead of re-splitting every
    # document: the vectoriser's tokenisation can differ from str.split(),
    # which would make "documents" disagree with "count".
    column = vectorizer.vocabulary_[word]
    docs_with_word = sorted(
        int(row) for row in presence_by_term[:, column].nonzero()[0]
    )
    word_links[word] = {"count": int(count), "documents": docs_with_word}

assert all(len(link["documents"]) == link["count"] for link in word_links.values())

print(f"Found {len(word_links)} interesting words appearing in multiple documents")

Found 100 interesting words appearing in multiple documents
