In [None]:
import re
import time
import json
import requests
from bs4 import BeautifulSoup, NavigableString

# Root of the site; relative letter hrefs are appended to this.
BASE_URL = "https://vangoghletters.org"

# Advanced-search results page, assembled one query parameter per line so the
# individual filters (date range, correspondent ids, id range, ...) are easy
# to read and change.
SEARCH_URL = BASE_URL + "/vg/search/advanced?" + "&".join(
    (
        "originaltext=original",
        "translation=translation",
        "annotations=notes",
        "from=1",
        "date_from=1872-09-29",
        "date_until=1890-07-31",
        "order=date",
        "correspondent_id=4+15+18",
        "id_range=1-902",
        "id_type=jlb_id",
    )
)

### ───────────────────────── helpers ────────────────────────── ###

def get_search_links(timeout=30):
    """Return the letter URLs listed on the advanced-search results page.

    The query in SEARCH_URL is intended to select the letters written by
    Van Gogh (and not the ones addressed to him).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of absolute letter URLs (BASE_URL + href), in page order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    resp = requests.get(SEARCH_URL, timeout=timeout)
    resp.raise_for_status()
    # Hand over the raw bytes so BeautifulSoup detects the encoding itself,
    # the same way fetch_letter_soup parses letter pages.
    soup = BeautifulSoup(resp.content, "html.parser")

    hrefs = (a.get("href", "") for a in soup.select("#searchresult ol li a"))
    # Only result entries that point at a letter page are kept.
    return [BASE_URL + href for href in hrefs if href.startswith("/vg/letters/let")]

def fetch_letter_soup(url, timeout=30):
    """Download a letter page and return it parsed as BeautifulSoup.

    Args:
        url: Absolute URL of the letter page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        The page parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # Raw bytes are passed so BeautifulSoup performs the encoding detection.
    return BeautifulSoup(resp.content, "html.parser")

def extract_letter_texts(soup):
    """Collect the text of every letterbox, keeping inline [n] note anchors.

    Returns one string per ``div.letterbox`` that contains text; within a
    string, paragraphs (``div.p``) are separated by newlines and the pieces of
    a paragraph by single spaces.
    """

    def render(node):
        # Bare text between tags.
        if isinstance(node, NavigableString):
            return node.strip()
        # Note anchor: keep it visible in the text as "[n]".
        if node.name == "span" and "anchor" in node.get("class", []):
            return f"[{node.get_text(strip=True)}]"
        # Any other inline element (<i>, <b>, etc.): flatten to its text.
        return node.get_text(" ", strip=True)

    letters = []
    for box in soup.find_all("div", class_="letterbox"):
        paragraphs = []
        for para in box.find_all("div", class_="p"):
            fragments = [piece for piece in map(render, para.children) if piece]
            if fragments:
                paragraphs.append(" ".join(fragments))
        if paragraphs:
            letters.append("\n".join(paragraphs))
    return letters

def extract_notes(soup):
    """Return ``{note-num: note-text}`` from the notes section.

    Each ``div.notediv`` contributes one entry, keyed by the text of its
    ``span.notenum`` (e.g. ``"1."`` or ``"a."``). The value is the note's
    text with that leading number removed.

    A ``notediv`` without a ``notenum`` span is skipped: it has no key to be
    stored under, and failing on it would abort extraction for the whole page.
    """
    notes = {}
    for note_div in soup.find_all("div", class_="notediv"):
        num_span = note_div.find("span", class_="notenum")
        if num_span is None:
            continue
        num = num_span.get_text(strip=True)
        body = note_div.get_text(" ", strip=True)
        # Strip the leading "num " only when it is really there, so a note
        # whose number is not the first thing in the div keeps its full text.
        if body.startswith(num):
            body = body[len(num):]
        notes[num] = body.strip()
    return notes

### ───────────────────────── main scraper ───────────────────── ###

def _preview_text(texts):
    """Return a one-line preview (at most 50 chars plus "...") of the first text."""
    if not texts:
        return "(no text)"
    clean = texts[0].replace("\n", " ")
    return (clean[:50] + "...") if len(clean) > 50 else clean


def _preview_notes(notes):
    """Return a " | "-joined preview of the first five notes (30 chars each)."""
    first_five = list(notes.items())[:5]
    if not first_five:
        return "(no notes)"
    return " | ".join(
        f"[{n}] {t[:30]}{'...' if len(t) > 30 else ''}"
        for n, t in first_five
    )


def scrape_from_search(delay=0.25, dump_file=None):
    """Scrape every letter found by the advanced search.

    For each letter URL the page is downloaded, its texts and notes are
    extracted, and a one-line preview is printed. A letter that fails for any
    reason is reported and skipped so one bad page does not stop the run.

    Args:
        delay: Seconds to sleep after each letter, to go easy on the server.
        dump_file: Optional path; when given, the result is also written there
            as JSON in the form ``{"letters": [...]}``.

    Returns:
        A list of dicts with the keys ``"id"``, ``"url"``, ``"texts"`` and
        ``"notes"``, one per successfully scraped letter. (Previously nothing
        was returned, so the data was lost unless ``dump_file`` was set.)
    """
    links = get_search_links()
    print(f"Found {len(links)} letters in the search results.\n")

    scraped = []

    for idx, url in enumerate(links, 1):
        try:
            soup = fetch_letter_soup(url)
            texts = extract_letter_texts(soup)
            notes = extract_notes(soup)

            # Letter ID like "let193"; fall back to a positional id when the
            # URL does not contain one.
            m = re.search(r"(let\d+)", url)
            lid = m.group(1) if m else f"url{idx:03d}"

            preview_text = _preview_text(texts)
            notes_preview = _preview_notes(notes)
            print(f"[{idx:03d}] {lid} → \"{preview_text}\"; notes: {notes_preview}")

            scraped.append(
                {
                    "id": lid,
                    "url": url,
                    "texts": texts,
                    "notes": notes,
                }
            )

        except Exception as exc:
            # Deliberate best-effort: report the failure and move on to the
            # next letter instead of aborting the whole scrape.
            print(f"[{idx:03d}] ERROR for {url} → {exc}")

        time.sleep(delay)  # be kind to the server

    if dump_file:
        with open(dump_file, "w", encoding="utf-8") as f:
            json.dump({"letters": scraped}, f, ensure_ascii=False, indent=2)
        print(f"\nSaved full JSON to {dump_file}")

    return scraped

if __name__ == "__main__":
    # Run the full scrape and store the result as JSON next to the notebook.
    scrape_from_search(
        delay=0.25,
        dump_file="vangogh_filtered_letters.json",
    )


Found 658 letters in the search results.

[001] let001 → "The Hague, 29 September 1872. My dear Theo, Thanks..."; notes: [a.] Expression meaning ‘to make th... | [1.] Theo attended secondary school... | [2.] The trotting races took place ... | [3.] The programme included a displ... | [4.] The family of Carl Adolph Haan...
[002] let002 → "The Hague, 13 December 1872. My dear Theo, That wa..."; notes: [1.] The news was that on 1 January... | [2.] Van Gogh worked as an assistan...
[003] let003 → "The Hague, January 1873 My dear Theo, I heard from..."; notes: [1.] Theo boarded with the family o... | [2.] Tobias Victor Schmidt was Theo... | [3.] Vincent sent 25 guilders of th... | [4.] Emile Louis Vernier made a gre... | [5.] This portrait photograph was t...
[004] let004 → "The Hague, 28 Jan. 1873 My dear Theo, It’s good th..."; notes: [1.] Poor health had forced Uncle H... | [2.] Cornelis (Cor) Marinus van Gog... | [3.] The Rijksmuseum was housed at ... | [4.] It is not known which painti

In [55]:
from dataclasses import dataclass
from typing import List, Dict, Any

@dataclass
class Letter:
    """One scraped letter, mirroring an entry of the JSON written by scrape_from_search(...)."""
    # Letter identifier taken from the URL (e.g. "let001"); the scraper falls
    # back to a positional "urlNNN" id when the URL contains none.
    id: str
    # URL of the letter page the data was scraped from.
    url: str
    # One string per letterbox found on the page; paragraphs inside a string
    # are separated by newlines.
    texts: List[str]
    # Note text keyed by note number as it appears on the page (e.g. "1.", "a.").
    notes: Dict[str, str]

def load_letters(json_path: str) -> List[Letter]:
    """
    Read a JSON dump written by scrape_from_search(...) and turn each entry
    under its "letters" key into a Letter object.

    "id" and "url" are required in every entry; missing "texts" / "notes"
    default to an empty list / dict. A file without a "letters" key yields
    an empty list.
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    return [
        Letter(
            id=record["id"],
            url=record["url"],
            texts=record.get("texts", []),
            notes=record.get("notes", {}),
        )
        for record in payload.get("letters", [])
    ]

if __name__ == "__main__":
    # Example usage: load the scraped dump and print a few sanity statistics.
    letters = load_letters("vangogh_filtered_letters.json")
    print(f"Loaded {len(letters)} letters.")

    # Inspect the first letter. Guarded so an empty dump (e.g. after a scrape
    # in which every letter failed) does not raise IndexError.
    if letters:
        first = letters[0]
        print(f"ID: {first.id}\nURL: {first.url}")
        # A letter can have no text at all; fall back to a placeholder.
        preview = first.texts[0][:100] if first.texts else "(no text)"
        print("Preview text:", preview, "...")
        print("First notes:")
        for num, text in list(first.notes.items())[:5]:
            print(f"  [{num}] {text}")
        # Word count of the first letter only.
        print("Total word count:", sum(len(t.split()) for t in first.texts))

    # Word count summed over every text of every letter.
    print(
        "Total word count of all letters:",
        sum(len(t.split()) for letter in letters for t in letter.texts),
    )
    # Number of notes summed over all letters.
    print("Total number of notes:", sum(len(letter.notes) for letter in letters))

Loaded 658 letters.
ID: let001
URL: https://vangoghletters.org/vg/letters/let001/letter.html
Preview text: The Hague, 29 September 1872.
My dear Theo,
Thanks for your letter, I was glad to hear that you got  ...
First notes:
  [a.] Expression meaning ‘to make the most of an opportunity’. In the context of this letter, it could also be meant literally.
  [1.] Theo attended secondary school in Oisterwijk in the province of North Brabant. He walked the 6 km to school from his parents ’ house in Helvoirt. The fact that Vincent assumes Theo must have felt ‘anxious’ during these long walks must have something to do with the stormy autumn weather: they were having at the time, which included frequent showers, strong winds and occasional thunderstorms ( KNMI ).
  [2.] The trotting races took place on Saturday, 28 September at 11.00 in the Haagse Bos, during the Nationale- en Internationale tentoonstelling (National and International Exhibition) held on the Malieveld from 21-30 September 1872 o