diff --git a/README.md b/README.md new file mode 100644 index 0000000..5e5b81e --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# LXX Morphology Downloader + +This repository contains a small utility for downloading the LXX Morphology corpus from the UPenn gopher mirror. + +## Usage + +```bash +python scripts/download_lxxmorph.py --output-dir lxxmorph +``` + +By default, it pulls from: + +``` +https://ccat.sas.upenn.edu/gopher/text/religion/biblical/lxxmorph/ +``` + +The downloader defaults to fetching only `*.mlxx` files so the local corpus stays limited to the MLXX data. +You can override the base URL with `--base-url` or adjust the filename filter with `--include` if the mirror +changes or you need additional file types. + +## Chat question template + +Use the template below when you want an analysis run based on the local `*.mlxx` files and the +`MT-LXX_Parallel_Manual (1).pdf` instructions: + +``` +Please analyze using the local .mlxx files. Follow the MT-LXX_Parallel_Manual (1).pdf +step-by-step and return rows in this format: + +Reference,Hebrew (MT),Greek (LXX),CATSS Notation & Analysis +``` diff --git a/scripts/download_lxxmorph.py b/scripts/download_lxxmorph.py new file mode 100644 index 0000000..cb2716c --- /dev/null +++ b/scripts/download_lxxmorph.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Download the LXX Morphology corpus directory from UPenn's gopher mirror.""" +from __future__ import annotations + +import argparse +import html.parser +import os +import pathlib +import re +import urllib.parse +import urllib.request + +DEFAULT_BASE_URL = "https://ccat.sas.upenn.edu/gopher/text/religion/biblical/lxxmorph/" + + +class LinkParser(html.parser.HTMLParser): + def __init__(self) -> None: + super().__init__() + self.links: list[str] = [] + + def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: + if tag.lower() != "a": + return + for key, value in attrs: + if key.lower() == "href" and value: + self.links.append(value) + + +def fetch_bytes(url: str, user_agent: str | None = None) -> bytes: + request = urllib.request.Request(url) + if user_agent: + request.add_header("User-Agent", user_agent) + with urllib.request.urlopen(request) as response: + return response.read() + + +def parse_links(content: bytes, base_url: str) -> list[str]: + text = content.decode("utf-8", errors="replace") + if " None: + path.mkdir(parents=True, exist_ok=True) + + +def save_file(url: str, output_dir: pathlib.Path, allow_pattern: re.Pattern[str] | None, user_agent: str | None) -> None: + parsed = urllib.parse.urlparse(url) + filename = os.path.basename(parsed.path) + if not filename: + return + if allow_pattern and not allow_pattern.search(filename): + return + target_path = output_dir / filename + if target_path.exists(): + return + data = fetch_bytes(url, user_agent=user_agent) + target_path.write_bytes(data) + + +def download_recursive( + base_url: str, + output_dir: pathlib.Path, + visited: set[str], + allow_pattern: re.Pattern[str] | None, + user_agent: str | None, +) -> None: + if base_url in visited: + return + visited.add(base_url) + + content = fetch_bytes(base_url, user_agent=user_agent) + links = parse_links(content, base_url) + for link in links: + if link.endswith("/"): + subdir_name = os.path.basename(os.path.normpath(urllib.parse.urlparse(link).path)) + subdir_path = output_dir / subdir_name + ensure_directory(subdir_path) + download_recursive(link, subdir_path, visited, allow_pattern, user_agent) + else: + save_file(link, output_dir, allow_pattern, user_agent) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Download files from the LXX Morphology directory.") + parser.add_argument( + "--base-url", + default=DEFAULT_BASE_URL, + help="Base URL for the LXX Morphology directory listing.", + ) + parser.add_argument( + "--include", + default="\\.mlxx$", + help="Regex pattern of filenames to download (default: only .mlxx files).", + ) + parser.add_argument( + "--output-dir", + default="lxxmorph", + help="Directory to store downloaded files.", + ) + parser.add_argument( + "--user-agent", + default="Mozilla/5.0 (compatible; LXXMorphDownloader/1.0)", + help="User-Agent header to send with requests.", + ) + args = parser.parse_args() + + output_dir = pathlib.Path(args.output_dir) + ensure_directory(output_dir) + allow_pattern = re.compile(args.include) if args.include else None + download_recursive(args.base_url, output_dir, set(), allow_pattern, args.user_agent) + + +if __name__ == "__main__": + main()