bib2html.py – Convert a single BibTeX entry into a formatted HTML <li> element.

Usage
-----
    # 1. From a file that contains ONE entry
    python3 bib2html.py entry.bib

    # 2. Directly from the command line (quote the whole entry)
    python3 bib2html.py "@article{Doe2024,...}"

    # 3. Pipe the entry through stdin
    cat entry.bib | python3 bib2html.py

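The output is written to stdout. Example (the spans are emitted on a single line, separated by spaces; they are wrapped here for readability):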

<li>
  <span class="author">Doe, J.; Smith, A.</span>.
  <span class="title">A Great Paper on Something</span>.
  <span class="publisher">Journal of Awesome Research</span>,
  <span class="year">2024</span>.
</li>


In [None]:


import sys
import re
from pathlib import Path
from typing import Dict, Optional

In [None]:

def strip_braces_or_quotes(value: str) -> str:
    """Remove enclosing braces or double quotes and collapse whitespace.

    Delimiters are stripped repeatedly (so ``{{Title}}`` becomes ``Title``),
    but only while they genuinely enclose the *whole* value.  A value such
    as ``{A} and {B}`` starts with ``{`` and ends with ``}``, yet its first
    brace closes before the end, so it is left intact.

    Args:
        value: Raw field value as it appears after the ``=`` sign.

    Returns:
        The value without its enclosing delimiters, with every run of
        whitespace replaced by a single space.
    """

    def _is_enclosed(text: str) -> bool:
        # True when text[0] and text[-1] are one matching delimiter pair.
        if len(text) < 2:
            return False
        if text[0] == '"' and text[-1] == '"':
            depth = 0
            previous = ""
            for ch in text[1:-1]:
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                elif ch == '"' and depth == 0 and previous != "\\":
                    # A bare quote in the middle means the outer quotes do
                    # not pair up with each other (e.g. "a" # "b").
                    return False
                previous = ch
            return True
        if text[0] == "{" and text[-1] == "}":
            depth = 0
            for idx, ch in enumerate(text):
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1
                    if depth == 0:
                        # The opening brace must close on the last character.
                        return idx == len(text) - 1
            return False
        return False

    value = value.strip()
    while _is_enclosed(value):
        value = value[1:-1].strip()
    # Collapse internal whitespace
    return re.sub(r"\s+", " ", value)


def _split_top_level_commas(block: str) -> list[str]:
    """Split *block* on commas that are outside braces and double quotes."""
    pieces: list[str] = []
    depth = 0
    in_quotes = False
    start = 0
    for idx, ch in enumerate(block):
        if ch == "{":
            depth += 1
        elif ch == "}":
            # Clamp at zero so a stray closing brace cannot hide later commas.
            depth = max(depth - 1, 0)
        elif ch == '"' and depth == 0:
            # Quotes nested inside braces (e.g. LaTeX umlaut accents) are
            # literal characters, not string delimiters.
            in_quotes = not in_quotes
        elif ch == "," and depth == 0 and not in_quotes:
            pieces.append(block[start:idx])
            start = idx + 1
    pieces.append(block[start:])
    return pieces


def parse_bibtex(entry: str) -> dict[str, str]:
    """
    Very small BibTeX parser – returns a dict of the entry's fields.
    It works for a *single* entry only.

    Field names are lower-cased; values are passed through
    ``strip_braces_or_quotes``.  Values may contain nested braces and commas
    (inside braces or double quotes) without being split apart.

    Args:
        entry: Text of one BibTeX entry, e.g. ``@article{Key, title = {...}}``.

    Returns:
        Mapping from lower-cased field name to cleaned field value.

    Raises:
        ValueError: If no ``@type{key, ...}`` structure can be found.
    """
    # Remove % comments, but leave LaTeX-escaped percent signs (\%) alone;
    # treating those as comments would swallow the rest of the line,
    # including closing braces.  An unescaped % inside a value is still
    # treated as a comment start.
    entry = re.sub(r"(?<!\\)%.*$", "", entry, flags=re.MULTILINE)
    entry = " ".join(entry.split())  # collapse whitespace

    # Grab the part between the outermost braces, after the citation key
    m = re.search(r"@\w+\s*{\s*[^,]+,\s*(.*)}\s*$", entry, flags=re.DOTALL)
    if not m:
        raise ValueError("Could not locate the field block of the BibTeX entry.")
    fields_block = m.group(1)

    fields: dict[str, str] = {}
    for part in _split_top_level_commas(fields_block):
        key, sep, val = part.partition("=")
        if not sep:
            # No "=" at all – e.g. the empty piece after a trailing comma.
            continue
        fields[key.strip().lower()] = strip_braces_or_quotes(val)
    return fields


def format_authors(author_field: str) -> str:
    """
    Turn a BibTeX author string into “Last, F.; Last2, F.” format.
    Example:  "Doe, Jane and Smith, Alan" → "Doe, J.; Smith, A."

    Names are separated by the word "and" (case-insensitive).  Each name may
    be written "Last, First Middle" or "First Middle Last".  A name with no
    given-name part (e.g. "Plato") is emitted as-is, without a dangling
    ", ." suffix, and empty names are skipped.

    Args:
        author_field: Content of the BibTeX ``author`` field.

    Returns:
        The formatted authors joined by "; ", or "" when there are none.
    """
    names = [n.strip() for n in re.split(r"\s+and\s+", author_field, flags=re.I)]

    def format_one(name: str) -> str:
        # BibTeX can be "Last, First Middle" or "First Middle Last"
        if "," in name:
            last, first = [p.strip() for p in name.split(",", 1)]
        else:
            # Callers only pass non-empty names, so there is at least one token.
            *given, last = name.split()
            first = " ".join(given)
        # Build initials (first letter of each part of the given name)
        initials = " ".join(f"{p[0].upper()}." for p in first.split())
        if not initials:
            return last
        if not last:
            return initials
        return f"{last}, {initials}"

    formatted = [format_one(n) for n in names if n]
    return "; ".join(f for f in formatted if f)


def choose_publisher(fields: dict[str, str]) -> Optional[str]:
    """Pick the venue to display for an entry.

    The candidate fields are tried in the order journal, booktitle,
    publisher, howpublished; the value of the first one present in
    *fields* is returned.  ``None`` is returned when none is present.
    """
    preference = ("journal", "booktitle", "publisher", "howpublished")
    present = (fields[name] for name in preference if name in fields)
    return next(present, None)


def build_html(fields: Dict[str, str]) -> str:
    """Create the final <li>…</li> string.

    Emits, in this order and only when available: the formatted authors,
    the title, the venue chosen by ``choose_publisher`` and the year, each
    wrapped in a ``<span>`` with a matching class.  The spans are joined by
    single spaces on one line inside the ``<li>``.

    Field values are HTML-escaped (``&``, ``<``, ``>``) so that text such
    as "Research & Development" or a stray "<" cannot break or inject
    markup.

    Args:
        fields: Mapping of lower-cased BibTeX field names to cleaned values.

    Returns:
        The HTML list item as a string.
    """
    # Imported locally so the function does not depend on a module-level import.
    from html import escape

    def span(css_class: str, text: str, trailer: str) -> str:
        # quote=False: the text is element content, not an attribute value,
        # so apostrophes and quotes can stay readable.
        return f'<span class="{css_class}">{escape(text, quote=False)}</span>{trailer}'

    parts = []

    # Author
    if "author" in fields:
        parts.append(span("author", format_authors(fields["author"]), "."))

    # Title
    if "title" in fields:
        parts.append(span("title", fields["title"], "."))

    # Publisher / journal / booktitle
    pub = choose_publisher(fields)
    if pub:
        parts.append(span("publisher", pub, ","))

    # Year
    if "year" in fields:
        parts.append(span("year", fields["year"], "."))

    # Join with spaces and wrap in <li>
    inner = " ".join(parts)
    return f"<li>\n  {inner}\n</li>"


# ----------------------------------------------------------------------
# Main driver
# ----------------------------------------------------------------------
# def main() -> None:
#     raw = read_input()
#     try:
#         fields = parse_bibtex(raw)
#     except Exception as exc:
#         sys.stderr.write(f"Error parsing BibTeX entry: {exc}\n")
#         sys.exit(1)

#     html = build_html(fields)
#     print(html)


# if __name__ == "__main__":
#     main()

NameError: name 'Dict' is not defined

In [None]:
# Sample @InProceedings entry used as input for parse_bibtex below.
# Besides braced values it contains an unbraced macro value (month = aug)
# and a trailing comma after the last field.
entry = """
@InProceedings{Slowig2025,
  author    = {Slowig, Benjamin and Schröder, Max and Biskup, Till and Bruhn, Karen and Fortmann-Grote, Carsten and Gloede, Carolin and Krüger, Frank and Matela, Julia and Paul-Stüve, Thilo and Schmitt, Fabian and Waltemath, Dagmar and Wohlers, Inken and Yordanova, Kristina},
  booktitle = {Proceedings of the 2nd Conference on Research Data Infrastructure},
  title     = {Let's bring the communities together: The approach of the federal state initiatives for RDM in Schleswig-Holstein and Mecklenburg-Vorpommern},
  year      = {2025},
  address   = {Aachen, Germany},
  month     = aug,
  publisher = {Zenodo},
  copyright = {Creative Commons Attribution 4.0 International},
  doi       = {10.5281/ZENODO.16735903},
}
"""

In [None]:
# Parse the sample entry into a dict keyed by lower-cased field name.
# The result is not assigned; presumably this relies on the notebook
# displaying the value of the cell's last expression.
parse_bibtex(entry)