# In [34]:
import re

# Field names that ``is_field`` looks for at the start of a line.
# The order is kept stable because ``is_field`` scans the list front to back.
BIBTEX_FIELDS = [
    # Classic BibTeX fields, alphabetical.
    "address", "annote", "author", "booktitle",
    "chapter", "crossref", "edition", "editor",
    "howpublished", "institution", "journal", "key",
    "month", "note", "number", "organization",
    "pages", "publisher", "school", "series",
    "title", "type", "volume", "year",
    # Additional fields often present in exported bibliographies.
    "abstract", "doi", "file", "isbn", "issn",
    "keywords", "language", "location", "url", "version",
    # E-print / source metadata fields.
    "eprint", "archivePrefix", "primaryClass", "bibsource",
]

# Entry types whose "@<type>{" opener is recognised by ``entry_start``.
BIBTEX_ENTRY_TYPES = [
    "article",
    "book",
    "booklet",
    "conference",
    "inbook",
    "incollection",
    "inproceedings",
    "manual",
    "mastersthesis",
    "misc",
    "phdthesis",
    "proceedings",
    "techreport",
    "unpublished",
]

def entry_start(line, entry_types=None):
    """Return the BibTeX entry type opened on *line*, or ``None``.

    A line opens an entry when it contains ``@<type>{`` for one of the known
    entry types.  The type is matched case-insensitively and whitespace is
    allowed between the type and the opening brace, so ``@Article{key,`` and
    ``@book {key,`` are both recognised.

    Args:
        line: One line of a ``.bib`` file.
        entry_types: Entry type names to look for.  Defaults to
            ``BIBTEX_ENTRY_TYPES``.

    Returns:
        The matching name exactly as spelled in *entry_types* (the first one
        in list order that occurs in *line*), or ``None`` if none occurs.
    """
    if entry_types is None:
        entry_types = BIBTEX_ENTRY_TYPES

    for entry_type in entry_types:
        # The leading "@" keeps "book" from matching inside "@inbook{".
        pattern = rf"@{re.escape(entry_type)}\s*\{{"
        if re.search(pattern, line, re.IGNORECASE):
            return entry_type
    return None

def is_field(line: str, fields=None, *, ignore_case: bool = False):
    """Return the BibTeX field name that *line* assigns, or ``None``.

    Surrounding whitespace on the line is ignored, and any amount of
    whitespace may separate the field name from the equals sign.

    Example matches:
        '    author= {John Doe},'
        'journal  ="Some Journal"'

    Args:
        line: One line of a ``.bib`` file.
        fields: Field names to look for.  Defaults to ``BIBTEX_FIELDS``.
        ignore_case: When true, ``Title =`` and ``TITLE =`` also match
            ``"title"``.  The default stays case-sensitive so that existing
            callers see exactly the same lines classified as fields.

    Returns:
        The matching name exactly as spelled in *fields*, or ``None``.
    """
    if fields is None:
        fields = BIBTEX_FIELDS

    # Strip once instead of once per candidate field.
    candidate = line.strip()
    flags = re.IGNORECASE if ignore_case else 0

    for field_name in fields:
        # Requiring "=" right after the name (modulo whitespace) keeps "key"
        # from matching a "keywords = ..." line.
        if re.match(rf"{re.escape(field_name)}\s*=", candidate, flags):
            return field_name

    return None

# In [35]:
# A one-line field whose value is written as ``name = "...",``.
_QUOTED_VALUE_RE = re.compile(r'^([^=]*=\s*)"(.*)",\s*$', re.DOTALL)
# A value wrapped in both quotes and braces: ``"{...}"`` (optional comma).
_QUOTED_BRACED_RE = re.compile(r'^(\s*)"\{(.*)\}"(,?\s*)$', re.DOTALL)
# A complete one-line author field, braced or quoted, optional comma.
_AUTHOR_RE = re.compile(r'^author\s*=\s*(?:\{(.*)\}|"(.*)"),?$')
# ``{X}`` around one letter, optionally preceded by a LaTeX control word.
_SINGLE_LETTER_GROUP_RE = re.compile(r'(\\[A-Za-z]+)?\{([A-Za-z])\}')


def _braces_balanced(text):
    """Return True when every ``{`` in *text* is closed by a later ``}``."""
    depth = 0
    for char in text:
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            if depth < 0:
                return False
    return depth == 0


def _normalise_title(line):
    """Rewrite the value of a ``title`` line so all-caps words are braced."""
    # Only the value is edited, so the field name itself is never wrapped.
    head, equals, value = line.partition("=")

    # Drop braces around single letters ("{D}eep" -> "Deep"), but keep them
    # when they are the argument of a control word such as \c{c} or
    # \mathbb{R}; removing those would corrupt the LaTeX command.
    value = _SINGLE_LETTER_GROUP_RE.sub(
        lambda m: m.group(0) if m.group(1) else m.group(2), value
    )

    # Collapse "{...}" to {...} only when the braces really are one outer
    # group; otherwise removing the quotes would unbalance the value.
    wrapped = _QUOTED_BRACED_RE.match(value)
    if wrapped and _braces_balanced(wrapped.group(2)):
        value = wrapped.group(1) + "{" + wrapped.group(2) + "}" + wrapped.group(3)

    # Protect capital letters in titles: wrap words made only of capitals,
    # unless a brace already sits directly next to them.
    value = re.sub(r'(?<!\{)(\b[A-Z]{1,}\b)(?!\})', r'{\1}', value)
    return head + equals + value


def _quoted_value_to_braces(line):
    """Turn ``name = "value",`` into ``name = {value},``; else return *line*."""
    # Both delimiters are replaced together, so a line can never end up with
    # an opening quote and a closing brace (or vice versa).
    match = _QUOTED_VALUE_RE.match(line)
    if match is None:
        return line
    return match.group(1) + "{" + match.group(2) + "},\n"


def _limit_authors(stripped, max_authors):
    """Return the output line for a whitespace-normalised ``author`` line."""
    match = _AUTHOR_RE.match(stripped)
    names = None
    if match is not None:
        names = match.group(1) if match.group(1) is not None else match.group(2)

    # Values that continue on the next line, or that are not one complete
    # delimited string, are passed through instead of being sliced blindly.
    if names is None or not _braces_balanced(names):
        return "\t" + stripped + "\n"

    authors = names.split(" and ")
    if max_authors is not None and len(authors) > max_authors:
        names = authors[0] + " and et al."
    # NOTE(review): BibTeX's own spelling for a truncated list is
    # "and others"; the replacement below is kept from the original script.
    names = names.replace('and others', 'and et al.')
    return "\tauthor = {" + names + "},\n"


def _rework_line(line, max_authors):
    """Return the cleaned-up replacement for one input line."""
    # The rewrites below never change the field name, so classify once.
    field_name = is_field(line)

    if field_name == "title":
        line = _normalise_title(line)

    # Wrap field content using brackets
    if field_name is not None:
        line = _quoted_value_to_braces(line)

    # Remove double spacing and tabs
    stripped = re.sub(r'\s+', ' ', line).strip()

    # Remove empty lines
    if not stripped:
        return ""

    # Restrict maximum number of authors
    if field_name == "author":
        return _limit_authors(stripped, max_authors)

    # Add empty line between entries
    if entry_start(stripped) is not None:
        return "\n" + stripped + "\n"

    # Remove spaces or tabs on entry end wrap
    if stripped == "}":
        return "}\n"

    return "\t" + stripped + "\n"


def process_bib_file(input_file, output_file="references_reworked.bib", max_authors=10):
    """Normalise the layout of a BibTeX file and write the result.

    Each line is processed on its own:

    * ``title`` lines get words made only of capital letters wrapped in braces;
    * one-line ``name = "value",`` fields are rewritten as ``name = {value},``;
    * runs of whitespace collapse to one space and lines are tab-indented;
    * blank lines are dropped, and a blank line is put before each entry;
    * a closing ``}`` on its own line is moved to column 0;
    * one-line ``author`` fields with more than *max_authors* names are cut
      to the first author followed by ``and et al.``.

    Field values that span several lines are only whitespace-normalised.

    Args:
        input_file: Path of the UTF-8 ``.bib`` file to read.
        output_file: Path to write.  Defaults to ``references_reworked.bib``
            in the current working directory.
        max_authors: Largest author count left untouched; ``None`` disables
            the limit.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    reworked = [_rework_line(line, max_authors) for line in lines]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(reworked)

# Driver: runs as soon as this cell/module executes. Reads "references.bib"
# relative to the current working directory; process_bib_file writes the
# reworked copy to "references_reworked.bib".
process_bib_file("references.bib")