In [6]:
import re

In [7]:
def parse_nomenclator(filepath):
    """
    Parse a botanical nomenclator file into a standardized dictionary.

    Returns a dict where each name (accepted or synonym) maps to:
    {
        "species": str,               # e.g., 'Perityle canescens'
        "authors": str,               # e.g., 'Everly'
        "accepted_name": str,         # accepted species name
        "accepted_authors": str,      # authorship of the accepted name
        "relationship": "accepted" | "synonym"
    }
    """
    species_dict = {}
    accepted_species = None
    accepted_authors = None

    def split_species_and_authors(name_str):
        parts = name_str.strip().split()
        species = " ".join(parts[:2])
        authors = " ".join(parts[2:]) if len(parts) > 2 else ""
        return species, authors

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip()
            if not line.strip():
                continue

            indent = len(line) - len(line.lstrip())
            stripped = line.strip()
            is_synonym_line = stripped.startswith(('=', '+'))

            # Remove leading =/+ markers
            clean = stripped.lstrip('=+').strip()

            # Case: synonym explicitly mapped to accepted name
            if '=' in clean and is_synonym_line:
                parts = [p.strip() for p in re.split(r'\s*=\s*', clean)]
                if len(parts) == 2:
                    syn_full, acc_full = parts
                    syn_species, syn_authors = split_species_and_authors(syn_full)
                    acc_species, acc_authors = split_species_and_authors(acc_full)

                    # Store synonym entry
                    species_dict[syn_species] = {
                        "species": syn_species,
                        "authors": syn_authors,
                        "accepted_name": acc_species,
                        "accepted_authors": acc_authors,
                        "relationship": "synonym"
                    }

                    # Ensure accepted name also recorded
                    if acc_species not in species_dict:
                        species_dict[acc_species] = {
                            "species": acc_species,
                            "authors": acc_authors,
                            "accepted_name": acc_species,
                            "accepted_authors": acc_authors,
                            "relationship": "accepted"
                        }

                continue

            # Case: synonym indented under last accepted name
            if indent > 0 and accepted_species:
                syn_species, syn_authors = split_species_and_authors(clean)
                species_dict[syn_species] = {
                    "species": syn_species,
                    "authors": syn_authors,
                    "accepted_name": accepted_species,
                    "accepted_authors": accepted_authors,
                    "relationship": "synonym"
                }
                continue

            # Case: new accepted name (no =/+ at start)
            if not is_synonym_line:
                accepted_species, accepted_authors = split_species_and_authors(clean)
                species_dict[accepted_species] = {
                    "species": accepted_species,
                    "authors": accepted_authors,
                    "accepted_name": accepted_species,
                    "accepted_authors": accepted_authors,
                    "relationship": "accepted"
                }

            # Case: synonym with no mapped accepted name (e.g., standalone "+Name Author")
            elif is_synonym_line and accepted_species:
                syn_species, syn_authors = split_species_and_authors(clean)
                species_dict[syn_species] = {
                    "species": syn_species,
                    "authors": syn_authors,
                    "accepted_name": accepted_species,
                    "accepted_authors": accepted_authors,
                    "relationship": "synonym"
                }

    return species_dict


In [8]:
nomenclator_filepath = '../data/nomenclator.txt'
nomenclator = parse_nomenclator(nomenclator_filepath)


In [9]:
nomenclator

{'Alomia tenuifolia': {'species': 'Alomia tenuifolia',
  'authors': '(Phil.) Benth. & Hook. ex Reiche',
  'accepted_name': 'Perityle tenuifolius',
  'accepted_authors': '(Phil.) I.H. Lichter-Marck',
  'relationship': 'synonym'},
 'Perityle tenuifolius': {'species': 'Perityle tenuifolius',
  'authors': '(Phil.) I.H. Lichter-Marck',
  'accepted_name': 'Perityle tenuifolius',
  'accepted_authors': '(Phil.) I.H. Lichter-Marck',
  'relationship': 'accepted'},
 'Amauria brandegeeana': {'species': 'Amauria brandegeeana',
  'authors': '(Rose) Rydb.',
  'accepted_name': 'Perityle brandegeeana',
  'accepted_authors': 'Rose',
  'relationship': 'synonym'},
 'Perityle brandegeeana': {'species': 'Perityle brandegeeana',
  'authors': 'Rose',
  'accepted_name': 'Perityle brandegeeana',
  'accepted_authors': 'Rose',
  'relationship': 'accepted'},
 'Amauria carterae': {'species': 'Amauria carterae',
  'authors': 'A.M. Powell',
  'accepted_name': 'Perityle carterae',
  'accepted_authors': '(A.M. Powell) 

In [10]:
# Lookup an accepted name
nomenclator['Galinsogeopsis canescens']

{'species': 'Galinsogeopsis canescens',
 'authors': '(Everly) I.H. Lichter-Marck',
 'accepted_name': 'Galinsogeopsis canescens',
 'accepted_authors': '(Everly) I.H. Lichter-Marck',
 'relationship': 'accepted'}

In [11]:
# Lookup a synonym
nomenclator['Perityle canescens']

{'species': 'Perityle canescens',
 'authors': 'Everly',
 'accepted_name': 'Galinsogeopsis canescens',
 'accepted_authors': '(Everly) I.H. Lichter-Marck',
 'relationship': 'synonym'}

In [8]:
import re
from collections import defaultdict

def parse_nomenclator_nested(text):
    nested_dict = defaultdict(lambda: {"synonyms": [], "tentative": []})
    current_accepted = None

    lines = text.strip().splitlines()

    for line in lines:
        original = line
        line = line.rstrip().lstrip()
        if not line:
            continue

        indent = len(original) - len(original.lstrip())

        # Split by symbols while keeping them
        tokens = re.findall(r'[=+]?[^=+\n]+', line)

        for token in tokens:
            token = token.strip().rstrip(',')
            if not token:
                continue

            if token.startswith('='):
                name = token[1:].strip()
                if current_accepted:
                    nested_dict[current_accepted]["synonyms"].append(name)
            elif token.startswith('+'):
                name = token[1:].strip()
                if current_accepted:
                    nested_dict[current_accepted]["tentative"].append(name)
            else:
                # This is likely a new accepted name
                current_accepted = token
                if current_accepted not in nested_dict:
                    nested_dict[current_accepted] = {"synonyms": [], "tentative": []}

    return nested_dict


In [9]:
filepath = '../data/nomenclator.txt'

with open(filepath, 'r', encoding='utf-8') as f:
    text = f.read()

In [10]:
mapping = parse_nomenclator(text)

In [11]:
mapping

{'Alomia tenuifolia (Phil.) Benth. & Hook. ex Reiche': 'Perityle tenuifolius (Phil.) I.H. Lichter-Marck',
 'Amauria brandegeeana (Rose) Rydb.': 'Perityle brandegeeana Rose',
 'Amauria carterae A.M. Powell': 'Perityle carterae (A.M. Powell) I.H. Lichter-Marck',
 'Amauria rotundifolia Benth.': 'Perityle rotundifolia (Benth.) T.S. Brandegee',
 'Chlamysperma arenarioides Hook. & Arn.': 'galeana pratensis (Kunth) Rydb.',
 'Closia anthemoides Phil.': 'Perityle emoryi Torr.',
 'Closia cotula J. Remy': 'Perityle emoryi Torr.',
 'Closia elata Phil.': 'Perityle emoryi Torr.',
 'Closia foliosa Phil.': 'Perityle emoryi Torr.',
 'Closia viridis Phil.': 'Perityle emoryi Torr.',
 'Corellia montana A.M. Powell': 'Galinsogeopsis montana (A.M. Powell) I.H. Lichter-Marck',
 'Galeana hastata La Llave': 'galeana pratensis (Kunth) Rydb',
 '': 'Perityle rotundata (Rydb.) Shinners',
 'Galinsogeopsis pennellii (B.L. Turner) I.H. Lichter-Marck': 'Galinsogeopsis hofmeisteria (Rydb.) I.H. Lichter-Marck',
 'Perity