<a href="https://colab.research.google.com/github/eliconger/eliconger/blob/main/LatinTaxonParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Eli Conger, Research for Reptile Taxonomy
# Program to find Latin words within compound taxon names
# using a dictionary of known Latin words and a Trie for fast substring lookup

# Importing necessary modules
from collections import defaultdict
import re
import pandas as pd  # Import pandas for reading Excel files
import numpy as np   # To handle NaN values robustly

# Define a class for the Trie Node
class TrieNode:
    """One node of the Trie.

    A node records which characters can follow the prefix it represents
    and whether that prefix is itself a complete inserted word.
    """

    def __init__(self):
        # Becomes True once an inserted word terminates at this node.
        self.is_end_of_word = False
        # Maps a single character to the TrieNode reached by that character.
        self.children = {}

# Define a class for the Trie structure
class Trie:
    """Prefix tree of words, used to locate known words inside a longer string."""

    def __init__(self):
        self.root = TrieNode()  # Empty root; represents the empty prefix

    def insert(self, word):
        """Add ``word`` to the trie, creating nodes for unseen characters."""
        current = self.root
        for letter in word:
            child = current.children.get(letter)
            if child is None:
                # First time this character follows the current prefix.
                child = current.children[letter] = TrieNode()
            current = child
        current.is_end_of_word = True  # The path root -> current spells ``word``

    def search_substrings(self, text):
        """Return the longest stored word starting at each position of ``text``.

        For every start index, the trie is walked as far as the characters
        of ``text`` allow; the last word end seen on that walk is kept.

        Returns a list of ``(start, end, word)`` tuples, ordered by start
        index, where ``word == text[start:end]``. Start positions with no
        stored word are omitted.
        """
        found = []
        length = len(text)
        for start in range(length):
            node = self.root
            best_end = None  # Exclusive end of the longest word seen from ``start``
            for position in range(start, length):
                node = node.children.get(text[position])
                if node is None:
                    break  # No stored word continues with this character
                if node.is_end_of_word:
                    best_end = position + 1
            if best_end is not None:
                found.append((start, best_end, text[start:best_end]))
        return found

# Function to build a Trie from a list of Latin words (in this project, the words were previously extracted from an Excel file)
# Also inserts stems obtained by stripping common Latin word endings

def build_trie(latin_words, suffixes=None, min_stem_length=3):
    """Build a Trie containing each Latin word and its suffix-stripped stems.

    Every word is lowercased and inserted in full. In addition, for each
    suffix the word ends with, the remaining stem is inserted when it is
    at least ``min_stem_length`` characters long. A word may match several
    suffixes (e.g. both "ata" and "a"), in which case every resulting stem
    is inserted.

    Args:
        latin_words: Iterable of Latin words (strings).
        suffixes: Iterable of endings to strip. Defaults to the built-in
            list of common Latin endings when ``None``.
        min_stem_length: Minimum length a stem must have to be inserted.
            The default of 3 matches the previous hard-coded behaviour.

    Returns:
        The populated ``Trie``.
    """
    if suffixes is None:
        suffixes = ("us", "um", "a", "is", "os", "es", "ata", "atus", "atae", "ae", "ior")
    # An empty suffix would make ``word[:-0]`` evaluate to "", so drop it.
    suffixes = tuple(suffix for suffix in suffixes if suffix)

    trie = Trie()
    for word in latin_words:
        word = word.lower()
        trie.insert(word)  # The full word is always searchable
        for suffix in suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if len(stem) >= min_stem_length:  # Skip stems too short to be meaningful
                    trie.insert(stem)
    return trie

# Function to tokenize a taxon using Trie matches
# Matches are accepted greedily from left to right (longest match per start position); a match that overlaps an already accepted one is skipped

def tokenize_taxon(taxon, trie):
    """Split a taxon name into the known words found inside it.

    The taxon is lowercased and searched with ``trie.search_substrings``.
    Candidate matches are visited in order of start index (longer span
    first when starts are equal), and a match is kept only if none of its
    character positions belong to a match that was already kept.

    Returns the kept words as a list, in left-to-right order.
    """
    lowered = taxon.lower()

    # Earliest start first; among equal starts, the longer span comes first.
    ordered = sorted(
        trie.search_substrings(lowered),
        key=lambda match: (match[0], match[0] - match[1]),
    )

    claimed = set()  # Character indices already covered by a kept match
    tokens = []
    for begin, finish, piece in ordered:
        span = range(begin, finish)
        if claimed.isdisjoint(span):
            tokens.append(piece)
            claimed.update(span)

    return tokens

# Function to count occurrences of Latin roots from parsed CSV file

def count_latin_root_occurrences(csv_path, output_path="latin_root_occurrences.csv"):
    """Count how often each Latin root appears in a parsed-taxon CSV.

    The CSV must contain a "Latin Roots" column whose cells hold
    comma-separated roots. Non-string cells (such as the NaN pandas
    produces for an empty cell) and blank entries are ignored.

    Args:
        csv_path: Path of the CSV to read.
        output_path: Where to write the counts as CSV. Defaults to
            "latin_root_occurrences.csv" in the current directory.

    Returns:
        A DataFrame with columns "Latin Root" and "Count", sorted by
        descending count.

    Raises:
        KeyError: If the CSV has no "Latin Roots" column.
    """
    df = pd.read_csv(csv_path)
    root_counts = defaultdict(int)

    for roots in df["Latin Roots"]:
        if not isinstance(roots, str):
            continue  # Empty cells are read back as NaN (float)
        for root in roots.split(","):
            cleaned_root = root.strip()
            if cleaned_root:
                root_counts[cleaned_root] += 1

    # A stable sort keeps the ordering of equal counts reproducible between runs.
    count_df = pd.DataFrame(
        list(root_counts.items()),
        columns=["Latin Root", "Count"],
    ).sort_values(by="Count", ascending=False, kind="mergesort")

    count_df.to_csv(output_path, index=False)
    print(f"Latin root occurrences saved to '{output_path}'")
    return count_df



In [9]:
# Example usage:
if __name__ == "__main__":
    # Ask the user where the two input files live.
    latin_path = input("Enter path to Latin words text file: ")  # .txt file, one word per line
    taxon_path = input("Enter path to Excel file with taxons: ")  # .xlsx file

    # Load the Latin dictionary, skipping blank lines.
    with open(latin_path, 'r', encoding='utf-8') as f:
        latin_words = [line.strip() for line in f if line.strip()]

    # Taxon names come from the second column (index 1) of the spreadsheet.
    # Only the LAST whitespace-separated word of each cell is kept.
    # Whitespace-only cells are skipped: they have no last word and would
    # otherwise raise IndexError.
    df = pd.read_excel(taxon_path)
    cells = df.iloc[:, 1].dropna().astype(str)
    taxons = [cell.split()[-1] for cell in cells if cell.strip()]

    # Build the Trie from the Latin dictionary, including stem variations.
    trie = build_trie(latin_words)

    # Tokenize every taxon exactly once and keep each name paired with its
    # own result, so the printed output and the CSV cannot disagree.
    parsed = [(taxon, tokenize_taxon(taxon, trie)) for taxon in taxons]

    for taxon, parts in parsed:
        print(f"{taxon}: {parts}")

    # Save results to CSV. The explicit column list guarantees a header row
    # even when no taxons were found, so the file can still be read back.
    output_df = pd.DataFrame(
        [{"Taxon": taxon, "Latin Roots": ", ".join(parts)} for taxon, parts in parsed],
        columns=["Taxon", "Latin Roots"],
    )
    output_df.to_csv("parsed_taxon_roots.csv", index=False)
    print("Results saved to 'parsed_taxon_roots.csv'")

    # Count root occurrences from the CSV that was just written.
    count_latin_root_occurrences("parsed_taxon_roots.csv")


Enter path to Latin words text file: /content/drive/MyDrive/ReptileTaxonProject/extracted_latin_words.txt
Enter path to Excel file with taxons: /content/drive/MyDrive/ReptileTaxonProject/Latin Taxons, etym starts with"Named after Latin" .xlsx
fimbriata: ['fimbriata']
fuscolabialis: ['fusc', 'labialis']
graminea: ['graminea']
annectans: ['an', 'nec', 'an']
pallidipectoris: ['pallid']
grandis: ['grandis']
maculatus: ['maculatus']
spinicauda: ['spin', 'cauda']
armata: ['arma']
crucigera: ['ru']
niger: ['niger']
spinalis: ['spinalis']
breviceps: ['brev']
gracilicauda: ['gracil', 'cauda']
lineatus: ['lineatus']
plumbeus: ['plumbe']
subocularis: ['subocularis']
cucullata: ['cucull']
felinus: ['fel', 'in']
nivaria: ['nivari']
aculeata: ['l']
armata: ['arma']
gracilimembris: ['gracil']
hispida: []
cristata: ['crista']
montana: ['montan']
persimilis: ['persimilis']
planiceps: ['plan']
spinosa: ['spin']
bilineatus: ['bilineat']
fronticincta: ['in']
fuscus: ['fuscus']
laevis: ['laevis']
tenuis: [

In [None]:
┌────────────────────────────┐
│ Start Program (__main__)  │
└────────────┬──────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Prompt user for Latin words (.txt)     │
│ and taxon names file (.xlsx) paths     │
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Read Latin words file → List[str]      │
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Read Excel file                        │
│ → Extract last word from each taxon    │
│ → List[str] (taxons)                   │
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Build Trie from Latin words            │
│ • Insert full word                     │
│ • Insert stemmed versions              │
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ For each taxon in list:                │
│ • Normalize                            │
│ • Search Trie for longest substrings   │
│ • Prioritize non-overlapping matches   │
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Print taxon with identified Latin roots│
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Save results to 'parsed_taxon_roots.csv'│
└────────────┬───────────────────────────┘
             │
             ▼
┌────────────────────────────────────────┐
│ Count occurrences of each Latin root   │
│ → Save to 'latin_root_occurrences.csv' │
└────────────┬───────────────────────────┘
             │
             ▼
┌──────────────────────┐
│ End Program          │
└──────────────────────┘
