<a href="https://colab.research.google.com/github/eliconger/eliconger/blob/main/LatinTaxonParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Program to find Latin words within compound taxon names
# using a dictionary of known Latin words and a Trie for fast substring lookup

# Importing necessary modules
from collections import defaultdict
import re
import pandas as pd  # Import pandas for reading Excel files
import numpy as np   # To handle NaN values robustly

# Define a class for the Trie Node
class TrieNode:
    """One node of the trie, reached by following a sequence of characters."""

    def __init__(self):
        # True once a complete inserted word ends exactly at this node.
        self.is_end_of_word = False
        # Next character -> child TrieNode reached by following that character.
        self.children = dict()

# Define a class for the Trie structure
class Trie:
    """Character trie supporting word insertion and longest-match scanning."""

    def __init__(self):
        self.root = TrieNode()  # Empty root; every inserted word hangs below it

    def insert(self, word):
        """Add *word* to the trie, creating nodes for characters not yet present."""
        current = self.root
        for letter in word:
            child = current.children.get(letter)
            if child is None:
                # First time this character is seen at this position.
                child = TrieNode()
                current.children[letter] = child
            current = child
        current.is_end_of_word = True

    def search_substrings(self, text):
        """Return the longest stored word starting at each position of *text*.

        Each result is a ``(start, end, word)`` tuple where ``word`` equals
        ``text[start:end]``. Start positions with no stored word are skipped,
        so the list is ordered by ``start`` with at most one entry per start.
        """
        found = []
        length = len(text)
        for start in range(length):
            current = self.root
            best_end = None  # Exclusive end of the longest word seen from `start`
            for position in range(start, length):
                current = current.children.get(text[position])
                if current is None:
                    break  # No stored word continues with this character
                if current.is_end_of_word:
                    best_end = position + 1
            if best_end is not None:
                found.append((start, best_end, text[start:best_end]))
        return found

# Function to build a Trie from a list of Latin words
# Also inserts stems obtained by stripping common Latin endings, so inflected forms can still match

def build_trie(latin_words):
    """Build a Trie holding every Latin word plus its suffix-stripped stems.

    Each word is lowercased and inserted whole. For every listed ending the
    word finishes with, the remainder is inserted too, provided it is longer
    than two characters.

    Args:
        latin_words: iterable of Latin words (strings).

    Returns:
        The populated Trie.
    """
    endings = ("us", "um", "a", "is", "os", "es", "ata", "atus", "atae", "ae", "ior")
    trie = Trie()
    for raw_word in latin_words:
        lowered = raw_word.lower()
        trie.insert(lowered)
        # Candidate stems, in the same order as the endings are listed.
        candidate_stems = (
            lowered[:-len(ending)] for ending in endings if lowered.endswith(ending)
        )
        for stem in candidate_stems:
            if len(stem) > 2:  # Very short stems are not inserted
                trie.insert(stem)
    return trie

# Function to tokenize a taxon using Trie matches
# Matches are accepted greedily from left to right (longest match per start position); any match overlapping an already accepted one is skipped

def tokenize_taxon(taxon, trie):
    """Split a taxon name into the non-overlapping words found by *trie*.

    The taxon is lowercased and passed to ``trie.search_substrings``. The
    resulting ``(start, end, word)`` matches are ordered by start index, with
    longer matches first among equal starts, and accepted in that order
    unless they overlap a span that was already accepted.

    Args:
        taxon: the taxon name to tokenize.
        trie: object exposing ``search_substrings(text)``.

    Returns:
        List of accepted words, in order of acceptance.
    """
    lowered = taxon.lower()
    candidates = sorted(
        trie.search_substrings(lowered),
        key=lambda match: (match[0], match[0] - match[1]),
    )

    occupied = [False] * len(lowered)  # occupied[i] is True once index i is claimed
    accepted = []

    for begin, stop, piece in candidates:
        if True in occupied[begin:stop]:
            continue  # Overlaps an earlier, already accepted match
        accepted.append(piece)
        for position in range(begin, stop):
            occupied[position] = True

    return accepted

# Function to count occurrences of Latin roots from parsed CSV file

def count_latin_root_occurrences(csv_path, output_path="latin_root_occurrences.csv"):
    """Count how often each Latin root appears in a parsed-taxon CSV.

    Reads the "Latin Roots" column of *csv_path*, where each cell holds a
    comma-separated list of roots, tallies every non-empty root, writes the
    tally to *output_path* and returns it.

    Args:
        csv_path: path to a CSV file containing a "Latin Roots" column.
        output_path: where the counts are written as CSV. Defaults to
            "latin_root_occurrences.csv" in the current directory.

    Returns:
        DataFrame with columns "Latin Root" and "Count", sorted by count in
        descending order; roots with equal counts keep first-seen order.

    Raises:
        KeyError: if the CSV has no "Latin Roots" column.
    """
    df = pd.read_csv(csv_path)                    # Load parsed results from CSV
    root_counts = defaultdict(int)                # Dictionary to count roots

    for roots in df["Latin Roots"]:
        if not isinstance(roots, str):            # Skip non-string cells (e.g. missing values)
            continue
        for root in roots.split(","):
            cleaned_root = root.strip()
            if cleaned_root:                      # Ignore empty pieces such as "a,,b"
                root_counts[cleaned_root] += 1

    # A stable sort keeps tied roots in first-seen order, so output is reproducible.
    count_df = pd.DataFrame(
        list(root_counts.items()),
        columns=["Latin Root", "Count"],
    ).sort_values(by="Count", ascending=False, kind="mergesort")

    count_df.to_csv(output_path, index=False)     # Save to file
    print(f"Latin root occurrences saved to '{output_path}'")
    return count_df

# Main program
if __name__ == "__main__":
    # Ask user for file paths; strip stray whitespace left over from pasting
    latin_path = input("Enter path to Latin words text file: ").strip()  # Path to .txt file (one word per line)
    taxon_path = input("Enter path to Excel file with taxons: ").strip()  # Path to .xlsx (or .csv) file

    # Read Latin words from file (one per line), skipping blank lines
    with open(latin_path, 'r', encoding='utf-8') as f:
        latin_words = [word for word in (line.strip() for line in f) if word]

    # pd.read_excel cannot parse CSV input (it raises "Excel file format cannot
    # be determined"), so pick the reader from the file extension.
    if taxon_path.lower().endswith(".csv"):
        df = pd.read_csv(taxon_path)
    else:
        df = pd.read_excel(taxon_path)

    # Taxon names are taken from the SECOND column; a single-column file
    # falls back to its only (first) column.
    taxon_column = 1 if df.shape[1] > 1 else 0
    taxons = df.iloc[:, taxon_column].dropna().astype(str).tolist()  # Drop NaNs, convert to strings

    # Build the Trie from Latin dictionary with stem variations
    trie = build_trie(latin_words)

    # Parse each taxon and collect one output row per taxon
    output_data = [
        {"Taxon": taxon, "Latin Roots": ", ".join(tokenize_taxon(taxon, trie))}
        for taxon in taxons
    ]

    # Save parsed tokens to CSV
    output_df = pd.DataFrame(output_data)
    output_df.to_csv("parsed_taxon_roots.csv", index=False)
    print("Parsed taxon roots saved to 'parsed_taxon_roots.csv'")

    # Count and save Latin root occurrences
    count_latin_root_occurrences("parsed_taxon_roots.csv")


Enter path to Latin words text file: /content/extracted_latin_words.txt
Enter path to Excel file with taxons: /content/extracted_words.csv


ValueError: Excel file format cannot be determined, you must specify an engine manually.

In [None]:
# Code to extract the second word of each taxon name
import pandas as pd

def extract_second_words(df):
    """Return the second whitespace-separated word of each first-column cell.

    Cells that are not strings, or that contain fewer than two words, are
    dropped from the result.

    Args:
        df: DataFrame whose first column holds the text to split.

    Returns:
        DataFrame with a single 'SecondWord' column; rows keep the index
        labels of the input rows they came from.
    """
    def _second_token(cell):
        # Non-string cells yield None so that dropna() removes them.
        if not isinstance(cell, str):
            return None
        pieces = cell.split()
        return pieces[1] if len(pieces) > 1 else None

    source_column = df[df.columns[0]]
    second_words = source_column.apply(_second_token).dropna()
    return pd.DataFrame({'SecondWord': second_words})

def main():
    """Prompt for an Excel file and save its extracted second words as CSV.

    The words are written to 'extracted_words.csv'. Any exception raised
    while reading, extracting or writing is caught and reported as a
    one-line error message.
    """
    excel_path = input("Enter the path to your Excel file: ").strip()

    try:
        extracted = extract_second_words(pd.read_excel(excel_path))

        destination = "extracted_words.csv"
        extracted.to_csv(destination, index=False)

        print(f"\nExtraction complete. Words saved to '{destination}'.")
    except Exception as err:
        # Best-effort script: report the failure rather than show a traceback.
        print(f"Error: {err}")

# Run the extraction only when this code is executed directly, not when imported
if __name__ == "__main__":
    main()



Enter the path to your Excel file: /content/Latin Taxons, etym starts with"Named after Latin" .xlsx

Extraction complete. Words saved to 'extracted_words.csv'.


This next code is the original Trie implementation without backtracking capabilities