In [1]:
import re
#from fathomnet.api import worms, taxa
import csv
from pynoddgcs.connect import GCS
import tempfile
import json
import pyworms
import re

In [2]:
REFRESH = True

In [3]:
SOURCE_BUCKET = "nmfs_odp_hq"
SOURCE_FILE = "nodd_tools/datasets/gfisher/annotations.json"

In [4]:
client = GCS()

In [5]:
with tempfile.NamedTemporaryFile() as f:
    client.download(SOURCE_BUCKET, SOURCE_FILE, f.name)
    raw_coco = json.load(f)

In [6]:
observed_categories = [cat["name"] for cat in raw_coco["categories"]]
observed_categories

['MYCTEROPERCAMICROLEPIS-170022104',
 'SERRANUSPHOEBE-170024208',
 'SERIOLAFASCIATA-170113103',
 'RHOMBOPLITESAURORUBENS-170152001',
 'LUTJANUSSYNAGRIS-170151113',
 'CALLIONYMIDAE-170420000',
 'EPINEPHELUSMORIO-170021211',
 'HALICHOERES-170281200',
 'PTEROIS-168011900',
 'BODIANUSPULCHELLUS-170280201',
 'HAEMULONAUROLINEATUM-170191003',
 'LUTJANUSGRISEUS-170151109',
 'POMACENTRIDAE-170270000',
 'UNKNOWNFISH',
 'SERIOLA-170113100',
 'PRISTIGENYSALTA-170050401',
 'CARANXBARTHOLOMAEI-170110801',
 'BALISTESCAPRISCUS-189030502',
 'LUTJANUSCAMPECHANUS-170151107',
 'CHAETODONOCELLATUS-170260307',
 'HOLACANTHUSBERMUDENSIS-170290102',
 'MYCTEROPERCAPHENAX-170022105',
 'CEPHALOPHOLISCRUENTATA-170020401',
 'LUTJANUS-170151100',
 'SCOMBEROMORUSMACULATUS-170440803',
 'CARANXCRYSOS-170110803',
 'CHAETODONSEDENTARIUS-170260309',
 'PAGRUSPAGRUS-170212302',
 'DIPLECTRUMFORMOSUM-170020903',
 'CHAETODON-170260300',
 'CALAMUS-170210600',
 'SERIOLARIVOLIANA-170113105',
 'LUTJANUSVIVANUS-170151114',
 'BALIS

In [7]:
with open('Species List.csv', 'r') as f:
    reader = csv.reader(f)
    header = next(reader)
    name_map = {concatenated_name: better_name for _, concatenated_name, better_name, _, _ in reader}
name_map

{'ABUDEFDUFSAXATILIS-170270101': 'ABUDEFDUF_SAXATILIS',
 'ABUDEFDUFTAURUS-170270102': 'ABUDEFDUF_TAURUS',
 'ACANTHURUSBAHIANUS-170160101': 'ACANTHURUS_BAHIANUS',
 'ACANTHURUSCHIRURGUS-170160103': 'ACANTHURUS_CHIRURGUS',
 'ACANTHURUSCOERULEUS-170160102': 'ACANTHURUS_COERULEUS',
 'ALBULAVULPES-121030101': 'ALBULA_VULPES',
 'ALECTISCILIARIS-170110101': 'ALECTIS_CILIARIS',
 'AMBLYCIRRHITUSPINOS-170590101': 'AMBLYCIRRHITUS_PINOS',
 'ANISOTREMUSSURINAMENSIS-170190106': 'ANISOTREMUS_SURINAMENSIS',
 'ANISOTREMUSVIRGINICUS-170190105': 'ANISOTREMUS_VIRGINICUS',
 'ANTENNARIUSOCELLATUS-195020101': 'ANTENNARIUS_OCELLATUS',
 'ANTHIASASPERILINGUIS-170026001': 'ANTHIAS_ASPERILINGUIS',
 'ANTHIASNICHOLSI-170026002': 'ANTHIAS_NICHOLSI',
 'ANTHIASTENUIS-170026003': 'ANTHIAS_TENUIS',
 'ANTHIASWOODSI-170026004': 'ANTHIAS_WOODSI',
 'ANTIGONIA-162030100': 'ANTIGONIA',
 'ANTIGONIACAPROS-162030101': 'ANTIGONIA_CAPROS',
 'APOGONAFFINI-170060204': 'APOGON_AFFINI',
 'APOGONALUTUS-170060212': 'APOGON_ALUTUS',
 'APO

In [8]:
new_categories = {}
failed_categories = []
for cat in observed_categories:
    try:
        new_categories[cat] = name_map[cat]
    except:
        failed_categories.append(cat)
assert not failed_categories

## Other updates

### Updating "crowd" type categories to have the "iscrowd" attribute

"SCHOOL" and possibly others are "crowd" type categories.

In [9]:
crowd_categories = set(["SCHOOL"])
crowd_category_ids = set([cat["id"] for cat in raw_coco["categories"] if cat["name"] in crowd_categories])
for ann in raw_coco["annotations"]:
    if ann["id"] in crowd_category_ids:
        ann["iscrowd"] = 1
crowd_category_ids

{114}

In [10]:
import pyworms
import requests
import re
from urllib.parse import quote

import pyworms
import re
from difflib import SequenceMatcher

class TaxonomyPipeline:
    def __init__(self):
        # Regex to strip open nomenclature (sp, spp, complex, etc.)
        self.suffix_regex = re.compile(r'\s+(sp\.|spp\.|complex|s\.l\.|cf\.|var\.|subsp\.)', re.IGNORECASE)
        
        # Base URL for WoRMS REST API (Kept for reference, though we use pyworms mostly)
        self.WORMS_API_URL = "https://www.marinespecies.org/rest/AphiaRecordsByName"

    def resolve(self, moniker):
        result = {
            "original": moniker,
            "status": "UNRESOLVED",
            "accepted_name": None,
            "aphia_id": None,
            "candidates": [],
            "notes": ""
        }

        # --- Attempt 1: Exact Lookup (Safe & Fast) ---
        match = self._query_worms_exact(moniker)
        if match:
            return self._format_result(result, match, "RESOLVED")

        # --- Attempt 2: Parent Lookup (Safe & Fast) ---
        clean_name, cleaned = self._clean_name(moniker)
        if cleaned:
            match = self._query_worms_exact(clean_name)
            if match:
                res = self._format_result(result, match, "PARTIAL")
                res["notes"] = f"Stripped suffix: '{moniker}' -> '{clean_name}'"
                return res

        # --- Attempt 3: Smart Genus Search (The Fix for 'spinimana') ---
        # Logic: If standard lookup failed, pull the Genus and look for close spellings
        # This fixes "Stenocionops spinimana" (invalid) -> "Stenocionops spinimanus" (valid)
        genus_candidates = self._smart_genus_match(moniker)
        if genus_candidates:
            result["status"] = "AMBIGUOUS"
            result["candidates"] = genus_candidates
            result["notes"] = "Found similar species in same Genus (Latin ending fix?)"
            return result

        # --- Attempt 4: Single-Page Fuzzy Search ---
        # We manually call the API to get page 1 only.
        candidates = self._fuzzy_search(clean_name)
        if candidates:
            result["status"] = "AMBIGUOUS"
            result["candidates"] = candidates
            result["notes"] = f"Ambiguous. Found {len(candidates)} candidates (Page 1)."
            return result

        result["notes"] = "No biological match found."
        return result

    def _query_worms_exact(self, name):
        """Uses pyworms library for exact matching."""
        try:
            res = pyworms.aphiaRecordsByName(name, like=False)
            if res and len(res) > 0:
                return res[0]
        except Exception:
            pass
        return None

    def _smart_genus_match(self, name):
        """
        Splits 'Genus species', queries the Genus, and checks for spelling proximity.
        """
        parts = name.split()
        if len(parts) < 2: 
            return [] # Not a binomial name

        genus = parts[0]
        epithet = parts[1] # e.g., 'spinimana'

        try:
            # 1. Get all species in this Genus (using 'like' on the Genus name)
            # We search "Genus%" to get the children.
            # marine_only=True speeds this up significantly.
            genus_records = pyworms.aphiaRecordsByName(genus + '%', like=True, marine_only=True)
            
            if not genus_records:
                return []

            candidates = []
            seen_names = set()

            for rec in genus_records:
                valid_name = rec.get('valid_name')
                if not valid_name or valid_name in seen_names:
                    continue
                
                # Verify it is actually in this genus (fuzzy search can return similar genera)
                if not valid_name.startswith(genus):
                    continue

                seen_names.add(valid_name)

                # 2. Compare the species part (Epithet)
                target_parts = valid_name.split()
                if len(target_parts) < 2: continue
                
                target_epithet = target_parts[1]

                # 3. Calculate similarity score (0.0 to 1.0)
                # This catches 'spinimana' vs 'spinimanus' (high score)
                similarity = SequenceMatcher(None, epithet, target_epithet).ratio()
                
                # Threshold > 0.80 usually catches gender endings without false positives
                if similarity > 0.80:
                    candidates.append(valid_name)
            
            return candidates[:5] # Return top 5 matches

        except Exception:
            pass
        return []

    def _fuzzy_search(self, name):
        """
        Fuzzy lookup using pyworms. 
        Note: pyworms does NOT loop. It returns Page 1 (first 50 items) only.
        """
        try:
            # Add wildcard '%' to the end for the search
            res = pyworms.aphiaRecordsByName(name + '%', like=True, marine_only=True)
            if res:
                # Return distinct valid names
                return list(set([r['valid_name'] for r in res]))
        except Exception:
            pass
        return []

    def _clean_name(self, name):
        match = self.suffix_regex.search(name)
        if match:
            return name[:match.start()], True
        return name, False

    def _format_result(self, result, record, status):
        result["status"] = status
        result["accepted_name"] = record.get('valid_name', 'N/A')
        result["aphia_id"] = record.get('valid_AphiaID', record.get('AphiaID'))
        
        if record.get('status') == 'unaccepted':
             result["notes"] += f" [Synonym: {record.get('scientificname')} -> {record.get('valid_name')}]"
        return result

In [11]:
pipeline = TaxonomyPipeline()

In [12]:
full_map = {}
if not REFRESH:
    with open('worms_map.csv') as f:
        reader = csv.reader(f)
        header = next(reader)
        full_map = dict(reader)

In [13]:
new_categories.keys() - full_map.keys()

{'ALBULAVULPES-121030101',
 'ALECTISCILIARIS-170110101',
 'ANISOTREMUSVIRGINICUS-170190105',
 'ANOMURA-999100401',
 'ANTHIINAE-999020300',
 'ARCHOSARGUSPROBATOCEPHALUS-170213601',
 'BALISTESCAPRISCUS-189030502',
 'BALISTESVETULA-189030504',
 'BODIANUSPULCHELLUS-170280201',
 'BODIANUSRUFUS-170280202',
 'CALAMUS-170210600',
 'CALAMUSBAJONADO-170210602',
 'CALAMUSLEUCOSTEUS-170210604',
 'CALAMUSNODOSUS-170210608',
 'CALAMUSPRORIDENS-170210605',
 'CALLIONYMIDAE-170420000',
 'CANTHIDERMISSUFFLAMEN-189030402',
 'CANTHIGASTERJAMESTYLERI-189080102',
 'CANTHIGASTERROSTRATA-189080101',
 'CARANGIDAE-170110000',
 'CARANXBARTHOLOMAEI-170110801',
 'CARANXCRYSOS-170110803',
 'CARANXRUBER-170110807',
 'CARCHARHINUS-108020200',
 'CARCHARHINUSFALCIFORMIS-108020202',
 'CARCHARHINUSPLUMBEUS-108020208',
 'CAULOLATILUSCHRYSOPS-170070104',
 'CAULOLATILUSCYANOPS-170070101',
 'CAULOLATILUSMICROPS-170070103',
 'CENTROPRISTISOCYURUS-170024804',
 'CENTROPRISTISPHILADELPHICA-170024805',
 'CEPHALOPHOLISCRUENTATA-17

In [14]:
worms_translations = {}
for moniker in new_categories.keys() - full_map.keys():
    search_term = new_categories[moniker].replace("_", " ")
    res = pipeline.resolve(search_term)
    resolved = res['accepted_name'] if res['accepted_name'] else "---"
    print(f"{res['original']:<25} | {res['status']:<10} | {resolved:<25} | {res['notes']}")
    
    worms_translations[new_categories[moniker]] = res

PRONOTOGRAMMUS MARTINICENSIS | RESOLVED   | Pronotogrammus martinicensis | 
LUTJANUS ANALIS           | RESOLVED   | Lutjanus analis           | 
MYCTEROPERCA INTERSTITIALIS | RESOLVED   | Mycteroperca interstitialis | 
SERRANUS ANNULARIS        | RESOLVED   | Serranus annularis        | 
CHROMIS ENCHRYSURUS       | RESOLVED   | Chromis enchrysurus       | 
SCOMBEROMORUS MACULATUS   | RESOLVED   | Scomberomorus maculatus   | 
CARANX RUBER              | RESOLVED   | Caranx ruber              | 
CANTHIGASTER JAMESTYLERI  | RESOLVED   | Canthigaster jamestyleri  | 
SERRANUS ATROBRANCHUS     | RESOLVED   | Serranus atrobranchus     | 
PRISTIGENYS ALTA          | RESOLVED   | Pristigenys alta          | 
SERRANUS PHOEBE           | RESOLVED   | Serranus phoebe           | 
CALAMUS                   | RESOLVED   | Calamus                   | 
LUTJANUS SYNAGRIS         | RESOLVED   | Lutjanus synagris         | 
HOLOCENTRUS               | RESOLVED   | Holocentrus               | 
OSTICHTHYS

In [15]:
for name, res in worms_translations.items():
    if res['status'] != 'RESOLVED':
        print(name)
        print(res)

OSTICHTHYS_TRACHYPOMUS
{'original': 'OSTICHTHYS TRACHYPOMUS', 'status': 'UNRESOLVED', 'accepted_name': None, 'aphia_id': None, 'candidates': [], 'notes': 'No biological match found.'}
STENOCIONOPS_SPINIMANA
{'original': 'STENOCIONOPS SPINIMANA', 'status': 'UNRESOLVED', 'accepted_name': None, 'aphia_id': None, 'candidates': [], 'notes': 'No biological match found.'}


In [16]:
worms_translations

{'PRONOTOGRAMMUS_MARTINICENSIS': {'original': 'PRONOTOGRAMMUS MARTINICENSIS',
  'status': 'RESOLVED',
  'accepted_name': 'Pronotogrammus martinicensis',
  'aphia_id': 282365,
  'candidates': [],
  'notes': ''},
 'LUTJANUS_ANALIS': {'original': 'LUTJANUS ANALIS',
  'status': 'RESOLVED',
  'accepted_name': 'Lutjanus analis',
  'aphia_id': 159792,
  'candidates': [],
  'notes': ''},
 'MYCTEROPERCA_INTERSTITIALIS': {'original': 'MYCTEROPERCA INTERSTITIALIS',
  'status': 'RESOLVED',
  'accepted_name': 'Mycteroperca interstitialis',
  'aphia_id': 273878,
  'candidates': [],
  'notes': ''},
 'SERRANUS_ANNULARIS': {'original': 'SERRANUS ANNULARIS',
  'status': 'RESOLVED',
  'accepted_name': 'Serranus annularis',
  'aphia_id': 273890,
  'candidates': [],
  'notes': ''},
 'CHROMIS_ENCHRYSURUS': {'original': 'CHROMIS ENCHRYSURUS',
  'status': 'RESOLVED',
  'accepted_name': 'Chromis enchrysurus',
  'aphia_id': 304153,
  'candidates': [],
  'notes': ''},
 'SCOMBEROMORUS_MACULATUS': {'original': 'SC

In [31]:
manual_lookups = {
    "STENOCIONOPS_SPINIMANA":"Stenocionops spinimanus",
    "OSTICHTHYS_TRACHYPOMUS":"Ostichthys trachypoma"
}

In [32]:
for full_name, root_name in name_map.items():
    if root_name not in worms_translations:
        continue
    res = worms_translations[root_name]
    print(res)
    if res['status'] == 'RESOLVED':
        full_map[full_name] = res['accepted_name']
    else:
        full_map[full_name] = manual_lookups[root_name]

{'original': 'HYPORTHODUS NIGRITUS', 'status': 'RESOLVED', 'accepted_name': 'Hyporthodus nigritus', 'aphia_id': 475100, 'candidates': [], 'notes': ''}


In [33]:
full_map

{'ALBULAVULPES-121030101': 'Albula vulpes',
 'ALECTISCILIARIS-170110101': 'Alectis ciliaris',
 'ANISOTREMUSVIRGINICUS-170190105': 'Anisotremus virginicus',
 'ARCHOSARGUSPROBATOCEPHALUS-170213601': 'Archosargus probatocephalus',
 'BALISTESCAPRISCUS-189030502': 'Balistes capriscus',
 'BALISTESVETULA-189030504': 'Balistes vetula',
 'BODIANUSPULCHELLUS-170280201': 'Bodianus pulchellus',
 'BODIANUSRUFUS-170280202': 'Bodianus rufus',
 'CALAMUSBAJONADO-170210602': 'Calamus bajonado',
 'CALAMUSLEUCOSTEUS-170210604': 'Calamus leucosteus',
 'CALAMUSNODOSUS-170210608': 'Calamus nodosus',
 'CALAMUSPRORIDENS-170210605': 'Calamus proridens',
 'CALLIONYMIDAE-170420000': 'Callionymidae',
 'CANTHIDERMISSUFFLAMEN-189030402': 'Canthidermis sufflamen',
 'CANTHIGASTERJAMESTYLERI-189080102': 'Canthigaster jamestyleri',
 'CARANXBARTHOLOMAEI-170110801': 'Caranx bartholomaei',
 'CARANXCRYSOS-170110803': 'Caranx crysos',
 'CARANXRUBER-170110807': 'Caranx ruber',
 'CAULOLATILUSCHRYSOPS-170070104': 'Caulolatilus 

In [34]:
with open('worms_map.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["GFISHER name", "WORMS name"])
    writer.writerows(full_map.items())