In [9]:
from pathlib import Path

from pygbif import species
from tqdm import tqdm  # optional, for progress bar
from rockdaisy.nomenclator import Nomenclator


In [8]:
def gbif_backbone_lookup(nomenclator, verbose=False):
    """
    Look up every name of a Nomenclator in the GBIF backbone taxonomy.

    One ``species.name_backbone`` request is made per entry returned by
    ``nomenclator.all_names()``. A failing request does not abort the run;
    the failure is recorded under that name instead.

    Args:
        nomenclator: Object exposing ``all_names()``, an iterable of names.
        verbose (bool): When True, print one status line per name.

    Returns:
        dict: {name: gbif_match_dict}; a name whose lookup raised maps to
        ``{"error": <exception message>}``.
    """
    results = {}

    for taxon_name in tqdm(nomenclator.all_names()):
        try:
            backbone_hit = species.name_backbone(name=taxon_name)
            results[taxon_name] = backbone_hit
            if verbose:
                matched_label = backbone_hit.get('scientificName', 'No match')
                print(f"{taxon_name}: {matched_label}")
        except Exception as exc:
            # Best effort: keep going and store the message for later inspection.
            results[taxon_name] = {"error": str(exc)}
            if verbose:
                print(f"{taxon_name}: ERROR - {exc}")

    return results


def extract_usage_keys_from_matches(matches, min_conf=80):
    """
    Pick the GBIF usageKey of every sufficiently confident match.

    Args:
        matches (dict): Output from gbif_backbone_lookup()
        min_conf (int): Lowest "confidence" value that is still accepted

    Returns:
        dict: {name: usageKey}, in the iteration order of ``matches``.
        Entries that are not dicts, lack "usageKey", or fall below
        ``min_conf`` are left out; a missing "confidence" counts as 0.
    """
    def _qualifies(candidate):
        # Checks run in this order so non-dict and key-less entries are
        # rejected before their confidence is ever compared.
        if not isinstance(candidate, dict):
            return False
        if "usageKey" not in candidate:
            return False
        return candidate.get("confidence", 0) >= min_conf

    return {
        taxon_name: candidate["usageKey"]
        for taxon_name, candidate in matches.items()
        if _qualifies(candidate)
    }






from pygbif import occurrences
from tqdm import tqdm

def fetch_occurrences_for_keys(usage_keys, limit_per_taxon=100, verbose=False):
    """
    Fetch GBIF occurrence records for a dictionary of usage keys.

    The GBIF occurrence search returns at most 300 records per request, so
    a larger ``limit_per_taxon`` is served by paging with ``offset`` until
    either the limit is reached or GBIF reports the end of the records.
    For limits of 300 or less exactly one request is made per taxon.

    Args:
        usage_keys (dict): {name: usageKey}
        limit_per_taxon (int): Maximum number of occurrences to fetch per taxon
        verbose (bool): Whether to print live status

    Returns:
        dict: {name: [occurrence dicts]}. If any request for a taxon raises,
        that name maps to ``{"error": <exception message>}`` instead of a
        list, and pages already fetched for it are discarded.
    """
    page_cap = 300  # largest page size the GBIF occurrence search accepts
    all_occurrences = {}

    for name, key in tqdm(usage_keys.items(), desc="Fetching GBIF occurrences"):
        try:
            records = []
            while len(records) < limit_per_taxon:
                response = occurrences.search(
                    taxonKey=key,
                    limit=min(page_cap, limit_per_taxon - len(records)),
                    offset=len(records),
                )
                page = response.get('results', [])
                records.extend(page)
                # Stop on an empty page as well, so a response that never
                # signals the end cannot keep the loop running forever.
                if not page or response.get('endOfRecords', True):
                    break
            all_occurrences[name] = records
            if verbose:
                print(f"{name}: {len(all_occurrences[name])} records")
        except Exception as e:
            # Best effort: record the failure and continue with the next taxon.
            all_occurrences[name] = {"error": str(e)}
            if verbose:
                print(f"{name}: ERROR - {e}")

    return all_occurrences





import pandas as pd

def occurrences_to_dataframe(occ_data):
    """
    Flatten nested GBIF occurrence data into a DataFrame.

    Each occurrence record becomes one row, tagged with a ``query_name``
    column holding the name it was fetched for. Rows are built from copies,
    so the dicts inside ``occ_data`` are left unmodified.

    Args:
        occ_data (dict): Output from fetch_occurrences_for_keys()

    Returns:
        pandas.DataFrame: One row per occurrence record. Names whose value
        is not a list (e.g. the ``{"error": ...}`` entries) contribute no rows.
    """
    rows = [
        # Copy rather than assign into the record, so callers can reuse
        # occ_data without a stray 'query_name' key appearing in it.
        {**record, 'query_name': name}
        for name, records in occ_data.items()
        if isinstance(records, list)
        for record in records
    ]
    return pd.DataFrame(rows)


In [11]:
# Relative path to the nomenclator text file.
nomenclator_filepath = '../data/nomenclator.txt'
# Build the Nomenclator from that file; gbif_backbone_lookup() later reads its all_names().
nomenclator = Nomenclator(nomenclator_filepath)

In [12]:
# Step 1: Match every nomenclator name against the GBIF backbone
# (one request per name; the recorded run took about 2 minutes for 229 names)
matches = gbif_backbone_lookup(nomenclator)

# Step 2: Keep the usageKey of each match at or above the default confidence (min_conf=80)
usage_keys = extract_usage_keys_from_matches(matches)

# Step 3: Fetch up to 100 occurrence records per matched name
# (the recorded run took about 4 minutes for 228 keys)
occ_data = fetch_occurrences_for_keys(usage_keys, limit_per_taxon=100)

# Step 4: Flatten the per-name record lists into a single DataFrame
df = occurrences_to_dataframe(occ_data)

# Preview the query name next to the scientific name and coordinates GBIF returned
df[["query_name", "scientificName", "decimalLatitude", "decimalLongitude"]].head()


100%|██████████| 229/229 [02:12<00:00,  1.73it/s]
Fetching GBIF occurrences: 100%|██████████| 228/228 [04:02<00:00,  1.06s/it]


Unnamed: 0,query_name,scientificName,decimalLatitude,decimalLongitude
0,Perityle tenuifolius,Perityle tenuifolius (Phil.) Lichter-Marck,-26.345373,-79.892358
1,Perityle tenuifolius,Perityle tenuifolius (Phil.) Lichter-Marck,-26.34662,-79.886282
2,Perityle tenuifolius,Lycapsus tenuifolius Phil.,,
3,Perityle tenuifolius,Lycapsus tenuifolius Phil.,-26.35,-79.86667
4,Perityle tenuifolius,Lycapsus tenuifolius Phil.,,


In [13]:
# Show the whole occurrence table; pandas abbreviates the middle rows and columns when rendering.
df

Unnamed: 0,key,datasetKey,publishingOrgKey,installationKey,hostingOrganizationKey,publishingCountry,protocol,lastCrawled,lastParsed,crawlId,...,organismName,verbatimIdentification,organismQuantity,organismQuantityType,waterBody,island,coordinatePrecision,samplingEffort,parentEventID,acceptedNameUsageID
0,3764504470,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,CL,DWC_ARCHIVE,2025-05-08T21:36:07.572+00:00,2025-05-12T14:28:36.026+00:00,542,...,,,,,,,,,,
1,3455569773,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,CL,DWC_ARCHIVE,2025-05-08T21:36:07.572+00:00,2025-05-12T12:05:49.961+00:00,542,...,,,,,,,,,,
2,1456004768,821cc27a-e3bb-4bc5-ac34-89ada245069d,bc092ff0-02e4-11dc-991f-b8a03c50a862,b2ff19b5-d74f-40d2-82f1-ec5db01e8e31,bc092ff0-02e4-11dc-991f-b8a03c50a862,US,EML,2025-05-02T11:18:50.636+00:00,2025-05-02T13:21:15.208+00:00,448,...,,,,,,,,,,
3,3125266488,9b7d1acf-b22f-4a1f-b6e8-f1ddd744dc07,8a471700-4ce8-11db-b80e-b8a03c50a862,cb27a4ab-3360-4cf5-b5de-48a06a9c47ae,96710dc8-fecb-440d-ae3e-c34ae8a9616f,US,DWC_ARCHIVE,2025-05-09T15:48:15.665+00:00,2025-05-09T16:01:03.401+00:00,307,...,,,,,,,,,,
4,1096722507,0943f690-fde5-11dd-83f4-b8a03c50a862,6ba9a8cc-513a-4a51-bf93-6f5de8040a96,4346b227-ca68-4d54-8a77-909148492e0b,4c415e40-1e21-11de-9e40-a0d6ecebb8bf,SE,EML,2025-01-21T14:10:51.862+00:00,2025-02-06T17:56:31.809+00:00,323,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8719,2629576867,98c51784-da02-4f0e-81a4-f3635cb3cba7,ff90b050-c256-11db-b71b-b8a03c50a862,e081c7d7-2dab-42fd-94c4-9f275767fe7d,ff90b050-c256-11db-b71b-b8a03c50a862,MX,EML,2025-04-09T06:16:51.208+00:00,2025-04-09T06:33:39.060+00:00,249,...,,,,,,,,,,
8720,1456233227,821cc27a-e3bb-4bc5-ac34-89ada245069d,bc092ff0-02e4-11dc-991f-b8a03c50a862,b2ff19b5-d74f-40d2-82f1-ec5db01e8e31,bc092ff0-02e4-11dc-991f-b8a03c50a862,US,EML,2025-05-02T11:18:50.636+00:00,2025-05-02T13:37:28.883+00:00,448,...,,,,,,,,,,
8721,3346235279,d8cd16ba-bb74-4420-821e-083f2bac17c2,ada9d123-ddb4-467d-8891-806ea8d94230,17a83780-3060-4851-9d6f-029d5fcb81c9,fbca90e3-8aed-48b1-84e3-369afbd000ce,GB,EML,2025-05-10T12:04:02.898+00:00,2025-05-10T12:26:11.953+00:00,218,...,,,,,,,,,,
8722,4165582317,50c9509d-22c7-4a22-a47d-8c48425ef4a7,28eb1a3f-1c15-4a95-931a-4af90ecb574d,997448a8-f762-11e1-a439-00145eb45e9a,28eb1a3f-1c15-4a95-931a-4af90ecb574d,US,DWC_ARCHIVE,2025-05-08T21:36:07.572+00:00,2025-05-12T11:40:03.461+00:00,542,...,,,,,,,,,,


In [14]:
# Write the flattened occurrences to CSV, without the DataFrame index.
output_csv = Path('../data/gbif/gbif_occurrences.csv')
# to_csv does not create missing directories, so make sure the target folder exists first.
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)