In [1]:
!pip install -r https://raw.githubusercontent.com/gbif-norway/marine-species-checker/main/requirements.txt
from pygbif import occurrences as occ
import pandas as pd
import pyworms

In [2]:
# Insert your dataset's key: this value is passed as `datasetKey` to the GBIF
# occurrence search in the next cell, so it selects which dataset gets checked.
dataset_key = 'd391c193-0fc0-4a96-bdac-6043dd9516d1'

In [3]:
PAGE_SIZE = 300  # Records requested per GBIF search call (same step the loop always used)
# Taxon fields tried in order, most specific first; scientificName is the last resort
NAME_FIELDS = ('species', 'genus', 'family', 'order', 'scientificName')


def best_available_name(occurrence):
    """Return the first non-empty name among NAME_FIELDS, or None if the record has none."""
    for field in NAME_FIELDS:
        if occurrence.get(field):
            return occurrence[field]
    return None


def first_lsid(matches):
    """Return the LSID of the first WoRMS match for a name, or None without a usable match."""
    if matches and 'lsid' in matches[0]:
        return matches[0]['lsid']
    return None


all_occs = []
lsid_by_name = {}  # Name -> WoRMS LSID (or None); lets each distinct name be looked up only once
offset = 0

# Loop to continuously fetch pages of occurrences from GBIF until we have run out of occurrences to check
# NOTE(review): GBIF's search API is understood to cap how deep paging can go;
# very large datasets may need the download API instead - confirm.
while True:
    occs = occ.search(datasetKey=dataset_key, limit=PAGE_SIZE, offset=offset)
    results = occs.get('results') or []
    if not results:
        break

    # .get() keeps records that lack occurrenceID / scientificNameID instead of raising KeyError.
    # 'oldlsid' records whether the scientificNameID is already present for each record.
    current_set = [
        {
            'id': x.get('occurrenceID'),
            'oldlsid': x.get('scientificNameID'),
            'sn': best_available_name(x),
        }
        for x in results
    ]

    # Match only the names not seen on earlier pages against WoRMS
    new_names = sorted({r['sn'] for r in current_set if r['sn'] and r['sn'] not in lsid_by_name})
    if new_names:
        # Assumes pyworms returns one list of candidate matches per submitted name, in the
        # same order (the original positional indexing relied on this too) - TODO confirm
        worms = pyworms.aphiaRecordsByMatchNames(new_names, marine_only=True) or []
        for name, matches in zip(new_names, worms):
            lsid_by_name[name] = first_lsid(matches)

    # Add the LSID (None when the record has no name or WoRMS had no match)
    for record in current_set:
        record['lsid'] = lsid_by_name.get(record['sn'])
    all_occs.extend(current_set)

    # Advance by what was actually returned, and stop without a wasted extra request
    offset += len(results)
    if occs.get('endOfRecords') or offset >= occs['count']:
        break

In [6]:
# all_occs now also holds the relevant WoRMS LSID data.
# The columns are named explicitly (in the same order the records are built) so the
# frame still has them when no occurrences were fetched; without this an empty
# all_occs gives a column-less frame and data['lsid'] raises KeyError.
data = pd.DataFrame(all_occs, columns=['id', 'oldlsid', 'sn', 'lsid'])

In [8]:
# Series.count() only counts non-missing values, i.e. records that received an LSID
count = data['lsid'].count()
perc = (count / data.shape[0]) * 100
'Number of records with a match in WoRMS: {}, ({}% of total)'.format(count, perc)

'Number of records with a match in WoRMS: 1016, (100.0% of total)'

In [9]:
# Series.count() only counts non-missing values, i.e. records that came with a scientificNameID
count = data['oldlsid'].count()
perc = (count / data.shape[0]) * 100
'Number of records which already have a WoRMS LSID in scientificNameID: {}, ({}% of total)'.format(count, perc)

'Number of records which already have a WoRMS LSID in scientificNameID: 0, (0.0% of total)'

In [15]:
# Write to csv for download: saves the full `data` frame to output.csv in the
# current working directory; index=False leaves out the DataFrame's row index.
data.to_csv('./output.csv', index=False)