# Data download from GBIF

In [7]:
# NOTE: in order for "pygbif" to run properly, change the following:

# https://github.com/gbif/pygbif/issues/93
# solution: edit pygbif/caching.py
# change in line 2: requests_cache.core to requests_cache

# also, you'll need to create an .env file!

from pygbif import species
from pygbif import occurrences as occ
from dwca.read import DwCAReader
import time
import os

In [4]:
SPECIES = 'Vanessa atalanta'
YEAR_MIN = 2018
YEAR_MAX = 2021

In [5]:
gbif_result = species.name_suggest(SPECIES)
key = gbif_result[0]['key']
print(f"species name (to verify):\n\t{gbif_result[0]['scientificName']}")

species name (to verify):
	Vanessa atalanta (Linnaeus, 1758)


In [6]:
# DOWNLOAD REQUEST

# FYI: gbif uses 2-letter country code (as per ISO-3166-1)
# https://www.iso.org/iso-3166-country-codes.html
# as far as I'm concerned, multiple countries have to be split into several
# downloads

gbif_occ = occ.download([
    f'taxonKey = {key}',
    f'year = {YEAR_MIN},{YEAR_MAX}',
    'hasCoordinate = TRUE',
    'country = DE'
])

download_key = gbif_occ[0]
gbif_occ

INFO:Your download key is 0357833-210914110416597


('0357833-210914110416597',
 {'creator': 'vegan_schnitzel',
  'notification_address': ['robat.wright@gmail.com'],
  'send_notification': 'true',
  'created': 2022,
  'predicate': {'type': 'and',
   'predicates': [{'type': 'equals', 'key': 'TAXON_KEY', 'value': '1898286'},
    {'type': 'equals', 'key': 'YEAR', 'value': '2018,2021'},
    {'type': 'equals', 'key': 'HAS_COORDINATE', 'value': 'TRUE'},
    {'type': 'equals', 'key': 'COUNTRY', 'value': 'DE'}]}})

In [13]:
# GET THE DOWNLOAD

# create directory for data storage
if not os.path.exists('gbif'):
    os.mkdir('gbif')

# wait <delay> seconds if download hasn't been prepared by gbif yet
delay = 60
while True:
    try:
        occ.download_get(key=download_key, path='gbif')
        print('*** download successful! ***')
        break
    except Exception:
        print(f'download not ready yet, trying again in {delay/60} min...\n')
        time.sleep(delay)

INFO:Download file size: 4510409 bytes
INFO:On disk at gbif/0357833-210914110416597.zip


*** download successful! ***


In [20]:
# ACCESS DATA

with DwCAReader(f'gbif/{download_key}.zip') as dwca:

    # check the core file of the archive (occurrence, taxon, ...)
    print(f"core type is: {dwca.descriptor.core.type}")

    # check the available extensions
    print(f"available extensions: {[ext.split('/')[-1] for ext in dwca.descriptor.extensions_type]}")

    core_file_name = dwca.descriptor.core.file_location
    # load data into pandas frame, focussing on pre-selected columns
    core_df = dwca.pd_read(
        relative_path=core_file_name,
        usecols=['species', 'speciesKey', 'eventDate', 'year', 'month', 'day',
                 'countryCode','decimalLongitude', 'decimalLatitude'],
        parse_dates=True,
    )

# alternatively, save as csv file to make later re-import easier
core_df.to_csv(path_or_buf=f'gbif/{download_key}.csv', index=False)

core type is: http://rs.tdwg.org/dwc/terms/Occurrence
available extensions: ['Multimedia', 'Occurrence']


In [21]:
core_df

Unnamed: 0,eventDate,year,month,day,countryCode,decimalLatitude,decimalLongitude,speciesKey,species,geodeticDatum
0,2017-07-05T00:00:00,2017,7.0,5.0,GB,52.820494,-1.304008,1898286,Vanessa atalanta,WGS84
1,2017-07-29T00:00:00,2017,7.0,29.0,GB,52.728251,-0.979701,1898286,Vanessa atalanta,WGS84
2,2018-09-23T00:00:00,2018,9.0,23.0,GB,52.749491,-1.482920,1898286,Vanessa atalanta,WGS84
3,2015-08-02T00:00:00,2015,8.0,2.0,GB,52.634093,-0.568145,1898286,Vanessa atalanta,WGS84
4,2017-06-03T00:00:00,2017,6.0,3.0,GB,52.870687,-0.827795,1898286,Vanessa atalanta,WGS84
...,...,...,...,...,...,...,...,...,...,...
1133097,2015-07-22T00:00:00,2015,7.0,22.0,FR,48.585530,2.017580,1898286,Vanessa atalanta,WGS84
1133098,2008-07-23T00:00:00,2008,7.0,23.0,FR,48.617820,2.066200,1898286,Vanessa atalanta,WGS84
1133099,2007-03-28T00:00:00,2007,3.0,28.0,FR,48.729580,2.100970,1898286,Vanessa atalanta,WGS84
1133100,2004-04-15T00:00:00,2004,4.0,15.0,FR,48.705250,2.084150,1898286,Vanessa atalanta,WGS84
