# GBIF analysis
https://pygbif.readthedocs.io/en/latest/index.html

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd


# read CSV species metadata
def get_data():
    data_path = "~/p-dsgt_clef2025-0/shared/plantclef/data"
    file_name = "species_metadata.csv"
    data_path = f"{data_path}/{file_name}"
    df = pd.read_csv(data_path)
    return df


df = get_data()
df.head(10)

Unnamed: 0,species_id,species,genus,family
0,1355868,Lactuca virosa L.,Lactuca,Asteraceae
1,1355869,Crepis capillaris (L.) Wallr.,Crepis,Asteraceae
2,1355870,Crepis foetida L.,Crepis,Asteraceae
3,1355871,Hypochaeris glabra L.,Hypochaeris,Asteraceae
4,1355872,Hypochaeris radicata L.,Hypochaeris,Asteraceae
5,1355873,Arctotis venusta Norl.,Arctotis,Asteraceae
6,1355880,Carduus macrocephalus Desf.,Carduus,Asteraceae
7,1355881,Carduus tenuiflorus Curtis,Carduus,Asteraceae
8,1355882,Cynara cardunculus L.,Cynara,Asteraceae
9,1355884,Centaurea calcitrapa L.,Centaurea,Asteraceae


In [32]:
from pygbif import species
from pygbif import occurrences as occ

# Get the taxon key for a species
df = get_data()
species_list = df["species"].tolist()[:2]
keys = [species.name_backbone(name)["usageKey"] for name in species_list]
# species_names = "Lotus alpinus"
responses = [occ.search(taxonKey=key, limit=1000000) for key in keys]
responses

[{'offset': 0,
  'limit': 300,
  'endOfRecords': False,
  'count': 43446,
  'results': [{'key': 5067755450,
    'datasetKey': '7ebef267-9d72-4c21-a276-cc84281a8590',
    'publishingOrgKey': 'd9bea9d3-13a5-4768-bbf4-560b9aa95a73',
    'installationKey': '19893c10-381e-4534-9bb8-6c37d03ad29e',
    'hostingOrganizationKey': '3c5e4331-7f2f-4a8d-aa56-81ece7014fc8',
    'publishingCountry': 'AU',
    'protocol': 'DWC_ARCHIVE',
    'lastCrawled': '2025-04-09T08:03:29.967+00:00',
    'lastParsed': '2025-04-09T08:16:30.151+00:00',
    'crawlId': 192,
    'extensions': {'http://rs.gbif.org/terms/1.0/Multimedia': [{'http://purl.org/dc/terms/format': 'image/jpeg',
       'http://rs.tdwg.org/dwc/terms/occurrenceID': '1f1c6291-b80b-4c09-91e2-527f61e83e2b',
       'http://purl.org/dc/terms/identifier': 'https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=ce0cb3c6-3f05-47c9-b571-6547c05d4b01'},
      {'http://purl.org/dc/terms/format': 'image/jpeg',
       'http://rs.tdwg.org/dwc/terms/oc

In [38]:
from collections import defaultdict

species_dict = defaultdict(set)  # Use a set to avoid duplicate countries
for response in responses:
    for result in response["results"]:
        species_name = result["species"]
        country = result.get(
            "country", "Unknown"
        )  # Handle cases where "country" might be missing
        # Add country to species_dict
        species_dict[species_name].add(country)

# Convert sets to lists for easier handling
species_dict = {key: list(value) for key, value in species_dict.items()}
species_dict

{'Lactuca virosa': ['Italy',
  'Germany',
  'United Kingdom of Great Britain and Northern Ireland',
  'New Zealand',
  'Spain',
  'Portugal',
  'United States of America',
  'France',
  'Netherlands',
  'Switzerland',
  'Australia',
  'Greece'],
 'Crepis capillaris': ['Denmark',
  'Germany',
  'Chile',
  'United Kingdom of Great Britain and Northern Ireland',
  'New Zealand',
  'Spain',
  'Switzerland',
  'United States of America',
  'France',
  'Argentina',
  'Netherlands',
  'Jersey',
  'Australia',
  'Mexico',
  'Luxembourg']}

In [1]:
import os
import json

# Directory where your JSON files are stored
json_dir = os.path.expanduser(
    "~/p-dsgt_clef2025-0/shared/plantclef/data/genai/02_gbif/"
)

# Prepare a list to store data
data = []

# Loop through all JSON files in the directory
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        filepath = os.path.join(json_dir, filename)
        with open(filepath, "r") as f:
            content = json.load(f)
            for species, countries in content.items():
                data.append({"species": species, "countries": countries})

# Convert to pandas DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,species,countries
0,Omalotheca supina (L.) DC.,"[Germany, Sweden, United Kingdom of Great Brit..."
1,"Myriolimon ferulaceum (L.) Lledó, Erben & M.B....","[Spain, Portugal, France]"
2,Androsace chamaejasme Wulfen,"[Germany, Slovakia, Kazakhstan, Liechtenstein,..."
3,Euphorbia margalidiana Kuhbier & Lewej.,"[Spain, Unknown, Algeria]"
4,Carthamus arborescens L.,"[Spain, Morocco, Gibraltar]"


In [2]:
len(df)

7806

In [3]:
france_df = df[df["countries"].apply(lambda x: "France" in x)]
num_species_france = france_df.shape[0]
print(f"Number of species with occurrences in France: {num_species_france}")

Number of species with occurrences in France: 5185
