# GBIF analysis
Reference: pygbif documentation — https://pygbif.readthedocs.io/en/latest/index.html

In [5]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pandas as pd


# read CSV species metadata
def get_data(
    data_path="~/p-dsgt_clef2025-0/shared/plantclef/data",
    file_name="species_metadata.csv",
):
    """Load a species metadata CSV into a DataFrame.

    Parameters
    ----------
    data_path : str
        Directory that holds the CSV file. Defaults to the shared
        plantclef data folder used throughout this notebook.
    file_name : str
        Name of the CSV file inside ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``data_path/file_name``.
    """
    # Keep the directory and the full file path under separate names rather
    # than overwriting `data_path` with the joined value.
    csv_path = f"{data_path}/{file_name}"
    return pd.read_csv(csv_path)


# Load the species metadata and preview its first ten rows.
df = get_data()
df.head(n=10)

Unnamed: 0,species_id,species,genus,family
0,1355868,Lactuca virosa L.,Lactuca,Asteraceae
1,1355869,Crepis capillaris (L.) Wallr.,Crepis,Asteraceae
2,1355870,Crepis foetida L.,Crepis,Asteraceae
3,1355871,Hypochaeris glabra L.,Hypochaeris,Asteraceae
4,1355872,Hypochaeris radicata L.,Hypochaeris,Asteraceae
5,1355873,Arctotis venusta Norl.,Arctotis,Asteraceae
6,1355880,Carduus macrocephalus Desf.,Carduus,Asteraceae
7,1355881,Carduus tenuiflorus Curtis,Carduus,Asteraceae
8,1355882,Cynara cardunculus L.,Cynara,Asteraceae
9,1355884,Centaurea calcitrapa L.,Centaurea,Asteraceae


In [7]:
from pygbif import species
from pygbif import occurrences as occ

# Resolve the first five species names to their GBIF backbone taxon keys.
df = get_data()
species_list = df["species"].tolist()[:5]
keys = [species.name_backbone(name)["usageKey"] for name in species_list]
# GBIF serves at most 300 occurrence records per request; larger limits are
# silently reduced, so request the real page size explicitly.
responses = [occ.search(taxonKey=key, limit=300) for key in keys]
# Peek at the first record of the first species to see the available fields.
responses[0]["results"][0]

{'key': 5067755450,
 'datasetKey': '7ebef267-9d72-4c21-a276-cc84281a8590',
 'publishingOrgKey': 'd9bea9d3-13a5-4768-bbf4-560b9aa95a73',
 'installationKey': '19893c10-381e-4534-9bb8-6c37d03ad29e',
 'hostingOrganizationKey': '3c5e4331-7f2f-4a8d-aa56-81ece7014fc8',
 'publishingCountry': 'AU',
 'protocol': 'DWC_ARCHIVE',
 'lastCrawled': '2025-04-09T08:03:29.967+00:00',
 'lastParsed': '2025-04-09T08:16:30.151+00:00',
 'crawlId': 192,
 'extensions': {'http://rs.gbif.org/terms/1.0/Multimedia': [{'http://purl.org/dc/terms/format': 'image/jpeg',
    'http://rs.tdwg.org/dwc/terms/occurrenceID': '1f1c6291-b80b-4c09-91e2-527f61e83e2b',
    'http://purl.org/dc/terms/identifier': 'https://images.ala.org.au/image/proxyImageThumbnailLarge?imageId=ce0cb3c6-3f05-47c9-b571-6547c05d4b01'},
   {'http://purl.org/dc/terms/format': 'image/jpeg',
    'http://rs.tdwg.org/dwc/terms/occurrenceID': '1f1c6291-b80b-4c09-91e2-527f61e83e2b',
    'http://purl.org/dc/terms/identifier': 'https://images.ala.org.au/image/p

In [None]:
from collections import Counter

# GBIF's occurrence search returns at most 300 records per request, whatever
# `limit` is sent, so larger samples have to be paged through with `offset`.
GBIF_PAGE_SIZE = 300


def count_occurrence_countries(species_name, max_records=GBIF_PAGE_SIZE):
    """Tally the countries of a species' GBIF occurrence records.

    Parameters
    ----------
    species_name : str
        Scientific name to look up in the GBIF backbone.
    max_records : int
        Upper bound on the number of occurrence records to sample. The
        default is a single page; raise it to page through more records.

    Returns
    -------
    dict
        Country name -> number of sampled records, in first-seen order.
        Records without a ``country`` field are counted as ``"Unknown"``.
        Empty when the name has no backbone match.
    """
    match = species.name_backbone(species_name)
    taxon_key = match.get("usageKey")
    if taxon_key is None:
        # No backbone match for this name, so there is nothing to search for.
        return {}

    country_counts = Counter()
    fetched = 0
    while fetched < max_records:
        page = occ.search(
            taxonKey=taxon_key,
            limit=min(GBIF_PAGE_SIZE, max_records - fetched),
            offset=fetched,
        )
        results = page.get("results", [])
        country_counts.update(
            result.get("country", "Unknown") for result in results
        )
        fetched += len(results)
        # Stop on the last page, or on an empty page to avoid looping forever.
        if not results or page.get("endOfRecords", True):
            break
    return dict(country_counts)


df = get_data()
species_list = df["species"].tolist()[:5]

# The counts describe a sample of at most `max_records` records per species,
# not the species' full occurrence history.
species_data = {
    species_name: count_occurrence_countries(species_name)
    for species_name in species_list[:2]
}

species_data

{'Lactuca virosa L.': {'Australia': 10,
  'United States of America': 48,
  'Spain': 1,
  'New Zealand': 16,
  'France': 56,
  'United Kingdom of Great Britain and Northern Ireland': 26,
  'Portugal': 5,
  'Netherlands': 129,
  'Switzerland': 5,
  'Germany': 2,
  'Greece': 1,
  'Italy': 1},
 'Crepis capillaris (L.) Wallr.': {'United Kingdom of Great Britain and Northern Ireland': 5,
  'Luxembourg': 1,
  'New Zealand': 44,
  'United States of America': 5,
  'Australia': 57,
  'Chile': 1,
  'Spain': 4,
  'Argentina': 4,
  'Netherlands': 168,
  'Switzerland': 6,
  'Germany': 1,
  'France': 1,
  'Mexico': 1,
  'Denmark': 1,
  'Jersey': 1}}

In [None]:
import os
import json

# Directory where the per-species country JSON files are stored
json_dir = os.path.expanduser(
    "~/p-dsgt_clef2025-0/shared/plantclef/data/genai/02_gbif/countries"
)

# One record per (species, countries) entry found in the JSON files
data = []

# Sort the listing: os.listdir() returns names in arbitrary order, which would
# make the row order of the resulting frame vary between runs.
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):
        continue
    filepath = os.path.join(json_dir, filename)
    # Species names contain non-ASCII characters, so pin the encoding.
    with open(filepath, "r", encoding="utf-8") as f:
        content = json.load(f)
    # The loop variable must not be called `species`: that name is bound to
    # the pygbif `species` module that other cells call.
    for species_name, country_list in content.items():
        data.append({"species": species_name, "countries": country_list})

# Convert to pandas DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,species,countries
0,Omalotheca supina (L.) DC.,"[Germany, Sweden, United Kingdom of Great Brit..."
1,"Myriolimon ferulaceum (L.) Lledó, Erben & M.B....","[Spain, Portugal, France]"
2,Androsace chamaejasme Wulfen,"[Germany, Slovakia, Kazakhstan, Liechtenstein,..."
3,Euphorbia margalidiana Kuhbier & Lewej.,"[Spain, Unknown, Algeria]"
4,Carthamus arborescens L.,"[Spain, Morocco, Gibraltar]"


In [2]:
# Number of rows in the frame.
df.shape[0]

7806

In [3]:
# Flag the rows whose `countries` entry contains "France", then keep only those.
is_in_france = df["countries"].apply(lambda country_list: "France" in country_list)
france_df = df[is_in_france]
num_species_france = len(france_df)
print(f"Number of species with occurrences in France: {num_species_france}")

Number of species with occurrences in France: 5185


### occ.count_countries()

In [None]:
# occ.count_countries() is called with a publishing country only; no taxon key
# is passed to it, so the result is not species-specific. It is therefore
# requested once, without a per-species loop or backbone lookup.
response = occ.count_countries(publishingCountry="FR")
print(response)

{'FRANCE': 194749853, 'GERMANY': 1855783, 'ITALY': 1123703, 'SPAIN': 1022432, 'UNITED_STATES': 940999, 'RÉUNION': 725010, 'UNITED_KINGDOM': 712293, 'SWITZERLAND': 589942, 'NETHERLANDS': 568703, 'NEW_CALEDONIA': 498123, 'FRENCH_GUIANA': 414252, 'CZECH_REPUBLIC': 406939, 'AUSTRIA': 379481, 'BELGIUM': 363438, 'GUADELOUPE': 356756, 'BRAZIL': 321389, 'MADAGASCAR': 253247, 'CANADA': 188565, 'FRENCH_POLYNESIA': 184303, 'POLAND': 179892, 'MARTINIQUE': 139060, 'CÔTE_DIVOIRE': 124122, 'RUSSIAN_FEDERATION': 119758, 'SLOVAKIA': 109268, 'PORTUGAL': 104004, 'SENEGAL': 100711, 'HUNGARY': 89043, 'GREECE': 88463, 'SWEDEN': 87754, 'UNKNOWN': 85929, 'INDIA': 84853, 'FRENCH_SOUTHERN_TERRITORIES': 81827, 'CAMEROON': 79498, 'FINLAND': 78240, 'BURKINA_FASO': 72584, 'AUSTRALIA': 68474, 'VIETNAM': 68466, 'MAYOTTE': 65849, 'MEXICO': 65782, 'PAPUA_NEW_GUINEA': 62424, 'CHINA': 61047, 'CROATIA': 59690, 'SOUTH_AFRICA': 58770, 'DENMARK': 58343, 'TURKEY': 57522, 'NORWAY': 54300, 'ALGERIA': 52989, 'NIGER': 50569, 'MOR