In [1]:
from pygbif import occurrences
from pygbif import species
import zipfile
import pandas as pd
import os
import time

In [None]:
# Load the plant list (semicolon-separated CSV). The loop below reads its
# "Latin name" column and fills in GBIF keys, our own ID and the taxonomy.
plants = pd.read_csv('plant_names.csv', sep=';')

taxonomy_cols = ['kingdom', 'phylum', 'class', 'order',
                 'family', 'genus', 'species']

# Create the key, ID and taxonomy columns if they are missing.
for col in ["speciesKey", "taxonKey", "ourID"] + taxonomy_cols:
    if col not in plants.columns:
        plants[col] = None

# A text column that exists in the CSV but is completely empty is read as
# float64 (all NaN); switch it to object dtype so strings can be stored in it.
for col in ["ourID"] + taxonomy_cols:
    if plants[col].isna().all():
        plants[col] = plants[col].astype(object)


for idx, row in plants.iterrows():
    # pd.isna() catches both None (column created above) and NaN (empty cell
    # read from the CSV); a plain `== None` comparison misses the NaN case.
    if pd.isna(row["ourID"]):
        # idx + 1 is used as the running number (idx is the row label from read_csv).
        plants.at[idx, "ourID"] = "OUR"+str(idx+1).zfill(3)
    if pd.isna(row["speciesKey"]) or pd.isna(row["taxonKey"]):
        try:
            # Look the name up in the GBIF backbone
            match = species.name_backbone(name=row["Latin name"], rank="species")
            taxonKey = match.get("usageKey")          # key of the matched name
            speciesKey = match.get("speciesKey") or taxonKey  # fall back to taxonKey

            if taxonKey is None:
                # The lookup succeeded but returned no usable key; say so
                # instead of silently leaving the row empty.
                print(f'Could not match {row["Latin name"]}: no usageKey in GBIF response')

            plants.at[idx, "taxonKey"] = taxonKey
            plants.at[idx, "speciesKey"] = speciesKey

            # ---- Taxonomy ----
            for col in taxonomy_cols:
                plants.at[idx, col] = match.get(col)

        except Exception as e:
            print(f'Could not match {row["Latin name"]}: {e}')



# Writing the enriched table back to disk is deliberately disabled;
# change the condition to True to persist the looked-up keys.
if False:
    plants.to_csv("plant_names.csv",sep=';', index = False)


In [None]:
country_code = "DK"
start_year = 2023
# GBIF credentials are read from environment variables (set these yourself).
user = os.getenv("GBIF_USER")
email = os.getenv("GBIF_EMAIL")
pwd = os.getenv("GBIF_PWD")


# Build a clean list of taxon keys: rows without a key (None/NaN) are dropped,
# float-typed keys (e.g. 123.0 after a CSV round trip with gaps) are turned
# back into integers, and duplicates are removed. Otherwise the predicate
# would contain values such as "None", "nan" or "123.0".
taxonKeys = (
    pd.to_numeric(plants["taxonKey"], errors="coerce")
    .dropna()
    .astype("int64")
    .drop_duplicates()
    .tolist()
)
if not taxonKeys:
    raise ValueError("No valid taxonKey values found in plants; cannot build the GBIF query.")

# --- JSON query with several predicates, all of which must hold ---
query = {
    "type": "and",
    "predicates": [
        {
            "type": "in",
            "key": "TAXON_KEY",
            "values": [str(tk) for tk in taxonKeys]
        },
        {
            "type": "equals",
            "key": "COUNTRY",
            "value": country_code
        },
        {
            "type": "greaterThanOrEquals",
            "key": "YEAR",
            "value": start_year
        },
        { "type": "in",
          "key": "BASIS_OF_RECORD",
          "values": ["OBSERVATION", "MACHINE_OBSERVATION", "HUMAN_OBSERVATION"]
        }
    ]
}

# --- Create the download ---
# NOTE: every run of this cell submits a new download request to GBIF.
download_key, _ = occurrences.download(query, user=user, email=email, pwd=pwd)

print("Download key:", download_key)

In [5]:
# GBIF download key
# 20/11 2025 at 19:00: key = 0001019-251120083545085
# 20/11 2025 at 21:00: key = 0001207-251120083545085
# This pins the analysis to a specific download and overrides any
# download_key assigned earlier in the notebook.
download_key = "0001207-251120083545085"

zip_path = f"{download_key}.zip"

# Fetch the archive only when it is not already on disk, so the cell works on
# a fresh checkout without re-downloading on every run.
if os.path.exists(zip_path):
    print(f"Using existing {zip_path}")
else:
    occurrences.download_get(download_key, path=".")
    print(f"Downloaded {zip_path}")

with zipfile.ZipFile(zip_path, 'r') as z:
    # Take the first file in the zip (typically the occurrence file)
    occurrence_file = z.namelist()[0]
    print("open file:", occurrence_file)

    with z.open(occurrence_file) as f:
        # Try tab-separated first. A wrong delimiter usually does not raise
        # but yields a single-column frame, so treat that as a failure too
        # and retry with ',' as the separator.
        try:
            df = pd.read_csv(f, sep='\t', low_memory=False)
            retry_with_comma = df.shape[1] == 1
        except pd.errors.ParserError:
            retry_with_comma = True

        if retry_with_comma:
            f.seek(0)
            df = pd.read_csv(f, sep=',', low_memory=False)


Downloaded 0001207-251120083545085.zip
open file: 0001207-251120083545085.csv


In [6]:
#Adding name to datasets
from pygbif import registry 


def _dataset_title(key):
    """Return the title of the GBIF dataset `key`, or a placeholder on failure.

    Any error during the registry lookup (e.g. an invalid key) is reported
    on stdout and the placeholder "Unknown Dataset" is returned instead.
    """
    try:
        # Fetch the dataset metadata and pick out its title
        return registry.datasets(uuid=key)['title']
    except Exception as e:
        print(f"Fejl ved hentning af {key}: {e}")
        return "Unknown Dataset"


# One registry lookup per distinct dataset key in the occurrence table
keys = df["datasetKey"].unique()
key_to_name_map = {key: _dataset_title(key) for key in keys}

# Create the new column by mapping each dataset key to its title
df['datasetName'] = df['datasetKey'].map(key_to_name_map)



In [7]:
print("number of rows:", df.shape[0])

# Licences. NB! "CC_BY_NC_4_0" and "CC_BY_NC_SA_4_0" cannot be used commercially.
# TODO: clarify whether a study project counts as commercial use.
allowed = ["CC0", "CC_BY_4_0", "CC_BY_SA_4_0"]
has_allowed_license = df['license'].isin(allowed)
df = df.loc[has_allowed_license]

print("number of rows:", df.shape[0])

number of rows: 113510
number of rows: 98904


In [None]:
# Map each occurrence to our own plant ID via its GBIF speciesKey.
# Plants that were never matched have no speciesKey; leaving them in would put
# a NaN/None key into the lookup, which could attach an ourID to occurrences
# whose own speciesKey is missing. Drop those rows before building the lookup.
matched_plants = plants.dropna(subset=["speciesKey", "ourID"])
mapping = dict(zip(matched_plants["speciesKey"], matched_plants["ourID"]))

# assign() returns a new frame, so this also works when df is a filtered
# selection of a larger frame (no SettingWithCopyWarning).
df = df.assign(ourID=df["speciesKey"].map(mapping))

In [None]:

# Columns carried over from the occurrence table into the final dataset
observation_columns = [
    'ourID', 'speciesKey', 'taxonKey',
    'eventDate', 'year', 'month', 'day',
    'decimalLatitude', 'decimalLongitude',
    'coordinateUncertaintyInMeters',
    'basisOfRecord', 'datasetKey',
    'datasetName', 'recordedBy',
    'individualCount', 'occurrenceID', 'gbifID', 'license',
]
observations = df[observation_columns]

# The full occurrence table is no longer needed; drop the name.
# NOTE: after this, cells that use df cannot be re-run without reloading it.
del df

In [None]:
# Bundle the plant table and the filtered observations under fixed keys
data_to_save = dict(plants=plants, observations=observations)

# Save to pickle
pd.to_pickle(data_to_save, 'GBIF_data.pkl')
