In [73]:
import collections
from pathlib import Path
import pickle

import IPython.display as ipd
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
from tqdm.auto import tqdm

In [2]:
# Common column layout that every location table is cast to before it is
# written out, so that tables from different sources share one schema.
schema_in = pa.schema([
    pa.field("timestamp", pa.timestamp("us")),
    pa.field("location_lat", pa.float64()),
    pa.field("location_long", pa.float64()),
    pa.field("study_id", pa.string()),
    pa.field("individual_id", pa.string()),
    pa.field("deployment_id", pa.int64()),
    pa.field("tag_id", pa.string()),
])

In [3]:
# Movebank GPS fixes: read, discard the two columns that are not part of
# schema_in, then cast the remainder to the shared schema.
table_mbk = (
    pq.read_table("../movebank/locations_gps.parquet")
    .drop(["visible", "sensor_type"])
    .cast(schema_in)
)

In [4]:
# Location table from the ungulates source; it is remapped and cast to
# schema_in in the following cells.
table_ung = pq.read_table("../ungulates/locations_simon.parquet")

In [5]:
# Non-Movebank trajectories get negative deployment IDs: the distinct
# original IDs are sorted and numbered -1, -2, -3, ... in that order.
# NOTE(review): re-running this cell on the already-remapped table would
# renumber the IDs again — reload table_ung first.
raw_depl_ids = table_ung["deployment_id"].to_numpy()

depl_ids_ung = {}
for position, raw_id in enumerate(sorted(set(raw_depl_ids))):
    depl_ids_ung[raw_id] = -1 - position

# Swap the column in place (same position, same name) for the remapped IDs.
depl_col_pos = table_ung.column_names.index("deployment_id")
table_ung = table_ung.set_column(
    depl_col_pos,
    "deployment_id",
    pa.array([depl_ids_ung[raw_id] for raw_id in raw_depl_ids]),
)

In [6]:
# Bring the ungulates table to the same schema as the Movebank table.
table_ung = table_ung.cast(schema_in)

In [7]:
# Write both sources into one dataset directory, partitioned by study.
for locations_table in (table_mbk, table_ung):
    pq.write_to_dataset(
        locations_table,
        "locations/",
        partition_cols=["study_id"],
    )

In [7]:
# Each source names its taxon column differently; reduce both to the
# columns ("id", "taxon") before stacking them.
individuals_mbk = (
    pq.read_table("../movebank/individuals.parquet")
    .to_pandas()[["id", "taxon_canonical_name"]]
    .rename(columns={"taxon_canonical_name": "taxon"})
)
individuals_ung = (
    pq.read_table("../ungulates/individuals_simon.parquet")
    .to_pandas()[["id", "species"]]
    .rename(columns={"species": "taxon"})
)

# Stack row-wise and force both columns to plain strings.
individuals_df = pd.concat(
    [individuals_mbk, individuals_ung],
    axis="index",
).astype({"id": str, "taxon": str})

In [8]:
# Taxon labels that are replaced by the name used for the GBIF lookup;
# every label not listed here is carried over unchanged.
CANONICAL_NAME_OVERRIDES = {
    "African elephant": "Loxodonta africana",
    "Blue wildebeest": "Connochaetes taurinus",
    "Plains zebra": "Equus quagga",
    "Wild boar": "Sus scrofa",
}

individuals_df["taxon_canonicalName"] = individuals_df["taxon"].replace(CANONICAL_NAME_OVERRIDES)

In [52]:
# Fields read from each GBIF result; each one becomes a "taxon_<field>" column.
GBIF_KEYS = ["vernacularName", "rank", "kingdom", "phylum", "class", "order", "family", "genus", "species"]
# Upper bound (seconds) for a single GBIF request, so that a stalled
# connection raises instead of hanging the notebook indefinitely.
GBIF_TIMEOUT_S = 30

for key in GBIF_KEYS:
    individuals_df["taxon_" + key] = ""

# One GBIF query per distinct canonical name; for every field the most
# frequent value among the ACCEPTED results is kept.
# NOTE(review): names are queried as-is, so an empty/placeholder name is
# also sent to GBIF — confirm whether such rows should be skipped.
for taxon in tqdm(set(individuals_df["taxon_canonicalName"])):
    response = requests.get(
        "https://api.gbif.org/v1/species",
        params=dict(name=taxon, limit=1000),
        timeout=GBIF_TIMEOUT_S,
    )
    # Explicit HTTP error instead of a bare assert (asserts vanish under -O).
    response.raise_for_status()
    results = response.json()["results"]

    counters = collections.defaultdict(collections.Counter)
    for result in results:
        if result.get("taxonomicStatus") != "ACCEPTED":
            continue
        for key in GBIF_KEYS:
            counters[key][result.get(key)] += 1
    # A missing vernacular name must never win the vote over a real one;
    # this also guarantees the "vernacularName" counter exists.
    counters["vernacularName"][None] = 0

    # Single .loc assignment: the original chained form
    # (df[col][mask] = value) writes to a temporary under pandas
    # Copy-on-Write and raises SettingWithCopyWarning before that.
    row_mask = individuals_df["taxon_canonicalName"] == taxon
    for key in GBIF_KEYS:
        if key in counters:
            value = max(counters[key].items(), key=lambda x: x[1])[0]
        else:
            # No ACCEPTED result at all for this name.
            value = None
        individuals_df.loc[row_mask, "taxon_" + key] = value

  0%|          | 0/295 [00:00<?, ?it/s]

In [65]:
# Fix some problematic taxons: within each of these orders, copy the one
# non-null kingdom/phylum/class value onto every row of that order.
# `.item()` raises unless exactly one distinct non-null value exists.
for order in ["Caprimulgiformes", "Testudines"]:
    row_indexer = individuals_df["taxon_order"] == order
    for col in ["taxon_kingdom", "taxon_phylum", "taxon_class"]:
        distinct_values = individuals_df.loc[row_indexer, col].dropna().unique()
        individuals_df.loc[row_indexer, col] = distinct_values.item()

In [66]:
# Rows still lacking a canonical name or a class, one example row per
# (canonical name, vernacular name) pair.
lacks_taxonomy = individuals_df[["taxon_canonicalName", "taxon_class"]].isna().any(axis="columns")
individuals_df[lacks_taxonomy].groupby(["taxon_canonicalName", "taxon_vernacularName"]).first()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,taxon,taxon_rank,taxon_kingdom,taxon_phylum,taxon_class,taxon_order,taxon_family,taxon_genus,taxon_species
taxon_canonicalName,taxon_vernacularName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,liverworts,10335869,,ORDER,Animalia,Chordata,,,,,
Animalia,Animals,180529824,Animalia,KINGDOM,Animalia,,,,,,


In [67]:
# Save the individuals table, including the taxon_* columns added above.
individuals_df.to_parquet("individuals.parquet")

In [4]:
# Merge the per-individual taxon embeddings of both sources into one pickle.
# NOTE(review): pickle.load executes arbitrary code — only use on trusted files.
with open("../movebank/individual2taxon_emb.pickle", "rb") as mbk_file:
    emb_mbk_raw = pickle.load(mbk_file)
# Movebank keys are converted to strings.
# NOTE(review): the ungulates keys are used as stored — presumably already
# strings; confirm.
ind2emb_mbk = {str(ind_id): emb for ind_id, emb in emb_mbk_raw.items()}

with open("../ungulates/individual2taxon_emb_simon.pickle", "rb") as ung_file:
    ind2emb_ung = pickle.load(ung_file)

# On a key collision the ungulates entry overrides the Movebank one.
ind2emb_all = dict(ind2emb_mbk)
ind2emb_all.update(ind2emb_ung)

with open("individual2taxon_emb.pickle", "wb") as out_file:
    pickle.dump(ind2emb_all, out_file)

In [75]:
# Show every distinct taxonomy path present in the table. Selecting no
# columns ([[]]) leaves only the group index in the displayed frame.
TAXONOMY_LEVELS = [
    "taxon_phylum",
    "taxon_class",
    "taxon_order",
    "taxon_family",
    "taxon_genus",
    "taxon_species",
    "taxon_vernacularName",
]
taxonomy_overview = individuals_df.groupby(TAXONOMY_LEVELS)[[]].first()

# Raise the row limit only for this one display call.
with pd.option_context('display.max_rows', 1000):
    ipd.display(taxonomy_overview)

taxon_phylum,taxon_class,taxon_order,taxon_family,taxon_genus,taxon_species,taxon_vernacularName
Chordata,Aves,Accipitriformes,Accipitridae,Accipiter,Accipiter brevipes,Levant Sparrowhawk
Chordata,Aves,Accipitriformes,Accipitridae,Accipiter,Accipiter striatus,Sharp-shinned Hawk
Chordata,Aves,Accipitriformes,Accipitridae,Aquila,Aquila chrysaetos,Golden Eagle
Chordata,Aves,Accipitriformes,Accipitridae,Aquila,Aquila pomarina,Lesser Spotted Eagle
Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo albigula,White-throated Hawk
Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo buteo,Common Buzzard
Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo jamaicensis,Red-tailed Hawk
Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo lagopus,Rough-legged Buzzard
Chordata,Aves,Accipitriformes,Accipitridae,Buteo,Buteo platypterus,Broad-winged Hawk
Chordata,Aves,Accipitriformes,Accipitridae,Circus,Circus aeruginosus,Western Marsh Harrier
