# Setup

In [1]:
import os
import json
import sys
sys.path.append('..')

import numpy as np

import open_clip
import torch

sys.path.append('../satclip')
sys.path.append('../satclip/satclip')
import satclip
from satclip.load import get_satclip
from huggingface_hub import hf_hub_download

from utils import get_species_names, format_species_name_CLIP, get_species_embeddings, read_csv_non_utf, count_parameters

In [2]:
# Loading in general configuration
with open('../config.json', 'r') as f:
    config = json.load(f)

# Getting filepaths
gdrive_fp = config['gdrive_path']
LIFE_fp = config['LIFE_folder']
dataset_fp = config['datasets_path']

# Grabbing Benitez-Lopez
benitez_lopez2019 = config['indiv_data_paths']['benitez_lopez2019']
ben_lop_path = os.path.join(gdrive_fp, LIFE_fp, dataset_fp, benitez_lopez2019)
ben_lop2019 = read_csv_non_utf(ben_lop_path)

# Trying out BioCLIP and thinking about integration
- Relevant pages for `pytaxize` (to get taxonomic and common names)
   - [classifier class - get hierarchy from ID](https://sckott.github.io/pytaxize/modules/classification.html)
   - [taxonomic identifier class - get taxonomic ID from scientific name](https://sckott.github.io/pytaxize/modules/ids.html)
   - [`taxize` package documentation in R - original package](https://docs.ropensci.org/taxize/articles/taxize.html)
- Relevant pages for BioCLIP
   - [`open_clip` package documentation - base package](https://pypi.org/project/open-clip-torch/)
   - [BioCLIP model page on HuggingFace](https://huggingface.co/imageomics/bioclip) 

In [3]:
# Testing out on a toy dataset of scientific names
sci_names = ['Loxodonta africana', 'Odocoileus virginianus', 'Pandinus imperator']

# Reading in the pre-trained BioCLIP model
model, _, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')

# Extracting the relevant info from ITIS
full_names = []
for name in sci_names:
    full_names.append(get_species_names(name))

In [4]:
# Processing with BioCLIP
species_embeddings = get_species_embeddings(full_names, model, tokenizer, full_hierarchy = True, common_name = False)

In [5]:
# Checking out the embeddings and the name strings that were processed
for k, v in species_embeddings.items():
    print(f'Species {k} has embedding of shape {v['embedding'].shape}')
    print(v['names_used'])
    print()

Species Loxodonta africana has embedding of shape (512,)
['a photo of Animalia Chordata Mammalia Proboscidea Elephantidae Loxodonta africana']

Species Odocoileus virginianus has embedding of shape (512,)
['a photo of Animalia Chordata Mammalia Artiodactyla Cervidae Odocoileus virginianus']

Species Pandinus imperator has embedding of shape (512,)
['a photo of Animalia Arthropoda Euchelicerata Scorpiones Scorpionidae Pandinus imperator']



In [6]:
# Seeing if elephants are more similar to deer than to scorpions as a basic sanity check
print(species_embeddings['Loxodonta africana']['embedding'].dot(species_embeddings['Odocoileus virginianus']['embedding']))
print(species_embeddings['Loxodonta africana']['embedding'].dot(species_embeddings['Pandinus imperator']['embedding']))

0.64558005
0.4450716


## Using `Pytaxize` to resolve taxonomic names

In [7]:
from itertools import chain

from pytaxize import gn, Ids, itis

In [8]:
def multi_species_extraction(species_names):
    if ', ' in species_names:
        species_names = species_names.split(',')
        species_names = [s.split('and ') for s in species_names]
        species_names = list(chain(*species_names))
    elif 'and ' in species_names:   
        species_names = species_names.split('and ')
    elif 'or ' in species_names:
        species_names = species_names.split('or ')
    else:
        return [species_names.replace('spp', '*').replace('.', '')]

    # Removing whitespace and empty strings
    species_names = [s.strip() for s in species_names]
    species_names = [s for s in species_names if s != '']

    # Special cases for fixing the binomials
    if species_names[0] == 'Sciurus spadiceus':
        species_names = ['Sciurus spadiceus', 'Sciurus sanborni']
    elif species_names[0] == 'Saguinus mystax':
        species_names = ['Saguinus mystax', 'Saguinus imperator']
    elif species_names[0] == 'Potos flavus':
        species_names = ['Potos flavus', 'Bassaricyon *']
    
    # General case
    else:
        for i in range(1, len(species_names)):
            if (species_names[i][0] == species_names[0][0]) and (species_names[i][1] == '.'):
                new_name = species_names[i].split(' ')[1]
                new_name = species_names[0].split(' ')[0] + ' ' + new_name
    
                species_names[i] = new_name

    # Removing any unnecessary periods
    species_names = [s.replace('.', '') for s in species_names]
    
    return species_names

# Grabbing all unique species
ben_lop2019_species = ben_lop2019['Species'].apply(multi_species_extraction)
ben_lop2019_species = set(chain(*list(ben_lop2019_species)))

# Resolving scientific names
full_species = [s for s in ben_lop2019_species if '*' not in s]
higher_tax_level = [s for s in ben_lop2019_species if '*' in s]

species_resolved = gn.resolve(full_species, best_match_only = True, source = [3])

In [9]:
# Getting full names - GENUS (or higher) ONLY CASE!
higher_tax_full = []

for name in higher_tax_level:
    name = name.split(' ')[0]
    print(name)

    #  special cases
    if name == 'Cebus':
        sel_id = 572816
    elif name == 'Ateles':
        sel_id = 572812
    elif name == 'Sciurus':
        sel_id = 180171
    elif name == 'Dasyprocta':
        sel_id = 584623

    #  general case
    else:
        tax_id = Ids(name)
        tax_id.itis(type = 'scientific')
        ids = tax_id.extract_ids()
        
        sel_id = int(ids[name][0]) # the first ID is the direct record, after are children...
        
    level = itis.rank_name(sel_id)['rankName'] # checking the taxonomic level
    level_to_pass = level.replace('Sub', '')
    print(level, sel_id)
    
    #  cleaning up the output to reflect the fact that we only have genus info
    tax_names = get_species_names(itis_id = sel_id, level = level_to_pass)
    higher_tax_full.append(tax_names)
    print()

Dasyprocta
Genus 584623

Aotus
Genus 572811

Ateles
Genus 572812

Proechimys
Genus 584648
ID 584648 has no common names recorded in ITIS

Sciurus
Genus 180171

Alouatta
Genus 572810

Oryzomys
Genus 180335

Genetta
Genus 621833
ID 621833 has no common names recorded in ITIS

Didelphis
Genus 179920

Sciurus
Genus 180171

Xerinae
Subfamily 930206
ID 930206 has no common names recorded in ITIS

Saimiri
Genus 180094

Cebus
Genus 572816

Dasypus
Genus 180102

Marmosa
Genus 552392

Herpestidae
Family 180611

Cephalophus
Genus 183848

Callicebus
Genus 572815
ID 572815 has no common names recorded in ITIS

Saguinus
Genus 572809

Bassaricyon
Genus 621816



In [24]:
# Getting the species ID for ITIS - FULL SPECIES CASE!
species_itis = {}
for s, s_dict in zip(full_species, species_resolved):
    if s == 'Smutsia gigantea':
        tax_id = Ids('Manis gigantea') # this is the correct entry for the giant pangolin
        tax_id.itis(type = 'scientific')
        ids = tax_id.extract_ids()
        sel_id = int(ids['Manis gigantea'][0])

        species_itis[s] = sel_id
    else:
        s_dict = s_dict[0]
        species_itis[s] = int(s_dict['current_taxon_id']) if 'current_taxon_id' in s_dict.keys() else int(s_dict['taxon_id'])

# A proof-of-concept for querying ITIS for full taxonomic hierachy using our existing function
full_names = []
i = 0
for species, itis_id in species_itis.items():
    
    if i > 5:
        break

    print(species, itis_id)
    print()
        
    full_names.append(get_species_names(species, itis_id))
    i += 1

Myoprocta pratti 825306

Alouatta sara 572941

Cerdocyon thous 183825

Dasyprocta leporina 584731

Cephalophus nigrifrons 625169

Hystrix brachyura 584683



# Trying out SatCLIP for location embeddings

It's a little unclear if we want to use the model with $L=50$ or $L=10$; as mentioned in the paper, the latter is better for large-scale patterns and spatial generalization, while the former is better at capturing fine-grained patterns.

In [6]:
# Loading a pre-trained SatCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"

#  this only loads location encoder by default
model = get_satclip(
    hf_hub_download("microsoft/SatCLIP-ResNet50-L40", "satclip-resnet50-l40.ckpt"),
    device = device,
)
model.eval();

using pretrained moco resnet50


In [7]:
# Checking out the size of the model - seems relatively small!
count_parameters(model)

1213696

In [8]:
# Loading Benitez-Lopez and extracting coordinates - inputs are (longitude, latitude)
coords = torch.from_numpy(ben_lop2019[['X', 'Y']].values).to(device)
coords.shape

torch.Size([3281, 2])

In [9]:
# Processing using the pre-trained location embedder from SatCLIP
with torch.no_grad():
    coord_emb = model(coords).detach().cpu() # these don't seem to be normalized

coord_emb.shape # embedding shape is 256

torch.Size([3281, 256])