In [1]:
import sys
sys.path.append('..')

import open_clip
import torch

from utils import get_species_names, format_species_name_CLIP, get_species_embeddings

# Trying out BioCLIP and thinking about integration
- Relevant pages for `pytaxize` (to get taxonomic and common names)
   - [classifier class - get hierarchy from ID](https://sckott.github.io/pytaxize/modules/classification.html)
   - [taxonomic identifier class - get taxonomic ID from scientific name](https://sckott.github.io/pytaxize/modules/ids.html)
   - [`taxize` documentation - the original R package that `pytaxize` ports](https://docs.ropensci.org/taxize/articles/taxize.html)
- Relevant pages for BioCLIP
   - [`open_clip` package documentation - base package](https://pypi.org/project/open-clip-torch/)
   - [BioCLIP model page on HuggingFace](https://huggingface.co/imageomics/bioclip) 

In [2]:
# Toy dataset of scientific names used to exercise the pipeline end to end
sci_names = ['Loxodonta africana', 'Odocoileus virginianus', 'Pandinus imperator']

# Load the pre-trained BioCLIP model and its matching tokenizer from the HuggingFace hub
model, _, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
# open_clip returns the model in train mode by default; switch to eval mode
# because this notebook only runs inference
model.eval()
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')

# Extract the relevant info from ITIS, one lookup per scientific name
full_names = [get_species_names(name) for name in sci_names]

In [12]:
# Run the looked-up names through BioCLIP, requesting the full taxonomic
# hierarchy in the text and leaving the common name out
species_embeddings = get_species_embeddings(
    full_names,
    model,
    tokenizer,
    full_hierarchy=True,
    common_name=False,
)

In [13]:
# Show each embedding's shape together with the name strings that were processed
for species, info in species_embeddings.items():
    # The outer f-string uses double quotes so the single-quoted key inside the
    # replacement field also parses on Python versions older than 3.12
    print(f"Species {species} has embedding of shape {info['embedding'].shape}")
    print(info['names_used'])
    print()

Species Loxodonta africana has embedding of shape (512,)
['a photo of Animalia Chordata Mammalia Proboscidea Elephantidae Loxodonta africana']

Species Odocoileus virginianus has embedding of shape (512,)
['a photo of Animalia Chordata Mammalia Artiodactyla Cervidae Odocoileus virginianus']

Species Pandinus imperator has embedding of shape (512,)
['a photo of Animalia Arthropoda Euchelicerata Scorpiones Scorpionidae Pandinus imperator']



In [14]:
# Basic sanity check: the elephant should score as more similar to the deer
# than to the scorpion
# NOTE(review): a raw dot product equals cosine similarity only if the
# embeddings are unit-norm — confirm in get_species_embeddings
elephant_vec = species_embeddings['Loxodonta africana']['embedding']
deer_vec = species_embeddings['Odocoileus virginianus']['embedding']
scorpion_vec = species_embeddings['Pandinus imperator']['embedding']

print(elephant_vec.dot(deer_vec))
print(elephant_vec.dot(scorpion_vec))

0.64558005
0.4450716
