In [1]:
import open_clip
import torch

from pytaxize import Ids
from pytaxize import itis

# Trying out BioCLIP and thinking about integration
- Relevant pages for `pytaxize` (to get taxonomic and common names)
   - [classifier class - get hierarchy from ID](https://sckott.github.io/pytaxize/modules/classification.html)
   - [taxonomic identifier class - get taxonomic ID from scientific name](https://sckott.github.io/pytaxize/modules/ids.html)
   - [`taxize` package documentation in R - original package](https://docs.ropensci.org/taxize/articles/taxize.html)
- Relevant pages for BioCLIP
   - [`open_clip` package documentation - base package](https://pypi.org/project/open-clip-torch/)
   - [BioCLIP model page on HuggingFace](https://huggingface.co/imageomics/bioclip) 

In [2]:
# Load the pre-trained BioCLIP model and its matching tokenizer from the HuggingFace Hub.
# The hub identifier is kept in one place so the model and tokenizer cannot drift apart.
BIOCLIP_HUB_ID = 'hf-hub:imageomics/bioclip'

# The second return value (the training-time image transform) is discarded:
# nothing is trained in this notebook, only the validation transform is kept.
model, _, preprocess_val = open_clip.create_model_and_transforms(BIOCLIP_HUB_ID)

# open_clip hands the model back in train mode by default; switch to eval mode so
# that train-only layers cannot affect the embeddings computed below.
model.eval()

tokenizer = open_clip.get_tokenizer(BIOCLIP_HUB_ID)

In [5]:
# Embed a handful of species using nothing but their scientific names
# (no taxonomic hierarchy, no common names).
sci_names = ['Loxodonta africana', 'Odocoileus virginianus', 'Pandinus imperator']
sci_names_clip = [f'a photo of {species}' for species in sci_names]
text = tokenizer(sci_names_clip)

# Inference only, so gradient tracking is switched off. Each embedding is then
# divided by its own norm along the last (feature) axis.
with torch.no_grad():
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# One row per prompt
text_features.shape

torch.Size([3, 512])

In [108]:
def get_species_name_CLIP(scientific_name, full_hierarchy = True, common_name = False):
    """
    Build BioCLIP text prompt(s) for a taxon in one of the supported name formats.

    Every prompt starts with 'a photo of'. Depending on the flags it continues with
    the bare scientific name, the ITIS taxonomic hierarchy, and/or an English common
    name. Unless both flags are False, the taxon is looked up in ITIS through `pytaxize`.

    Parameters
    ----------
    scientific_name : str
        Scientific name of the taxon, e.g. 'Loxodonta africana'. Used both as the
        ITIS search term and verbatim in prompts that do not use the hierarchy.
    full_hierarchy : bool, default True
        If True, the prompt contains the ITIS taxon names for the ranks Kingdom,
        Phylum, Class, Order, Family and Species (in the order ITIS returns them)
        instead of the bare scientific name.
    common_name : bool, default False
        If True, ' with common name <name>' is appended and one prompt is produced
        per distinct (lower-cased) English common name recorded in ITIS.

    Returns
    -------
    str or list of str
        A single prompt string when `common_name` is False. Otherwise a list of
        prompts, one per English common name, sorted alphabetically by common name.
        The list is empty when ITIS has common names but none of them is English.

    Raises
    ------
    AssertionError
        If `common_name` is True and ITIS has no common names at all for the taxon.
    """

    prefix = 'a photo of'

    # Nothing to look up: the prompt is just the prefix plus the name as given.
    if not (full_hierarchy or common_name):
        return prefix + ' ' + scientific_name

    ranks_to_include = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Species']

    # Resolve the scientific name to an ITIS taxonomic ID.
    tax_id = Ids(scientific_name)
    tax_id.itis(type = 'scientific')
    ids = tax_id.extract_ids()
    # Take the first match; it seems like subspecies entries generally come after
    # plain species, so the first ID is assumed to be the species itself.
    sel_id = int(ids[scientific_name][0])

    name_str = prefix
    if full_hierarchy:
        # NOTE(review): if the ITIS full hierarchy also lists child taxa, a query
        # above species level could pull in every child species here - confirm.
        tax_hier = itis.hierarchy_full(sel_id, as_dataframe = True)
        in_ranks = tax_hier['rankName'].isin(ranks_to_include)
        name_str += ' ' + ' '.join(tax_hier.loc[in_ranks, 'taxonName'].values)

    if not common_name:
        return name_str

    common = itis.common_names(sel_id)
    assert len(common) > 0, f'{scientific_name} has no common names recorded in ITIS'

    # Lower-case before de-duplicating so differently-capitalised duplicates collapse;
    # sort so the returned prompts come back in a reproducible order across runs.
    com_names = sorted({d['commonName'].lower() for d in common if d['language'] == 'English'})

    # Without the hierarchy the scientific name has not been added to the prompt yet.
    if not full_hierarchy:
        name_str += ' ' + scientific_name

    return [name_str + ' with common name ' + name for name in com_names]

In [109]:
# Demo: taxonomic-hierarchy prompt for the first species in the list, without common names
get_species_name_CLIP(
    sci_names[0],
    full_hierarchy=True,
    common_name=False,
)

'a photo of Animalia Chordata Mammalia Proboscidea Elephantidae Loxodonta africana'