# BioCLIP Plant Classification (Zero-Shot Inference)

In [None]:
# Install required packages
!pip install open_clip_torch pillow pandas

In [None]:
import open_clip
import torch
from PIL import Image
import pandas as pd

# Load the BioCLIP model, its train/val image transforms and the matching
# tokenizer, all identified by the same Hugging Face hub id.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
tokenizer = open_clip.get_tokenizer('hf-hub:imageomics/bioclip')

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# This notebook only runs inference, so put the model in eval mode as well
# (chained into the assignment so the cell does not print the model repr).
model = model.to(device).eval()

In [None]:
# Load the CSV of plant names
plant_df = pd.read_csv('data/Plants_Formatted.csv', encoding='latin-1')

# Drop missing values and duplicate names. Unlike list(set(...)),
# drop_duplicates() keeps the names in CSV order, so the list (and therefore
# the column order of the text features) is the same on every run.
plant_names = plant_df['Scientific Name'].dropna().drop_duplicates().tolist()
print(len(plant_names))

# Preview only the first few names instead of dumping the whole list
plant_names[:10]

In [None]:
# Load the example image and preprocess it for the model
image_path = 'data/example_images/Adiantum-peruvianum-Silver-Dollar-Fern-Amazon-Spheres.jpg.webp'

# The context manager closes the underlying file once preprocessing is done
with Image.open(image_path) as pil_image:
    # unsqueeze(0) adds a leading batch dimension of size 1
    image = preprocess_val(pil_image).unsqueeze(0).to(device)

# Show the tensor shape rather than printing every pixel value
image.shape

In [None]:
# Turn the candidate plant names into token tensors on the model's device
text = tokenizer(plant_names).to(device)

# Encode image and text without tracking gradients
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Scale every embedding to unit L2 norm so the dot product below is a
    # cosine similarity
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Cosine similarities scaled by 100, then softmax over the plant names
    # to turn them into probabilities
    logits = 100.0 * image_features @ text_features.T
    similarity = logits.softmax(dim=-1)

# Probabilities for the single image in the batch
probs = similarity[0].cpu().numpy()

# Rank (name, probability) pairs from most to least likely and keep the top 5
ranked = list(zip(plant_names, probs))
ranked.sort(key=lambda pair: pair[1], reverse=True)
results = ranked[:5]

print("\nTop 5 predictions:")
for plant, prob in results:
    print(f"{plant}: {prob*100:.2f}%")

## Test Plant Classifier Class

In [1]:
import sys
import os
# Make user-service/src importable. The path is resolved against the current
# working directory, so the notebook is expected to be run from the directory
# that contains user-service/ (presumably the repository root — confirm).
src_dir = os.path.join(os.getcwd(), 'user-service', 'src')
if src_dir not in sys.path:  # avoid stacking duplicate entries when the cell is re-run
    sys.path.insert(0, src_dir)

from game_utils.plant_classifier import PlantClassifier

classifier = PlantClassifier() # this loads the model and encodes the plant names

# The classifier takes the raw encoded image bytes, not a decoded image
with open("data/example_images/Adiantum-peruvianum-Silver-Dollar-Fern-Amazon-Spheres.jpg.webp", "rb") as f:
    image_bytes = f.read()

result = classifier.classify_image(image_bytes)

# result is read as a mapping with the keys 'success', 'plant_name',
# 'confidence' and 'top_5' (an iterable of (name, confidence) pairs)
if result["success"]:
    print(f"\nTop prediction: {result['plant_name']}")
    print(f"Confidence: {result['confidence']*100:.2f}%")
    print("\nTop 5 predictions:")
    for plant, conf in result['top_5']:
        print(f"  {plant}: {conf*100:.2f}%")
else:
    print("Classification failed")

Loading BioCLIP model...
Using device: cpu
Loading plants from database...
Found 898 plants in database
Text features precomputed and cached!
BioCLIP model loaded! 898 plants indexed.
Classifying image...

Top prediction: Adiantum peruvianum
Confidence: 36.44%

Top 5 predictions:
  Adiantum peruvianum: 36.44%
  Adiantum ternerum 'Scutum Roseum': 29.52%
  Adiantum raddianum: 27.32%
  Epiphyllum oxypetalum: 3.72%
  Adiantum trapeziforme: 0.78%
