In [9]:
# Standard library imports
import os

# Third-party imports
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
from torch.nn.functional import softmax

# Transformers imports
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    CLIPProcessor,
    CLIPModel
)

# ChromaDB imports
import chromadb
from chromadb.utils import embedding_functions

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

## Sentiment Analysis w/ BERT
- We can find models on [HuggingFace](https://huggingface.co/models?pipeline_tag=text-classification&sort=trending) that can be used for sentiment analysis!
- Let's try using a version of BERT (encoder-only trasnformer) that has been fine-tuned for sentiment analysis (positive vs. negative sentiment)

In [2]:
# Load pre-trained DistilBERT model and tokenizer
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

def get_sentiment(text):
    # Tokenize and prepare input
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs) # Get model output
    
    # Apply softmax to get probabilities of each class (0: negative, 1: positive)
    probs = softmax(outputs.logits, dim=-1)
    # Get predicted class (0: negative, 1: positive)
    predicted_class = torch.argmax(probs, dim=-1).item()
    
    # Get confidence score
    confidence = probs[0][predicted_class].item()
    
    return "Positive" if predicted_class == 1 else "Negative", confidence

In [3]:
# Example usage
text = "This Seattle-Style Chicken Teriyaki Is Delicious Over Rice"
sentiment, confidence = get_sentiment(text)
print(f"Sentiment: {sentiment}, Confidence: {confidence:.2f}")

Sentiment: Positive, Confidence: 1.00


## Embeddings + Semantic Search w/ CLIP

- Embeddings are how represent our input (text, images, etc.) numerically as a vector

- We can use many types of models (transformers like BERT or GPT, feedforward neural networks, etc.) to generate embeddings.
    - For this example, we'll take take embeddings from a pre-trained model called [CLIP](https://openai.com/index/clip/)
        - CLIP embeds text and images in the same vector space (related text + images will be close to each other)

- We'll store our embedded vectors in a Vector Database ([ChromaDB](https://trychroma.com/))
    - We can then use ChromaDB to do a "semantic search" - find images/text that are close in **meaning** to our search query

In [4]:
def setup_clip():
    """Initialize the smallest CLIP model available"""
    model_name = "openai/clip-vit-base-patch32"  # Smallest CLIP model
    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    return model, processor

def get_clip_embeddings(image_path, text, model, processor):
    """Get CLIP embeddings for both image and text"""
    # Process image
    image = Image.open(image_path)
    image_inputs = processor(images=image, return_tensors="pt")
    image_features = model.get_image_features(**image_inputs)
    image_embedding = image_features.detach().numpy()[0]
    
    # Process text
    text_inputs = processor(text=text, return_tensors="pt", padding=True)
    text_features = model.get_text_features(**text_inputs)
    text_embedding = text_features.detach().numpy()[0]
    
    return image_embedding, text_embedding

In [5]:
# Load data
image_dir = './data' # image data location

# Setup CLIP
model, processor = setup_clip()

# Initialize ChromaDB
client = chromadb.Client()
collection = client.create_collection(
    name="image_text_collection",
    metadata={"hnsw:space": "cosine"} # use cosine similarity
)

In [6]:
def process_images(image_dir, model, processor):
    """Process all JPG images in directory"""
    image_embeddings = []
    image_ids = []
    image_metadatas = []
    
    for idx, image_file in enumerate(tqdm(os.listdir(image_dir), desc="Processing images")):
        if not image_file.lower().endswith('.jpg'):
            continue
            
        image_path = os.path.join(image_dir, image_file)
        image = Image.open(image_path)
        image_inputs = processor(images=image, return_tensors="pt")
        image_features = model.get_image_features(**image_inputs)
        image_embedding = image_features.detach().numpy()[0]
        
        image_embeddings.append(image_embedding)
        image_ids.append(f"img_{idx}")
        image_metadatas.append({"type": "image", "path": image_path})
        
    return image_embeddings, image_ids, image_metadatas

In [7]:
# Process images
image_embeddings, image_ids, image_metadatas = process_images('./data', model, processor)

# Combine and add to ChromaDB
collection.add(
    embeddings=image_embeddings,
    ids=image_ids,
    metadatas=image_metadatas
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing images: 100%|██████████| 7/7 [00:00<00:00, 16.01it/s]


In [8]:
# Example queries
queries = [
    "An Airbus A350",
    "MS Dhoni",
    "Michael Schumacher",
    "Indian cricket team",
    "Airport",
    "Formula 1",
    "Singapore Air"
]

# Process queries
for query in queries:
    # Get query embedding
    text_inputs = processor(text=[query], return_tensors="pt", padding=True)
    query_features = model.get_text_features(**text_inputs)
    query_embedding = query_features.detach().numpy()[0]
    
    # Search in ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=1,
        include=["metadatas", "distances"]
    )
    
    print(f"\nResults for query: {query}")
    for idx, (id, metadata, distance) in enumerate(zip(
        results['ids'][0], 
        results['metadatas'][0], 
        results['distances'][0]
    )):
        similarity = 1 - distance
        print(f"similarity: {similarity:.3f}\n\t {id}: {metadata}")



Results for query: An Airbus A350
similarity: 0.290
	 img_1: {'path': './data/4.jpg', 'type': 'image'}

Results for query: MS Dhoni
similarity: 0.291
	 img_6: {'path': './data/0.JPG', 'type': 'image'}

Results for query: Michael Schumacher
similarity: 0.325
	 img_3: {'path': './data/2.jpg', 'type': 'image'}

Results for query: Indian cricket team
similarity: 0.272
	 img_6: {'path': './data/0.JPG', 'type': 'image'}

Results for query: Airport
similarity: 0.251
	 img_1: {'path': './data/4.jpg', 'type': 'image'}

Results for query: Formula 1
similarity: 0.284
	 img_3: {'path': './data/2.jpg', 'type': 'image'}

Results for query: Singapore Air
similarity: 0.307
	 img_1: {'path': './data/4.jpg', 'type': 'image'}
