In [63]:
!pip install transformers huggingface-hub cloudinary pillow requests h5py tqdm numpy torch open-clip-torch



In [1]:
import argparse
import os
import sys
from pathlib import Path
from io import BytesIO
import json

import numpy as np
import hnswlib
import requests
import time
import h5py
from PIL import Image, ImageFile  # ImageFile is imported as well, for the setting on the next line
ImageFile.LOAD_TRUNCATED_IMAGES = True  # Allow PIL to read slightly corrupted (truncated) images
from tqdm import tqdm

# Check the heavy third-party dependencies in one place so that a missing
# package produces a single actionable message with the full install command.
try:
    import torch
    import open_clip
    import cloudinary
    import cloudinary.api
    import cloudinary.uploader
    from transformers import AutoModel, AutoProcessor
except ImportError as e:
    print(f"‚ùå Missing dependency: {e}")
    print("\nPlease install required packages:")
    print("  conda activate hnsw-backend-venv")
    print("  pip install transformers huggingface-hub cloudinary pillow requests h5py tqdm numpy torch open-clip-torch")
    sys.exit(1)

In [2]:
def load_biomedclip(model_name='hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224', device=None):
    """Load a BiomedCLIP checkpoint through open_clip and smoke-test it.

    Args:
        model_name: open_clip model identifier, passed to both
            ``open_clip.create_model_and_transforms`` and
            ``open_clip.get_tokenizer``. Defaults to the BiomedCLIP checkpoint
            this notebook was written for.
        device: Device string to place the model on. When ``None`` (default),
            ``"cuda"`` is used if ``torch.cuda.is_available()``, else ``"cpu"``.

    Returns:
        tuple: ``(model, preprocess_val, tokenizer, device)`` where
        ``preprocess_val`` is the evaluation-time image transform returned by
        open_clip and ``device`` is the device string the model was moved to.
    """
    print("\nü§ñ Loading BiomedCLIP model...")
    print(f"   Model: {model_name}")
    print("   This may take a few minutes on first run (downloads ~2GB)\n")

    # open_clip also returns a training transform; only the evaluation
    # transform is needed for inference, so the former is discarded.
    model, _preprocess_train, preprocess_val = open_clip.create_model_and_transforms(model_name)
    tokenizer = open_clip.get_tokenizer(model_name)

    # Move to GPU if available (unless the caller chose a device explicitly)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()  # Set to evaluation mode

    # Smoke test: encode one short phrase. This runs before the status lines
    # are printed so the reported embedding dimension is the measured one.
    test_text = "femur fracture"
    text_tokens = tokenizer([test_text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_tokens)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    print("‚úÖ Model loaded successfully!")
    print(f"   Device: {device}")
    print(f"   Embedding dimension: {text_features.shape[-1]}")

    print(f"\nüß™ Test encoding: '{test_text}'")
    print(f"   Output shape: {text_features.shape}")
    print("   ‚úÖ Model is working correctly!")

    return model, preprocess_val, tokenizer, device

def encode_image(image_path, model, preprocess, device):
    """Return the L2-normalised embedding of one image, or None on failure.

    The image at ``image_path`` is opened with PIL, converted to RGB, run
    through ``preprocess`` and encoded with ``model.encode_image`` on
    ``device``. Any exception is reported on stdout and turned into a ``None``
    return value, so a batch loop can skip unreadable files.

    Returns:
        A float32 numpy array holding the first (only) row of the normalised
        feature batch, or ``None`` if anything went wrong.
    """
    try:
        rgb_image = Image.open(image_path).convert('RGB')

        # Add a leading batch axis before moving the tensor to the device.
        batch = preprocess(rgb_image).unsqueeze(0).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            features = model.encode_image(batch)
            unit_features = features / features.norm(dim=-1, keepdim=True)

        as_float32 = unit_features.cpu().numpy().astype(np.float32)
        return as_float32[0]
    except Exception as e:
        print(f"\n‚ùå Failed to process {image_path}: {e}")
        return None
    
def get_text_embedding(text_input, model, tokenizer, device):
    """
    Convert a text (or a list of texts) into L2-normalised embedding vectors.

    Args:
        text_input: A single string (str) or a list of strings (list of str).
        model: The loaded BiomedCLIP model (must provide ``encode_text``).
        tokenizer: The BiomedCLIP tokenizer.
        device: 'cuda' or 'cpu'.

    Returns:
        numpy.ndarray: float32 embedding array with one row per input text
        (shape [n, 512] for the BiomedCLIP checkpoint used in this notebook).
    """

    # 1. Input handling: always hand the tokenizer a list, even for one string.
    if isinstance(text_input, str):
        text_input = [text_input]

    # 2. Tokenize: turn the texts into token IDs.
    # (Original author's note: this BiomedCLIP tokenizer's context length is 256.)
    tokens = tokenizer(text_input).to(device)

    # 3. Inference: run the text tower without tracking gradients.
    with torch.no_grad():
        text_features = model.encode_text(tokens)

        # 4. Normalisation (IMPORTANT): divide by the L2 norm so every vector
        # has unit length; cosine similarity is wrong without this step.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # 5. Convert to a numpy array for HNSW / database use. The cast to float32
    # matches encode_image, so text and image vectors share one dtype.
    return text_features.cpu().numpy().astype(np.float32)

In [3]:
# Load the model once for the whole notebook. `processor` receives the second
# value returned by load_biomedclip, i.e. the evaluation-time image transform.
model, processor, tokenizer, device = load_biomedclip()


ü§ñ Loading BiomedCLIP model...
   Model: hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224
   This may take a few minutes on first run (downloads ~2GB)

‚úÖ Model loaded successfully!
   Device: cuda
   Embedding dimension: 512

üß™ Test encoding: 'femur fracture'
   Output shape: torch.Size([1, 512])
   ‚úÖ Model is working correctly!


In [4]:
# Load the image records; later cells read each entry's 'url' and 'local_path'.
# The context manager closes the file handle, and the encoding is explicit
# instead of depending on the platform locale.
with open('cloudinary_urls.json', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

In [5]:
# Sanity check: number of records loaded from cloudinary_urls.json.
print(len(dataset))

3366


In [6]:
urls = []
embedding_rows = []

# Encode every image. A URL is kept only when its image was encoded, so row k
# of the embedding matrix always corresponds to urls[k].
for record in tqdm(dataset):
    embedding = encode_image(record['local_path'], model, processor, device)
    if embedding is not None:
        urls.append(record['url'])
        embedding_rows.append(embedding)

if not embedding_rows:
    # Without this guard, the norm below fails with an opaque axis error.
    raise RuntimeError("No image could be encoded; check the 'local_path' entries in the dataset.")

# Stack into one (n_images, dim) float32 matrix and re-normalise each row.
# The clip avoids a division by zero if a row were all zeros.
all_embeddings = np.stack(embedding_rows).astype(np.float32)
norms = np.linalg.norm(all_embeddings, axis=1, keepdims=True)
all_embeddings_numpy = all_embeddings / np.clip(norms, 1e-12, None)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3366/3366 [01:00<00:00, 55.85it/s]


In [7]:
output_path = "../backend/temp/Medical_Embedded.h5"

# Store the image URLs (as fixed-width byte strings) together with their
# embedding matrix in one HDF5 file; "w" mode overwrites any previous file.
with h5py.File(output_path, mode="w") as h5_out:
    h5_out.create_dataset("urls", data=np.array(urls, dtype='S'))
    h5_out.create_dataset("embeddings", data=all_embeddings_numpy)

In [9]:
output_bin_path = "../backend/temp/Medical_Embedded.bin"

# Read the embeddings fully into memory and close the HDF5 file right away,
# instead of leaving an open handle and passing a lazy h5py dataset to hnswlib.
with h5py.File(output_path, "r") as data:
    index_vectors = data["embeddings"][:]

# Take the dimension from the data and make sure the capacity is never smaller
# than the number of vectors (10000 remains the minimum capacity).
p = hnswlib.Index(space='cosine', dim=index_vectors.shape[1])
p.init_index(max_elements=max(10000, len(index_vectors)), ef_construction=400, M=200)
# No explicit ids are passed, so labels follow row order (0..n-1), which is
# the order of the "urls" dataset in the same HDF5 file.
p.add_items(index_vectors)
p.set_ef(400)


In [10]:
# Write the HNSW index to the path held in output_bin_path.
p.save_index(output_bin_path)

In [11]:
# Text-to-image retrieval check: embed the query text, then ask the index for
# its 10 nearest neighbours. knn_query returns a (labels, distances) pair.
query_embedding = get_text_embedding("arm", model, tokenizer, device)
xyz = p.knn_query(query_embedding, k=10)
print(xyz)

(array([[1147, 3107,   54, 1993, 1793, 1741,  885, 2999, 3269,  616]],
      dtype=uint64), array([[0.5979632 , 0.60067916, 0.6017135 , 0.6060069 , 0.6070913 ,
        0.60986036, 0.61100316, 0.6114818 , 0.61228967, 0.61520696]],
      dtype=float32))


In [12]:
# Labels returned by knn_query are row positions in `urls` (the list stored
# with the embeddings), which omits images that failed to encode. Look the hit
# up there, not in the unfiltered `dataset`, whose indices can be shifted.
print(urls[2999])

https://res.cloudinary.com/dp4qen6gz/image/upload/v1765208122/medical/fractures_2/IMG0002064.jpg
