In [None]:
!pip install --upgrade pip
!pip install ftfy regex tqdm
!pip install sentencepiece
!pip install -U transformers

# Import

In [None]:
import os
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch.nn as nn
from concurrent.futures import ThreadPoolExecutor
import torch_xla # TPU
import torch_xla.core.xla_model as xm

In [None]:
# Directory that holds the database images to embed.
image_dir = '/kaggle/input/eventa-img-cieldt/database_images_compressed90'

# All models run on the XLA device. No CUDA/CPU selection is made here:
# any such value would be overwritten by this assignment anyway.
device = xm.xla_device()

In [None]:
# Gather every image file in `image_dir` (extension matched case-insensitively)
# in sorted order, so the rows of the embedding matrices follow a reproducible
# file order.
filenames = sorted(
    entry
    for entry in os.listdir(image_dir)
    if entry.lower().endswith((".jpg", ".png", ".jpeg"))
)
print('done preparing images path')

# CLIP

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load the CLIP preprocessor and weights from the same checkpoint, move the
# model to the selected device and switch it to evaluation mode.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
model = model.to(device)
model.eval();

In [None]:
import time
# Fresh state for the CLIP pass. Ids and embeddings are appended together by
# the extraction loop, so index k refers to the same image in both lists.
batch_size = 64  # number of images sent through the model per step
all_img_id, image_embeddings = [], []
print('Extracting image features...')

def load_and_preprocess(filename):
    """Open one image from ``image_dir`` and decode it as RGB.

    Args:
        filename: Name of the image file, relative to ``image_dir``.

    Returns:
        ``(filename, image)`` on success, where ``image`` is the RGB-converted
        PIL image; ``(None, None)`` if the file cannot be opened or decoded,
        so that callers can filter failed loads out of a batch.
    """
    try:
        image_path = os.path.join(image_dir, filename)
        # The context manager closes the underlying file once the pixels are
        # decoded; convert() hands back a separate image object that is
        # returned to the caller.
        with Image.open(image_path) as raw_image:
            return filename, raw_image.convert("RGB")
    except Exception as exc:
        # Best-effort loading: keep the run going, but report which file was
        # skipped instead of dropping it silently.
        print(f"Warning: could not load {filename!r}: {exc!r}")
        return None, None

total_images_processed = 0
start_time = time.time()

# Index range of `filenames` to embed (here: the whole list).
start = 0
end = len(filenames)

# Each batch is decoded in parallel by the thread pool, then embedded with a
# single forward pass of the model.
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    for i in tqdm(range(start, end, batch_size), desc="Processing Batches"):
        batch_filenames = filenames[i: min(i + batch_size, end)]

        results = list(executor.map(load_and_preprocess, batch_filenames))

        # Failed loads are returned as (None, None) and dropped here.
        valid_results = [(fn, img) for fn, img in results if img is not None]
        if not valid_results:
            print(f"Skipping batch {i}-{i+batch_size} due to no valid images.")
            continue

        fnames = [fn for fn, _ in valid_results]
        images_list = [img for _, img in valid_results]

        image_tensors = processor(images=images_list, return_tensors="pt")
        image_tensors = {k: v.to(device) for k, v in image_tensors.items()}

        with torch.no_grad():
            embeddings = model.get_image_features(**image_tensors).cpu().numpy()

        # Append ids and embeddings together so both lists stay aligned.
        for fname, embedding in zip(fnames, embeddings):
            img_id = os.path.splitext(fname)[0]  # file name without extension
            all_img_id.append(img_id)
            image_embeddings.append(embedding)

        total_images_processed += len(fnames)

end_time = time.time()
print(f"\nTổng thời gian chạy: {end_time - start_time:.2f} giây")
# Save the results. The file name matches the "database_clip.json" id file
# (it was previously misspelled as "databse_clip.npy").
embeddings_np = np.array(image_embeddings)
np.save("database_clip.npy", embeddings_np)
print('Done.')

In [None]:
# Write the collected image ids (file names without extension), in the order
# they were appended by the extraction loop, as a JSON list.
with open("database_clip.json", "w") as id_file:
    id_file.write(json.dumps(all_img_id, indent=2))

In [None]:
# Drop the references to the CLIP model and processor before the next model
# is loaded, so that the objects can be reclaimed.
del model, processor

# SigLIP

In [None]:
from transformers import pipeline
from transformers import AutoProcessor, AutoModelForZeroShotImageClassification

# Load the SigLIP preprocessor and weights from the same checkpoint, place the
# model on the selected device and switch it to evaluation mode.
processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
model = AutoModelForZeroShotImageClassification.from_pretrained(
    "google/siglip-so400m-patch14-384"
).to(device)
model.eval();

In [None]:
import time
# Reset the accumulators for the SigLIP pass; the extraction loop fills both
# lists in the same order.
image_embeddings = []
all_img_id = []
batch_size = 64  # number of images sent through the model per step
print('Extracting image features...')

def load_and_preprocess(filename):
    """Open one image from ``image_dir`` and decode it as RGB.

    Args:
        filename: Name of the image file, relative to ``image_dir``.

    Returns:
        ``(filename, image)`` on success, where ``image`` is the RGB-converted
        PIL image; ``(None, None)`` if the file cannot be opened or decoded,
        so that callers can filter failed loads out of a batch.
    """
    try:
        image_path = os.path.join(image_dir, filename)
        # Close the file handle as soon as decoding is done; the converted
        # image returned to the caller is a separate object.
        with Image.open(image_path) as raw_image:
            return filename, raw_image.convert("RGB")
    except Exception as exc:
        # Best-effort loading: report the skipped file rather than hiding it.
        print(f"Warning: could not load {filename!r}: {exc!r}")
        return None, None

total_images_processed = 0
start_time = time.time()

# Embed the whole file list.
start, end = 0, len(filenames)

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    for i in tqdm(range(start, end, batch_size), desc="Processing Batches"):
        batch_filenames = filenames[i: min(i + batch_size, end)]

        # Decode the batch in parallel; failed loads arrive as (None, None)
        # and are filtered out.
        loaded = executor.map(load_and_preprocess, batch_filenames)
        valid_results = [(fn, img) for fn, img in loaded if img is not None]
        if len(valid_results) == 0:
            print(f"Skipping batch {i}-{i+batch_size} due to no valid images.")
            continue

        # Split the (name, image) pairs into two parallel lists.
        fnames, images_list = (list(column) for column in zip(*valid_results))

        image_tensors = {
            key: tensor.to(device)
            for key, tensor in processor(images=images_list, return_tensors="pt").items()
        }

        with torch.no_grad():
            embeddings = model.get_image_features(**image_tensors).cpu().numpy()

        # Ids are the file names without extension; both lists grow in the
        # same order so they stay aligned.
        all_img_id.extend(os.path.splitext(fname)[0] for fname in fnames)
        image_embeddings.extend(embeddings)

        total_images_processed += len(fnames)

end_time = time.time()
print(f"\nTổng thời gian chạy: {end_time - start_time:.2f} giây")
# Stack the per-image embeddings into one array and save it.
# NOTE(review): the images are read from the database directory, yet the
# output is named "private_test_*" — confirm the intended file name.
embeddings_np = np.array(image_embeddings)
np.save("private_test_siglip.npy", embeddings_np)
print('Done.')

In [None]:
# Drop the references to the SigLIP processor and model before the next model
# is loaded, so that the objects can be reclaimed.
del processor, model

# DINO V2

In [None]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image

# Load the DINOv2 image processor and weights from the same checkpoint; the
# model is moved to the selected device and put in evaluation mode.
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-giant')
model = AutoModel.from_pretrained('facebook/dinov2-giant').to(device)
model.eval();

In [None]:
import time
# Reset the accumulators for the DINOv2 pass; ids and embeddings are appended
# in the same order by the extraction loop.
batch_size = 64  # number of images sent through the model per step
image_embeddings, all_img_id = [], []
print('Extracting image features...')

def load_and_preprocess(filename):
    """Open one image from ``image_dir`` and decode it as RGB.

    Args:
        filename: Name of the image file, relative to ``image_dir``.

    Returns:
        ``(filename, image)`` on success, where ``image`` is the RGB-converted
        PIL image; ``(None, None)`` if the file cannot be opened or decoded,
        so that callers can filter failed loads out of a batch.
    """
    try:
        image_path = os.path.join(image_dir, filename)
        # Release the file handle right after decoding; the converted image
        # handed to the caller is a separate object.
        with Image.open(image_path) as raw_image:
            return filename, raw_image.convert("RGB")
    except Exception as exc:
        # Best-effort loading: report the skipped file rather than hiding it.
        print(f"Warning: could not load {filename!r}: {exc!r}")
        return None, None

total_images_processed = 0
start_time = time.time()

# Embed the whole file list.
start, end = 0, len(filenames)

with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    for i in tqdm(range(start, end, batch_size), desc="Processing Batches"):
        batch_filenames = filenames[i: min(i + batch_size, end)]

        # Decode the batch in parallel and keep only successful loads
        # (failures are returned as (None, None)).
        valid_results = [
            pair
            for pair in executor.map(load_and_preprocess, batch_filenames)
            if pair[1] is not None
        ]
        if len(valid_results) == 0:
            print(f"Skipping batch {i}-{i+batch_size} due to no valid images.")
            continue

        fnames = [pair[0] for pair in valid_results]
        images_list = [pair[1] for pair in valid_results]

        batch_inputs = processor(images=images_list, return_tensors="pt")
        image_tensors = {key: tensor.to(device) for key, tensor in batch_inputs.items()}

        with torch.no_grad():
            output = model(**image_tensors)
            # Use the token at position 0 of the last hidden state as the
            # image embedding (presumably the [CLS] token — confirm).
            embeddings = output.last_hidden_state[:, 0, :].cpu().numpy()

        # Ids are the file names without extension; both lists grow in the
        # same order so they stay aligned.
        all_img_id.extend(os.path.splitext(fname)[0] for fname in fnames)
        image_embeddings.extend(embeddings)

        total_images_processed += len(fnames)

end_time = time.time()
print(f"\nTổng thời gian chạy: {end_time - start_time:.2f} giây")
# Stack the per-image embeddings into one array and save it.
# NOTE(review): the images are read from the database directory, yet the
# output is named "private_test_*" — confirm the intended file name.
embeddings_np = np.array(image_embeddings)
np.save("private_test_dino.npy", embeddings_np)
print('Done .')