In [41]:
import subprocess
import psycopg
import torch
import h5py
import gc
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from psycopg.rows import dict_row
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader, Dataset

In [42]:
conn = psycopg.connect(
    host="localhost",
    port=5432,
    dbname="postgres",
    user="postgres",
    password="0909231769"
)

In [43]:
def max_index() -> int:
    with conn.cursor(row_factory=dict_row) as cur:
        cur.execute("SELECT COUNT(*) AS total_rows FROM raw_image")
        result = cur.fetchone()
    return result['total_rows']

def get_row(index: int) -> dict | None:
    with conn.cursor(row_factory=dict_row) as cur: 
        cur.execute("SELECT * FROM raw_image WHERE idx = %s", (index,))
        return cur.fetchone()
    
def get_batch_rows(start_idx: int, end_idx: int) -> list[dict]:
    with conn.cursor(row_factory=dict_row) as cur:
        cur.execute("""
            SELECT idx, imageid, subset, originalurl 
            FROM raw_image 
            WHERE idx >= %s AND idx < %s 
            ORDER BY idx
        """, (start_idx, end_idx))
        return cur.fetchall()
    
def merge_HDF5_files(input_list, output_file):
    if not input_list:
        print("❌ Error: Input file list is empty.")
        return
    total_records = 0
    first_file = None
    for f_path in input_list:
        if os.path.exists(f_path):
            first_file = f_path
            break
    if not first_file:
        print("❌ Error: No valid input files found.")
        return
    with h5py.File(first_file, 'r') as f_first:
        embed_shape = np.squeeze(f_first['embeddings'][:]).shape[1]
        embed_dtype = f_first['embeddings'].dtype
        url_dtype = f_first['urls'].dtype
    with h5py.File(output_file, 'w') as f_output:
        f_output.create_dataset(
            'urls',
            shape=(0,),
            maxshape=(None,),
            dtype=url_dtype,
            chunks=True
        )
        f_output.create_dataset(
            'embeddings',
            shape=(0, embed_shape),
            maxshape=(None, embed_shape),
            dtype=embed_dtype,
            chunks=True
        )
    pbar = tqdm(total = len(input_list), desc="Merging")
    with h5py.File(output_file, 'a') as f_output:
        for file_path in input_list:
            if not os.path.exists(file_path):
                print(f"⚠️ File not found: {file_path}. Skipping.")
                pbar.update(1)
                continue
            try:
                with h5py.File(file_path, 'r') as f_input:
                    current_urls = f_input['urls'][:]
                    current_embeddings = f_input['embeddings'][:]
                    current_embeddings = np.squeeze(current_embeddings)
                    num_records = current_urls.shape[0]
                    if num_records == 0:
                        continue
                    dset_urls = f_output['urls']
                    dset_embeddings = f_output['embeddings']
                    new_size = total_records + num_records
                    dset_urls.resize(new_size, axis=0)
                    dset_embeddings.resize(new_size, axis=0)
                    dset_urls[total_records:new_size] = current_urls
                    dset_embeddings[total_records:new_size] = current_embeddings
                    total_records = new_size
            except Exception as e:
                print(f"❌ Error processing file {file_path}: {e}. Skipping this file.")
            pbar.update(1)
    pbar.close()

In [None]:
start = 600001
end = 700000
chunk = 10000
end = min(end, max_index())

In [45]:
MODEL_LIST = [  "openai/clip-vit-base-patch32", 
                "openai/clip-vit-base-patch16",
                "openai/clip-vit-large-patch14",
                "openai/clip-vit-large-patch14-336" ]
MODEL = MODEL_LIST[0]
CACHE = "../.cache"
OUTPUT = f"../.cache/{MODEL[7:]}/image_embeddings"
NUM_WORKERS = min(16, int(chunk/200))
BATCH_SIZE = 32

In [46]:
class CLIPImageDataset(Dataset):
    def __init__(self, image_ids, processor):
        self.image_ids = image_ids      
        self.processor = processor

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]  
        image_path = f"{CACHE}/images/{image_id}.jpg"
        image = Image.open(image_path).convert("RGB")
        return self.processor(images=image, return_tensors="pt")

In [47]:
subprocess.run(["mkdir", f"../.cache/{MODEL[7:]}"])
subprocess.run(["mkdir", OUTPUT])
processor = CLIPProcessor.from_pretrained(MODEL, use_fast=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained(MODEL).to(device)
model.eval()

mkdir: cannot create directory ‘../.cache/clip-vit-base-patch32’: File exists
mkdir: cannot create directory ‘../.cache/clip-vit-base-patch32/image_embeddings’: File exists


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [48]:
with open(CACHE + "/list_images.txt", "w") as f:
    for i in range(start, end + 1):
        f.write(get_row(i)["subset"] + "/" + get_row(i)["imageid"] + "\n")
!python ../.cache/downloader.py ../.cache/list_images.txt --download_folder=../.cache/images --num_processes=100

Downloading images: 100%|███████████████| 100000/100000 [22:39<00:00, 73.56it/s]


In [49]:
for i in range(start, end + 1, chunk):
    batch_rows = get_batch_rows(i, i + chunk)
    if not batch_rows:
        continue
    current_image_ids = [row["imageid"] for row in batch_rows]
    Data = CLIPImageDataset(image_ids=current_image_ids, processor=processor)
    LoadData = DataLoader(Data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)
    all_image_embeddings = []
    for batch in tqdm(LoadData, desc=f"Processing images {i} to {i + chunk - 1}"):
        inputs = {k: v.squeeze(1).to(device) for k, v in batch.items()}
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        image_embeddings = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        all_image_embeddings.append(image_embeddings.cpu())
        del inputs
        del image_features
        del image_embeddings
    all_image_embeddings_tensor = torch.cat(all_image_embeddings, dim=0)
    del all_image_embeddings 
    output_path = OUTPUT + f"/{MODEL[7:]}_Images_Embedded_{i}_to_{i + chunk - 1}.h5"
    if not os.path.exists(output_path):
        all_embeddings_numpy = all_image_embeddings_tensor.numpy()
        current_urls = [row["originalurl"] for row in batch_rows]
        with h5py.File(output_path, "w") as outfile:
            dt = h5py.string_dtype(encoding='utf-8')
            outfile.create_dataset("urls", data=current_urls, dtype=dt)
            outfile.create_dataset("embeddings", data=all_embeddings_numpy)
        del all_embeddings_numpy
    del all_image_embeddings_tensor
    del Data
    del LoadData
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

Processing images 500001 to 510000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 510001 to 520000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 520001 to 530000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 530001 to 540000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 540001 to 550000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 550001 to 560000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 560001 to 570000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 570001 to 580000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 580001 to 590000:   0%|          | 0/313 [00:00<?, ?it/s]

Processing images 590001 to 600000:   0%|          | 0/313 [00:00<?, ?it/s]

In [50]:
file_chunks = [
    OUTPUT + f"/{MODEL[7:]}_Images_Embedded_{i}_to_{i + chunk - 1}.h5"
    for i in range(start, end + 1, chunk)
]
file_gop_cuoi = OUTPUT + f"/{MODEL[7:]}_Images_Embedded_{start}_to_{end}.h5"
merge_HDF5_files(file_chunks, file_gop_cuoi)
for chunk_path in file_chunks:
    try:
        os.remove(chunk_path)
    except FileNotFoundError:
        print(f"⚠️ Warning: Chunk file not found during cleanup: {chunk_path}")

Merging:   0%|          | 0/10 [00:00<?, ?it/s]