In [1]:
import subprocess
import psycopg
import torch
import h5py
import gc
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm
from psycopg.rows import dict_row
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader, Dataset

In [3]:
conn = psycopg.connect(
    host="localhost",
    port=5432,
    dbname="postgres",
    user="postgres",
    password="0909231769"
)

In [4]:
def max_index() -> int:
    with conn.cursor(row_factory=dict_row) as cur:
        cur.execute("SELECT COUNT(*) AS total_rows FROM raw_image")
        result = cur.fetchone()
    return result['total_rows']

def get_row(index: int) -> dict | None:
    with conn.cursor(row_factory=dict_row) as cur: 
        cur.execute("SELECT * FROM raw_image WHERE idx = %s", (index,))
        return cur.fetchone()
    
def get_batch_rows(start_idx: int, end_idx: int) -> list[dict]:
    with conn.cursor(row_factory=dict_row) as cur:
        cur.execute("""
            SELECT idx, imageid, subset, originalurl 
            FROM raw_image 
            WHERE idx >= %s AND idx < %s 
            ORDER BY idx
        """, (start_idx, end_idx))
        return cur.fetchall()
    
def merge_HDF5_files(input_list, output_file):
    """
    Merges data (URLs and embeddings) from multiple HDF5 files into a single output file.
    """
    if not input_list:
        print("❌ Error: Input file list is empty.")
        return

    total_records = 0

    # 1. Initialize Output Structure based on the first valid file
    first_file = None
    # Find the first existing file to determine the required structure (dtype, shape).
    for f_path in input_list:
        if os.path.exists(f_path):
            first_file = f_path
            break

    if not first_file:
        print("❌ Error: No valid input files found.")
        return

    with h5py.File(first_file, 'r') as f_first:
        # Get embedding dimension (shape[1]) and data types (dtype) for initialization.
        # np.squeeze is used to handle potential extra dimensions (e.g., shape (N, 1, D) -> (N, D)).
        embed_shape = np.squeeze(f_first['embeddings'][:]).shape[1]
        embed_dtype = f_first['embeddings'].dtype
        url_dtype = f_first['urls'].dtype

    # Create the output file and initialize extendable datasets.
    with h5py.File(output_file, 'w') as f_output:
        # Initialize 'urls' dataset with zero length, maxshape=(None,) allows extension.
        f_output.create_dataset(
            'urls',
            shape=(0,),
            maxshape=(None,),
            dtype=url_dtype,
            chunks=True
        )
        # Initialize 'embeddings' dataset with zero length, maxshape=(None, embed_shape) allows extension.
        f_output.create_dataset(
            'embeddings',
            shape=(0, embed_shape),
            maxshape=(None, embed_shape),
            dtype=embed_dtype,
            chunks=True
        )

    pbar = tqdm(total = len(input_list), desc="Merging")

    # 2. Loop through input files and append data
    # Open the output file in append mode ('a') for modification.
    with h5py.File(output_file, 'a') as f_output:
        for file_path in input_list:
            if not os.path.exists(file_path):
                print(f"⚠️ File not found: {file_path}. Skipping.")
                pbar.update(1)
                continue

            try:
                with h5py.File(file_path, 'r') as f_input:
                    # Read data from the current input file.
                    current_urls = f_input['urls'][:]
                    current_embeddings = f_input['embeddings'][:]
                    current_embeddings = np.squeeze(current_embeddings)
                    num_records = current_urls.shape[0]

                    if num_records == 0:
                        continue

                    dset_urls = f_output['urls']
                    dset_embeddings = f_output['embeddings']

                    new_size = total_records + num_records

                    # Resize the datasets in the output file to accommodate new records.
                    dset_urls.resize(new_size, axis=0)
                    dset_embeddings.resize(new_size, axis=0)

                    # Write the current file's data into the newly reserved space.
                    dset_urls[total_records:new_size] = current_urls
                    dset_embeddings[total_records:new_size] = current_embeddings

                    # Update the running total of merged records.
                    total_records = new_size

            except Exception as e:
                # Handle potential errors during file reading or resizing.
                print(f"❌ Error processing file {file_path}: {e}. Skipping this file.")

            pbar.update(1)
    pbar.close()

In [5]:
num_images = max_index()

In [6]:
start = 1
end = 10000
chunk = 1000

In [7]:
CACHE = "../.cache"
MODEL_LIST = [  "openai/clip-vit-base-patch32",
                "openai/clip-vit-base-patch16",
                "openai/clip-vit-large-patch14",
                "openai/clip-vit-large-patch14-336" ]
MODEL = MODEL_LIST[3]
NUM_WORKERS = min(16, int(chunk/200))
BATCH_SIZE = 32

In [8]:
class CLIPImageDataset(Dataset):
    def __init__(self, image_ids, processor):
        self.image_ids = image_ids      
        self.processor = processor

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        image_id = self.image_ids[idx]  
        image_path = f"{CACHE}/images/{image_id}.jpg"
        image = Image.open(image_path).convert("RGB")
        return self.processor(images=image, return_tensors="pt")

In [9]:
subprocess.run(["mkdir", "OUTPUT"])
processor = CLIPProcessor.from_pretrained(MODEL, use_fast=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CLIPModel.from_pretrained(MODEL).to(device)
model.eval()

mkdir: cannot create directory ‘OUTPUT’: File exists


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05,

In [26]:
with open(CACHE + "/list_images.txt", "w") as f:
    for i in range(start, end + 1):
        f.write(get_row(i)["subset"] + "/" + get_row(i)["imageid"] + "\n")
!python ../.cache/downloader.py ../.cache/list_images.txt --download_folder=../.cache/images --num_processes=100

Downloading images: 100%|█████████████████| 10000/10000 [02:54<00:00, 57.29it/s]


In [9]:
image_ids = [get_row(j)["imageid"] for j in range(start, start + chunk)]
print(len(image_ids))


1000


In [10]:
for i in range(start, end + 1, chunk):
    batch_rows = get_batch_rows(i, i + chunk)
    
    if not batch_rows:
        continue

    current_image_ids = [row["imageid"] for row in batch_rows]
    
    Data = CLIPImageDataset(image_ids=current_image_ids, processor=processor)
    LoadData = DataLoader(Data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

    all_image_embeddings = []

    for batch in tqdm(LoadData, desc=f"Processing images {i} to {i + chunk - 1}"):
        inputs = {k: v.squeeze(1).to(device) for k, v in batch.items()}
        with torch.no_grad():
            image_features = model.get_image_features(**inputs)
        
        image_embeddings = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
        all_image_embeddings.append(image_embeddings.cpu())

        del inputs
        del image_features
        del image_embeddings

    all_image_embeddings_tensor = torch.cat(all_image_embeddings, dim=0)
    
    del all_image_embeddings 

    output_path = f"OUTPUT/{MODEL[7:]}_Images_Embedded_{i}_to_{i + chunk - 1}.h5"

    if not os.path.exists(output_path):
        all_embeddings_numpy = all_image_embeddings_tensor.numpy()
        current_urls = [row["originalurl"] for row in batch_rows]
        with h5py.File(output_path, "w") as outfile:
            dt = h5py.string_dtype(encoding='utf-8')
            outfile.create_dataset("urls", data=current_urls, dtype=dt)
            outfile.create_dataset("embeddings", data=all_embeddings_numpy)
        
        del all_embeddings_numpy

    del all_image_embeddings_tensor
    del Data
    del LoadData
    
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
    gc.collect()

Processing images 1 to 1000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 1001 to 2000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 2001 to 3000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 3001 to 4000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 4001 to 5000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 5001 to 6000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 6001 to 7000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 7001 to 8000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 8001 to 9000:   0%|          | 0/32 [00:00<?, ?it/s]

Processing images 9001 to 10000:   0%|          | 0/32 [00:00<?, ?it/s]

In [11]:
# 1. MERGE CHUNKED HDF5 FILES
# Generate a list of all HDF5 chunk files created during the embedding process.
file_chunks = [
    f"OUTPUT/{MODEL[7:]}_Images_Embedded_{i}_to_{i + chunk - 1}.h5"
    for i in range(start, end + 1, chunk)
]

# Define the final, consolidated HDF5 file path.
file_gop_cuoi = f"OUTPUT/{MODEL[7:]}_Images_Embedded_{start}_to_{end}.h5"

# Call the function to merge all chunk files into the single final file.
merge_HDF5_files(file_chunks, file_gop_cuoi)

# 2. CLEANUP: REMOVE TEMPORARY CHUNKS
# Iterate through the indices used to generate the chunk files.
for chunk_path in file_chunks:
    try:
        # Delete the individual temporary HDF5 chunk file to free up disk space.
        os.remove(chunk_path)
    except FileNotFoundError:
        # Handle case where the file might have been skipped or already deleted.
        print(f"⚠️ Warning: Chunk file not found during cleanup: {chunk_path}")

Merging:   0%|          | 0/10 [00:00<?, ?it/s]

In [11]:
r = h5py.File(file_gop_cuoi, "r")
print(r["embeddings"].shape)
print(r["urls"].shape)
r.close()

(10000, 512)
(10000,)
