In [None]:
import numpy as np
import os
from functools import partial
from transformers import CLIPModel, CLIPImageProcessor
from compute_pretrained_embeddings import get_embeddings
from dataloader import TarImageDataset, custom_collate_fn
from torch.utils.data import DataLoader
from my_utils import load_config

config = load_config("semdedup_configs.yaml")

model_name = "openai/clip-vit-base-patch32"
tar_files_directory = os.path.abspath("data/raw")
batch_size = 32

model = CLIPModel.from_pretrained(model_name)
image_processor = CLIPImageProcessor.from_pretrained(model_name)
my_collate_fn = partial(custom_collate_fn, image_processor=image_processor)

dataset = TarImageDataset(tar_dir=tar_files_directory, transform=None)
dataloader = DataLoader(
    dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=my_collate_fn,
    num_workers=0)

paths_str_type = 'U256' #'S256'
embed_float_type = 'float32' #'S256'
emb_memory_loc = config['embs_memory_loc']
paths_memory_loc = config['path_memory_loc']
dataset_size = dataset.__len__()
emb_size = config['emd_size']

emb_array = np.memmap(
    emb_memory_loc, 
    dtype=embed_float_type, 
    mode='w+', 
    shape=(dataset_size, emb_size)
)

path_array = np.memmap(
    paths_memory_loc, 
    dtype=paths_str_type,
    mode='w+', 
    shape=(dataset_size,)
)

In [7]:
import torch

In [None]:
!nvidia-smi

In [8]:
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version PyTorch is built with:", torch.version.cuda)
print("CUDA device count:", torch.cuda.device_count())

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version PyTorch is built with: 12.4
CUDA device count: 2


In [9]:
# -- Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# -- model
model = model.to(device)
model = model.eval()

cuda


In [10]:
with torch.no_grad():
    for data_batch, paths_batch, batch_indices in dataloader:
        if True:
            break

[`get_image_features`](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPModel.get_image_features)

In [11]:
encodings = model.get_image_features(
    pixel_values=data_batch
)

In [12]:
encodings.shape

torch.Size([32, 512])

In [13]:
from torch.nn.functional import normalize

In [14]:
embeds_normalized = normalize(encodings, dim=1)

In [16]:
path_array[batch_indices] = paths_batch

In [17]:
emb_array[batch_indices] = embeds_normalized.cpu().detach().numpy()

In [18]:
emb_array[batch_indices]

array([[-0.02233509, -0.02032149,  0.02072333, ...,  0.09887572,
        -0.00211839, -0.01152895],
       [ 0.00800907,  0.02617006,  0.00255419, ...,  0.03405451,
         0.01636192,  0.00202419],
       [-0.0636093 , -0.01677628,  0.04364581, ...,  0.07423619,
        -0.03214012,  0.00549925],
       ...,
       [ 0.02042953,  0.01727075,  0.00955408, ...,  0.05227413,
        -0.02564773,  0.02683108],
       [-0.02755074,  0.05310895,  0.00985589, ...,  0.06265323,
         0.02549751, -0.00182506],
       [ 0.01792138, -0.0007999 ,  0.02287464, ..., -0.04192063,
         0.01003846,  0.01667938]], dtype=float32)

In [22]:
path_array[batch_indices]

array(['000000000527.jpg', '000000010133.jpg', '000000000084.jpg',
       '000000011394.jpg', '000000010571.jpg', '000000011143.jpg',
       '000000010723.jpg', '000000010278.jpg', '000000010409.jpg',
       '000000010302.jpg', '000000000439.jpg', '000000000519.jpg',
       '000000010795.jpg', '000000010349.jpg', '000000010793.jpg',
       '000000011156.jpg', '000000011020.jpg', '000000000422.jpg',
       '000000000787.jpg', '000000011053.jpg', '000000011151.jpg',
       '000000010932.jpg', '000000000345.jpg', '000000011260.jpg',
       '000000010872.jpg', '000000000739.jpg', '000000010979.jpg',
       '000000000960.jpg', '000000000224.jpg', '000000010368.jpg',
       '000000000979.jpg', '000000000912.jpg'], dtype='<U256')

In [11]:
# file_path = 'data/embeddings/dist_to_cent.npy'
# file_path = '/home/fbernardi/Documents/SemDeDup/data/embeddings/kmeans_centroids.npy'
file_path = '/home/fbernardi/Documents/SemDeDup/data/embeddings/nearest_cent.npy'

print(np.load(file_path),
np.load(file_path).shape)

[273  89  93 ... 197 194  50] (1004,)


In [None]:
import numpy as np

# Replace 'data.npy' with your actual file name or path
file_path = "/home/fbernardi/Documents/SemDeDup/data/embeddings/embs.npy"
data = np.memmap(
    file_path,
    dtype='float32',
    mode='r',  # read-only
    shape=(dataset_size, emb_size)
)

print("Loaded data:", data)

NameError: name 'dataset_size' is not defined

In [29]:
data[0]

memmap([-6.36093020e-02, -1.67762786e-02,  4.36458141e-02,
        -4.82349796e-03,  3.45024988e-02, -2.32574344e-02,
        -2.49317451e-03, -6.22026809e-02, -1.26413144e-02,
        -7.81361386e-03,  3.32660377e-02,  5.08501194e-03,
         1.42847765e-02, -2.54854485e-02,  2.80646281e-03,
         4.18728478e-02,  5.67851216e-03,  1.23474412e-02,
         7.87839200e-03,  1.06908251e-02,  1.95562169e-02,
        -3.89889610e-04,  5.73137477e-02,  7.79694831e-03,
        -2.02623080e-03,  4.92877839e-03,  1.46089764e-02,
        -4.46669571e-02, -9.41669755e-03,  2.94540729e-02,
         2.64614485e-02,  8.21091235e-03,  6.22427277e-03,
         2.71132472e-03, -6.71255589e-02,  3.24139148e-02,
        -2.22830027e-02, -3.43820415e-02, -1.95267890e-02,
         1.44049540e-01, -1.66624319e-02, -4.63205576e-02,
        -2.10069097e-03, -4.46671844e-02,  6.81137433e-03,
        -2.41442427e-01, -2.01218706e-02,  5.18792309e-02,
        -3.40216747e-03,  4.13236283e-02,  1.87341105e-0

In [24]:
batch_indices

[180,
 525,
 0,
 996,
 670,
 862,
 725,
 577,
 634,
 578,
 184,
 190,
 739,
 586,
 750,
 954,
 814,
 140,
 267,
 852,
 883,
 826,
 88,
 943,
 777,
 273,
 949,
 321,
 83,
 589,
 361,
 319]

In [31]:
file_path = "/home/fbernardi/Documents/SemDeDup/data/embeddings/path.npy"
paths_str_type = 'U256'

paths = np.memmap(
    file_path,
    dtype=paths_str_type,
    mode='r',  # read-only
    shape=(dataset_size,)
)

In [34]:
paths.shape

(1004,)

In [35]:
path_array[batch_indices]

array(['000000000527.jpg', '000000010133.jpg', '000000000084.jpg',
       '000000011394.jpg', '000000010571.jpg', '000000011143.jpg',
       '000000010723.jpg', '000000010278.jpg', '000000010409.jpg',
       '000000010302.jpg', '000000000439.jpg', '000000000519.jpg',
       '000000010795.jpg', '000000010349.jpg', '000000010793.jpg',
       '000000011156.jpg', '000000011020.jpg', '000000000422.jpg',
       '000000000787.jpg', '000000011053.jpg', '000000011151.jpg',
       '000000010932.jpg', '000000000345.jpg', '000000011260.jpg',
       '000000010872.jpg', '000000000739.jpg', '000000010979.jpg',
       '000000000960.jpg', '000000000224.jpg', '000000010368.jpg',
       '000000000979.jpg', '000000000912.jpg'], dtype='<U256')