In [None]:
import numpy as np
import os
from functools import partial
from transformers import CLIPModel, CLIPImageProcessor
from compute_pretrained_embeddings import get_embeddings
from dataloader import TarImageDataset, custom_collate_fn
from torch.utils.data import DataLoader
from my_utils import load_config

config = load_config("semdedup_configs.yaml")

model_name = "openai/clip-vit-base-patch32"
tar_files_directory = os.path.abspath("data/raw")
batch_size = 32

model = CLIPModel.from_pretrained(model_name)
image_processor = CLIPImageProcessor.from_pretrained(model_name)
my_collate_fn = partial(custom_collate_fn, image_processor=image_processor)

dataset = TarImageDataset(tar_dir=tar_files_directory, transform=None)
dataloader = DataLoader(
    dataset, 
    batch_size=batch_size, 
    shuffle=True, 
    collate_fn=my_collate_fn,
    num_workers=0)

paths_str_type = 'U256' #'S256'
embed_float_type = 'float32' #'S256'
emb_memory_loc = config['embs_memory_loc']
paths_memory_loc = config['path_memory_loc']
dataset_size = dataset.__len__()
emb_size = config['emd_size']

emb_array = np.memmap(
    emb_memory_loc, 
    dtype=embed_float_type, 
    mode='w+', 
    shape=(dataset_size, emb_size)
)

path_array = np.memmap(
    paths_memory_loc, 
    dtype=paths_str_type,
    mode='w+', 
    shape=(dataset_size,)
)

In [7]:
import torch

In [None]:
!nvidia-smi

In [8]:
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version PyTorch is built with:", torch.version.cuda)
print("CUDA device count:", torch.cuda.device_count())

PyTorch version: 2.6.0+cu124
CUDA available: True
CUDA version PyTorch is built with: 12.4
CUDA device count: 2


In [9]:
# -- Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
# -- model
model = model.to(device)
model = model.eval()

cuda


In [10]:
with torch.no_grad():
    for data_batch, paths_batch, batch_indices in dataloader:
        if True:
            break

[`get_image_features`](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPModel.get_image_features)

In [11]:
encodings = model.get_image_features(
    pixel_values=data_batch
)

In [12]:
encodings.shape

torch.Size([32, 512])

In [13]:
from torch.nn.functional import normalize

In [14]:
embeds_normalized = normalize(encodings, dim=1)

In [16]:
path_array[batch_indices] = paths_batch

In [17]:
emb_array[batch_indices] = embeds_normalized.cpu().detach().numpy()

In [18]:
emb_array[batch_indices]

array([[-0.02233509, -0.02032149,  0.02072333, ...,  0.09887572,
        -0.00211839, -0.01152895],
       [ 0.00800907,  0.02617006,  0.00255419, ...,  0.03405451,
         0.01636192,  0.00202419],
       [-0.0636093 , -0.01677628,  0.04364581, ...,  0.07423619,
        -0.03214012,  0.00549925],
       ...,
       [ 0.02042953,  0.01727075,  0.00955408, ...,  0.05227413,
        -0.02564773,  0.02683108],
       [-0.02755074,  0.05310895,  0.00985589, ...,  0.06265323,
         0.02549751, -0.00182506],
       [ 0.01792138, -0.0007999 ,  0.02287464, ..., -0.04192063,
         0.01003846,  0.01667938]], dtype=float32)

In [22]:
path_array[batch_indices]

array(['000000000527.jpg', '000000010133.jpg', '000000000084.jpg',
       '000000011394.jpg', '000000010571.jpg', '000000011143.jpg',
       '000000010723.jpg', '000000010278.jpg', '000000010409.jpg',
       '000000010302.jpg', '000000000439.jpg', '000000000519.jpg',
       '000000010795.jpg', '000000010349.jpg', '000000010793.jpg',
       '000000011156.jpg', '000000011020.jpg', '000000000422.jpg',
       '000000000787.jpg', '000000011053.jpg', '000000011151.jpg',
       '000000010932.jpg', '000000000345.jpg', '000000011260.jpg',
       '000000010872.jpg', '000000000739.jpg', '000000010979.jpg',
       '000000000960.jpg', '000000000224.jpg', '000000010368.jpg',
       '000000000979.jpg', '000000000912.jpg'], dtype='<U256')

In [11]:
# file_path = 'data/embeddings/dist_to_cent.npy'
# file_path = '/home/fbernardi/Documents/SemDeDup/data/embeddings/kmeans_centroids.npy'
file_path = '/home/fbernardi/Documents/SemDeDup/data/embeddings/nearest_cent.npy'

print(np.load(file_path),
np.load(file_path).shape)

[273  89  93 ... 197 194  50] (1004,)


In [None]:
import numpy as np

# Replace 'data.npy' with your actual file name or path
file_path = "/home/fbernardi/Documents/SemDeDup/data/embeddings/embs.npy"
data = np.memmap(
    file_path,
    dtype='float32',
    mode='r',  # read-only
    shape=(dataset_size, emb_size)
)

print("Loaded data:", data)

NameError: name 'dataset_size' is not defined

In [29]:
data[0]

memmap([-6.36093020e-02, -1.67762786e-02,  4.36458141e-02,
        -4.82349796e-03,  3.45024988e-02, -2.32574344e-02,
        -2.49317451e-03, -6.22026809e-02, -1.26413144e-02,
        -7.81361386e-03,  3.32660377e-02,  5.08501194e-03,
         1.42847765e-02, -2.54854485e-02,  2.80646281e-03,
         4.18728478e-02,  5.67851216e-03,  1.23474412e-02,
         7.87839200e-03,  1.06908251e-02,  1.95562169e-02,
        -3.89889610e-04,  5.73137477e-02,  7.79694831e-03,
        -2.02623080e-03,  4.92877839e-03,  1.46089764e-02,
        -4.46669571e-02, -9.41669755e-03,  2.94540729e-02,
         2.64614485e-02,  8.21091235e-03,  6.22427277e-03,
         2.71132472e-03, -6.71255589e-02,  3.24139148e-02,
        -2.22830027e-02, -3.43820415e-02, -1.95267890e-02,
         1.44049540e-01, -1.66624319e-02, -4.63205576e-02,
        -2.10069097e-03, -4.46671844e-02,  6.81137433e-03,
        -2.41442427e-01, -2.01218706e-02,  5.18792309e-02,
        -3.40216747e-03,  4.13236283e-02,  1.87341105e-0

In [24]:
batch_indices

[180,
 525,
 0,
 996,
 670,
 862,
 725,
 577,
 634,
 578,
 184,
 190,
 739,
 586,
 750,
 954,
 814,
 140,
 267,
 852,
 883,
 826,
 88,
 943,
 777,
 273,
 949,
 321,
 83,
 589,
 361,
 319]

In [31]:
file_path = "/home/fbernardi/Documents/SemDeDup/data/embeddings/path.npy"
paths_str_type = 'U256'

paths = np.memmap(
    file_path,
    dtype=paths_str_type,
    mode='r',  # read-only
    shape=(dataset_size,)
)

In [34]:
paths.shape

(1004,)

In [35]:
path_array[batch_indices]

array(['000000000527.jpg', '000000010133.jpg', '000000000084.jpg',
       '000000011394.jpg', '000000010571.jpg', '000000011143.jpg',
       '000000010723.jpg', '000000010278.jpg', '000000010409.jpg',
       '000000010302.jpg', '000000000439.jpg', '000000000519.jpg',
       '000000010795.jpg', '000000010349.jpg', '000000010793.jpg',
       '000000011156.jpg', '000000011020.jpg', '000000000422.jpg',
       '000000000787.jpg', '000000011053.jpg', '000000011151.jpg',
       '000000010932.jpg', '000000000345.jpg', '000000011260.jpg',
       '000000010872.jpg', '000000000739.jpg', '000000010979.jpg',
       '000000000960.jpg', '000000000224.jpg', '000000010368.jpg',
       '000000000979.jpg', '000000000912.jpg'], dtype='<U256')

In [1]:
import numpy as np

In [3]:
np.load("/home/fbernardi/Documents/SemDeDup/data/sorted_clusters/cluster_0.npy")

array([['000000010909.jpg', '851', '-2.384185791015625e-07', '0']],
      dtype='<U32')

In [4]:
np.load("/home/fbernardi/Documents/SemDeDup/data/sorted_clusters/cluster_1.npy")

array([['000000001208.jpg', '500', '0.06237077713012695', '1'],
       ['000000000157.jpg', '58', '0.0623706579208374', '1']],
      dtype='<U32')

In [9]:
np.load("/home/fbernardi/Documents/SemDeDup/data/sorted_clusters/cluster_499.npy")

array([['000000000381.jpg', '152', '0.09433567523956299', '499'],
       ['000000010665.jpg', '732', '0.09433513879776001', '499']],
      dtype='<U32')

In [36]:
np.load("/home/fbernardi/Documents/SemDeDup/data/embeddings/dist_to_cent.npy").shape

(1004,)

In [37]:
np.load("/home/fbernardi/Documents/SemDeDup/data/embeddings/kmeans_centroids.npy").shape

(500, 512)

In [39]:
np.load("/home/fbernardi/Documents/SemDeDup/data/embeddings/nearest_cent.npy")[1]

np.int64(89)

In [34]:
np.load("/home/fbernardi/Documents/SemDeDup/data/sorted_clusters/cluster_89.npy")

array([['000000000002.jpg', '1', '-2.384185791015625e-07', '89'],
       ['000000000002.jpg', '5', '-2.384185791015625e-07', '89']],
      dtype='<U32')

In [35]:
sum(np.load("/home/fbernardi/Documents/SemDeDup/data/embeddings/nearest_cent.npy") == 89)

np.int64(2)

In [32]:
import pandas as pd
import pickle
import numpy as np

In [2]:
with open('/home/fbernardi/Documents/SemDeDup/data/embeddings/kmeans_index.pickle', 'rb') as file:
    kmeans_index = pickle.load(file)


In [33]:
pd.DataFrame(np.load("/home/fbernardi/Documents/SemDeDup/data/sorted_clusters/cluster_34.npy"))

Unnamed: 0,0,1,2,3
0,000000001130.jpg,447,0.4321774840354919,34
1,000000011266.jpg,932,0.3664817810058594,34
2,000000001049.jpg,356,0.3587756156921386,34
3,000000011147.jpg,866,0.3566626906394958,34
4,000000010382.jpg,599,0.3367999792098999,34
5,000000010861.jpg,754,0.3340635895729065,34
6,000000000257.jpg,61,0.3101370334625244,34
7,000000010739.jpg,769,0.2562384605407715,34
8,000000010160.jpg,533,0.2415503859519958,34
9,000000000464.jpg,179,0.2330095767974853,34


In [28]:
with open('/home/fbernardi/Documents/SemDeDup/data/dataframes/cluster_34.pkl', 'rb') as file:
    pruning_table = pickle.load(file)


In [29]:
pruning_table

Unnamed: 0,indices,eps=0.0001,eps=0.001,eps=0.1,eps=0.2,eps=0.3,eps=0.4,eps=0.5
0,21,False,False,False,False,False,False,False
1,20,False,False,False,False,False,True,True
2,19,False,False,False,False,True,True,True
3,18,False,False,False,False,True,True,True
4,17,False,False,False,False,True,True,True
5,16,False,False,False,False,True,True,True
6,15,False,False,False,True,True,True,True
7,14,False,False,False,True,True,True,True
8,13,False,False,False,False,True,True,True
9,12,False,False,False,False,True,True,True


In [26]:
with open('/home/fbernardi/Documents/SemDeDup/data/statistics/dataframes/shard_0.pkl', 'rb') as file:
    shard = pickle.load(file)


In [27]:
shard

Unnamed: 0,cluster_size,cluster_id,avg_sim_to_cent,std_sim_to_cent,std_pair_w_sim,avg_sim_to_others_list,max_pair_w_sim_list,min_pair_w_sim_list
0,6,0,0.781313,0.0371907,tensor(0.1064),"[tensor(0.5818), tensor(0.5569), tensor(0.5364...","[tensor(0.6699), tensor(0.6699), tensor(0.6015...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,23,1,0.771713,0.06235939,tensor(0.0732),"[tensor(0.6528), tensor(0.6392), tensor(0.6346...","[tensor(0.7920), tensor(0.7485), tensor(0.7485...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,15,2,0.768526,0.04700727,tensor(0.0803),"[tensor(0.6293), tensor(0.6144), tensor(0.5945...","[tensor(0.8459), tensor(0.8459), tensor(0.7102...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,15,3,0.746363,0.09592509,tensor(0.0908),"[tensor(0.6053), tensor(0.6036), tensor(0.5999...","[tensor(0.7129), tensor(0.7684), tensor(0.7684...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,31,4,0.761544,0.06007457,tensor(0.0675),"[tensor(0.6500), tensor(0.6221), tensor(0.6181...","[tensor(0.8101), tensor(0.7940), tensor(0.7735...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,30,5,0.811754,0.08358292,tensor(0.0798),"[tensor(0.7130), tensor(0.7124), tensor(0.7069...","[tensor(0.8179), tensor(0.8173), tensor(0.8649...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,31,6,0.753913,0.06448479,tensor(0.0642),"[tensor(0.6229), tensor(0.6183), tensor(0.6017...","[tensor(0.7255), tensor(0.7060), tensor(0.7127...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,9,7,0.732335,0.09266689,tensor(0.0963),"[tensor(0.5693), tensor(0.5516), tensor(0.5290...","[tensor(0.7100), tensor(0.7694), tensor(0.6423...","[tensor(0.), tensor(0.), tensor(0.), tensor(0...."
0,3,8,0.758902,0.04128992,tensor(0.0997),"[tensor(0.4140), tensor(0.3767), tensor(0.3010)]","[tensor(0.4897), tensor(0.4897), tensor(0.3382)]","[tensor(0.), tensor(0.), tensor(0.)]"
0,4,9,0.75488,0.04461223,tensor(0.1054),"[tensor(0.4959), tensor(0.4348), tensor(0.3965...","[tensor(0.6216), tensor(0.6216), tensor(0.4833...","[tensor(0.), tensor(0.), tensor(0.), tensor(0.)]"


In [24]:
import torch

# Replace 'your_file_path.pt' with the actual path to your .pt file
file_path = '/home/fbernardi/Documents/SemDeDup/data/statistics/dicts/shard_0.pt'

# Load the .pt file
pt_data = torch.load(file_path, weights_only=False)

# Print the loaded data
print(pt_data)


{0.0001:     duplicates_ratio num_duplicates cluster_id
0                0.0              0          0
1                0.0              0          0
2                0.0              0          0
3                0.0              0          0
4                0.0              0          0
..               ...            ...        ...
8                0.0              0         48
9                0.0              0         48
10               0.0              0         48
0                0.0              0         49
1                0.0              0         49

[998 rows x 3 columns], 0.001:     duplicates_ratio num_duplicates cluster_id
0                0.0              0          0
1                0.0              0          0
2                0.0              0          0
3                0.0              0          0
4                0.0              0          0
..               ...            ...        ...
8                0.0              0         48
9                0.

In [25]:
pt_data.keys()

dict_keys([0.0001, 0.001, 0.1, 0.2, 0.3, 0.4, 0.5])