In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import random

In [2]:
from pathlib import Path

# Root of the CC3M data; every Path constant below is derived from it.
DATA_FOLDER = Path("/data/cc3m")
EMBEDDINGS_FOLDER = DATA_FOLDER.joinpath("cc3m_2023/embeddings")

# NOTE: despite the *_FOLDER suffix, these two constants point at .npy files.
IMAGENET_EMBEDDINGS_FOLDER = EMBEDDINGS_FOLDER.joinpath("imagenet_class_embeddings_L14.npy")
CC_EMBEDDINGS_FOLDER = EMBEDDINGS_FOLDER.joinpath("text_embeddings_L14.npy")
DOT_PRODUCTS = EMBEDDINGS_FOLDER.joinpath("CC_vs_imagenet_L14.npy")  # (cc entries, imagenet classes)

# Captions TSV, kept as a plain string (not a Path).
CC_CAPTIONS_DF = "/data/cc3m/cc3m_2023/Train_GCC-training.tsv"

# Label files that live directly under the data root.
IMAGENET_LABELS = DATA_FOLDER.joinpath("imagenet_classes.txt")
LONG_IMAGENET_LABELS = DATA_FOLDER.joinpath("imagenet_classes_long.txt")

In [3]:
# Precomputed similarity matrix, laid out as (cc entries, imagenet classes).
dot_products = np.load(DOT_PRODUCTS)

# Per row: index of the best-scoring column and that best score.
argmax = dot_products.argmax(axis=1)
maxes = dot_products.max(axis=1)

In [96]:
def get_relevant_captions(similarity: np.ndarray, 
                          col_id : int, 
                          n_relevant = 10,
                          only_argmax = False,
                          sort_best = False, 
                          CAPTIONS_FILE = "/data/cc3m/cc3m_2023/Train_GCC-training.tsv"):
    """
    Get the relevant captions for a given column of a similarity matrix.

    Args:
        similarity: (cc entries, classes) matrix. Each row corresponds to a CC entry and each
                    column to a class. The entries are similarity values between the CC entry
                    and the class (higher means more similar).
                    Example: dot product matrix.
                    Example: 1 - distance matrix
        col_id: The column to get the relevant captions for. Negative values index from the
                last column, as in numpy.
        n_relevant: The maximum number of captions to return. Fewer are returned when fewer
                    candidates are available.
        only_argmax: If True, only consider the CC entries whose highest-scoring column is
                     col_id. If False, consider all CC entries.
        sort_best: If True, return the candidates with the highest similarity, best first.
                   If False, return a random sample of the candidates.
        CAPTIONS_FILE: Tab-separated file without header; its first column holds the captions,
                       one row per row of `similarity`. It is re-read on every call.

    Return:
        A list of at most n_relevant captions; empty when there are no candidates or
        n_relevant is 0.

    Raises:
        AssertionError: if the number of captions does not match similarity.shape[0], or
                        col_id is out of range.
        ValueError: if n_relevant is negative.
    """
    if n_relevant < 0:
        raise ValueError("n_relevant must be non-negative!")

    captions = pd.read_csv(CAPTIONS_FILE, sep='\t', names=["caption","url"], usecols=range(0,2))["caption"].tolist()
    assert similarity.shape[0] == len(captions), "Similarity matrix and captions length do not match!"
    assert similarity.shape[1] - 1 >= abs(col_id), "col_id exceeds the # columns in similarity matrix!"

    # np.argmax only yields non-negative indices, so a negative col_id has to be normalised
    # before it is compared against the argmax below.
    if col_id < 0:
        col_id += similarity.shape[1]

    similarity_relevant = similarity[:, col_id]
    if only_argmax:
        # Keep only the rows for which col_id is the best-scoring column.
        winner_rows = np.flatnonzero(np.argmax(similarity, axis=1) == col_id)
        similarity_relevant = similarity_relevant[winner_rows]
        captions = [captions[i] for i in winner_rows]

    n_relevant_available = min(n_relevant, len(similarity_relevant))
    if n_relevant_available == 0:
        # Guard: arr[-0:] is the *whole* array and argpartition rejects empty input,
        # so the top-k branch below must never see a zero count.
        return []

    if not sort_best:
        random_entries = random.sample(range(len(similarity_relevant)), n_relevant_available)
        return [captions[entry] for entry in random_entries]

    # Top-k selection: partition first, then fully sort only the k selected entries.
    idx = np.argpartition(similarity_relevant, -n_relevant_available)[-n_relevant_available:]
    idx_sorted = idx[np.argsort(similarity_relevant[idx])][::-1]
    return [captions[entry] for entry in idx_sorted]

In [97]:
# Top captions for column 76 across all CC entries, best match first.
get_relevant_captions(dot_products, 76, sort_best=True)

['close up of a tarantula',
 '... and the tarantula that we found on',
 'image of a common tarantula .',
 'largest species of tarantula in the world',
 'he was like a tarantula to me',
 'look at this scary spider',
 'tarantula , a wolf spider that hunts whilst walking on the ground .',
 'tarantula , a wolf spider that hunts whilst walking on the ground .',
 "harmless : despite being as big as a man 's hand this particular species of tarantula is completely harmless to humans",
 'tarantula email this to a friend']

In [9]:
# Load the headerless captions TSV (first two columns: caption, url).
# Reuses the CC_CAPTIONS_DF constant from the config cell instead of repeating the path literal.
data = pd.read_csv(
    CC_CAPTIONS_DF,
    sep='\t',
    names=["caption", "url"],
    usecols=range(0, 2),
)

In [10]:
# Plain Python list of captions, in file order.
captions = data.loc[:, "caption"].to_list()

In [11]:
# Scratch inputs for stepping through the selection logic by hand.
similarity = dot_products
col_id = 30
n_relevant_available = 10

In [12]:
# Draw distinct random row indices for the chosen column.
random_entries = random.sample(range(similarity[:, col_id].size), n_relevant_available)

In [13]:
# Captions at the randomly drawn row indices.
list(map(captions.__getitem__, random_entries))

['the gray cat sleeps on the white sofa',
 'sculpture with an umbrella , arboretum .',
 'teacher helping kids in a preschool class',
 'industry for a city showing the private swimming pool',
 'caucasian man preparing to drive the car .',
 'a family had homes damaged and cars ruined by hurricane sandy .',
 'what does tv character from comic book series look like with no mask ?',
 'background with woman in style .',
 "football player celebrates scoring his side 's first goal of the game in front of the fans with football player",
 'actor and person attend 25th anniversary event']

In [14]:
# Indices of the n_relevant_available largest values in the column (unordered).
idx = similarity[:, col_id].argpartition(-n_relevant_available)[-n_relevant_available:]

In [15]:
# Order the selected indices by their similarity, highest first.
idx_sorted = idx[similarity[idx, col_id].argsort()][::-1]

In [16]:
# Display the selected row indices, highest similarity first.
idx_sorted

array([1233486, 1968039, 1085264, 2798738, 2974413, 2071905, 2928383,
        777160, 2140310, 1742130])

In [17]:
# Similarity values of the selected rows, in the same order as idx_sorted.
similarity[idx_sorted, col_id]

array([0.65565443, 0.6306698 , 0.6259816 , 0.62330246, 0.6229262 ,
       0.6229262 , 0.6229262 , 0.6229262 , 0.6229262 , 0.6229262 ],
      dtype=float32)

In [18]:
# Captions of the selected rows, best match first.
list(map(captions.__getitem__, idx_sorted))

['this is a mountain yellow - legged frog .',
 'common frog is a widespread species .',
 'frog , critically endangered in the wild',
 'close view of a common frog',
 'this large aquatic frog is organism .',
 'this large aquatic frog is organism .',
 'this large aquatic frog is organism .',
 'this large aquatic frog is organism .',
 'this large aquatic frog is organism .',
 'this large aquatic frog is organism .']

In [19]:
# Same scratch inputs again, this time for the argmax-filtered variant.
similarity = dot_products
col_id = 30
n_relevant_available = 10

In [20]:
# Column col_id, restricted to the rows whose best-scoring column is col_id.
similarity_relevant = similarity[argmax == col_id, col_id]

In [21]:
# Display the similarities that survived the argmax filter.
similarity_relevant

array([0.57160926, 0.34483665, 0.4299124 , 0.41132516, 0.52932286,
       0.6172392 , 0.5341002 , 0.50568503, 0.5341002 , 0.4778833 ,
       0.32334027, 0.5810215 , 0.5038953 , 0.6140211 , 0.5150279 ,
       0.40928322, 0.5341917 , 0.5392379 , 0.5898162 , 0.5045848 ,
       0.3165427 , 0.42452884, 0.27025238, 0.48669985, 0.4104669 ,
       0.47195876, 0.5898162 , 0.30660352, 0.38492748, 0.4178289 ,
       0.39712718, 0.2724526 , 0.5045848 , 0.38773084, 0.39452294,
       0.2724526 , 0.52399945, 0.3553119 , 0.2724526 , 0.57148767,
       0.27025238, 0.3138243 , 0.33028188, 0.53052   , 0.538291  ,
       0.3632834 , 0.45262495, 0.2724526 , 0.33028188, 0.5341002 ,
       0.56437993], dtype=float32)

In [22]:
# Captions of the rows whose best-scoring column is col_id.
captions_relevant = [captions[row] for row in np.flatnonzero(argmax == col_id)]

In [23]:
# Read one label per line. Path.read_text() opens and closes the file itself;
# the previous bare open(...).read() left the handle unclosed.
lines = Path(LONG_IMAGENET_LABELS).read_text().splitlines()
# Turn every label into a short prompt sentence.
texts = ["This is a " + line for line in lines]

In [6]:
# Load both embedding arrays (label embeddings first, then CC text embeddings).
label_embeddings, cc_embeddings = (
    np.load(path) for path in (IMAGENET_EMBEDDINGS_FOLDER, CC_EMBEDDINGS_FOLDER)
)

In [24]:
# Select the entry of label_embeddings at index col_id.
label_embeddings_relevant = label_embeddings[col_id]

In [55]:
# Shapes of the full embedding matrix and of the selected label vector, one per line.
print(cc_embeddings.shape, label_embeddings_relevant.shape, sep="\n")

(3318333, 768)
(768,)


In [28]:
# Dot product of the label vector with every CC embedding (vector @ matrix^T form).
comparison = np.matmul(label_embeddings_relevant, cc_embeddings.T)

In [38]:
# Same dot products, written as matrix @ vector.
comparison2 = np.matmul(cc_embeddings, label_embeddings_relevant)

In [39]:
# Spot check one row: both matmul forms against the stored similarity value.
i = 6787
print(comparison[i], comparison2[i], similarity[i, col_id], sep="\n")

0.053526737
0.053526737
0.053526722


In [59]:
# Shape check of the dot-product result.
comparison.shape

(3318333,)

In [56]:
# Toy 3x5 all-ones matrix for broadcasting experiments.
k = np.ones((3, 5))
k.shape

(3, 5)

In [57]:
# Toy length-5 all-ones vector.
l = np.ones((5,))
l.shape

(5,)

In [53]:
# The (5,) vector is broadcast against every row of the (3, 5) matrix.
np.subtract(k, l)

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [58]:
a = 3
# Only the values 1 and 3 produce output; anything else prints nothing.
if a in (1, 3):
    print(a)
        

3


In [60]:
# Element-wise difference between every CC embedding and the label vector (broadcast over rows).
temp = np.subtract(cc_embeddings, label_embeddings_relevant)

In [64]:
# Display the difference matrix computed from cc_embeddings and label_embeddings_relevant.
temp

array([[ 0.01856818, -0.04548136, -0.04816247, ...,  0.05884226,
         0.07124819, -0.00535386],
       [ 0.05407539, -0.06928588, -0.08092396, ...,  0.03008895,
        -0.00898384,  0.01607214],
       [ 0.04828547, -0.10116106, -0.09367523, ...,  0.05698025,
        -0.01283206, -0.05270676],
       ...,
       [ 0.03008148, -0.07547796,  0.00516605, ...,  0.04212256,
         0.03777242, -0.05866101],
       [ 0.00449855, -0.04209557, -0.07445265, ...,  0.02327939,
         0.0308992 , -0.00660304],
       [ 0.02137729, -0.07284716, -0.04025277, ...,  0.03487681,
         0.03703548, -0.00302604]], dtype=float32)

In [67]:
# Display the toy matrix k again.
k

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [69]:
# Norm taken along axis 1 of the toy matrix, i.e. one value per row.
np.linalg.norm(k, axis=1)

array([2.23606798, 2.23606798, 2.23606798])

In [71]:
# Norm of each row of temp: one distance value per row.
distance = np.linalg.norm(temp, axis=1)

In [73]:
# Shape check of the per-row distances.
distance.shape

(3318333,)

In [74]:
# Display the per-row distances.
distance

array([1.3594415, 1.3718246, 1.4043311, ..., 1.3754959, 1.3372407,
       1.3615527], dtype=float32)

In [77]:
# Smallest distance over all rows.
distance.min()

0.8298745

In [98]:
def get_relevant_captions_from_embeddings(embeddings: np.ndarray, 
                                          query : np.ndarray,
                                          distance_function = "dot_product",
                                          n_relevant = 10,
                                          sort_best = False, 
                                          CAPTIONS_FILE = "/data/cc3m/cc3m_2023/Train_GCC-training.tsv"):
    """
    Get the relevant captions for a given query embedding.

    The query is compared against every row of `embeddings`, the result is turned into a
    one-column similarity matrix, and the selection is delegated to get_relevant_captions.

    Args:
        embeddings: (cc entries, embedding size) matrix. Each row is an embedding for a CC entry.
        query: The query embedding. A (embedding size,) vector; (embedding size, 1) and
               (1, embedding size) arrays are flattened to that shape.
        distance_function: How to compare. "dot_product" uses embeddings @ query as the
                           similarity; "euclidean" uses 1 - euclidean distance.
        n_relevant: The number of relevant captions to return.
        sort_best: If True, return the top n_relevant similarities. If False, choose randomly.
        CAPTIONS_FILE: Captions file, passed through to get_relevant_captions.

    Return:
        A list of n_relevant captions.

    Raises:
        NotImplementedError: if distance_function is not one of the supported names.
        ValueError: if embeddings is not 2-D or its row length differs from the query length.
    """
    if distance_function not in ("dot_product", "euclidean"):
        raise NotImplementedError("This distance method is not implemented yet.")

    embeddings = np.asarray(embeddings)
    # Flatten so that column/row-vector queries behave like a plain 1-D vector; otherwise
    # the comparison below would come out 2-D and break the one-column similarity matrix.
    query = np.asarray(query).reshape(-1)
    if embeddings.ndim != 2 or embeddings.shape[1] != query.shape[0]:
        raise ValueError(
            "embeddings must be (n, d) and query must have d elements, got "
            f"{embeddings.shape} and {query.shape}"
        )

    if distance_function == "dot_product":
        comparison = embeddings @ query
    else:
        # Work in row chunks so the temporary difference matrix stays small instead of
        # being as large as `embeddings` itself. Each row norm is independent of the others.
        chunk_rows = 65536
        n_rows = embeddings.shape[0]
        if n_rows <= chunk_rows:
            distance = np.linalg.norm(embeddings - query, axis=1)
        else:
            distance = np.concatenate([
                np.linalg.norm(embeddings[start:start + chunk_rows] - query, axis=1)
                for start in range(0, n_rows, chunk_rows)
            ])
        # Convert distance to a similarity: larger value means closer to the query.
        comparison = 1 - distance

    # get_relevant_captions expects a 2-D (entries, classes) matrix, so expose the
    # comparison vector as its single column 0.
    return get_relevant_captions(similarity = comparison[:,None], 
                          col_id = 0, 
                          n_relevant = n_relevant,
                          sort_best = sort_best, 
                          CAPTIONS_FILE = CAPTIONS_FILE)

In [100]:
# Closest captions to label embedding 76 under the euclidean comparison, best first.
get_relevant_captions_from_embeddings(
    embeddings=cc_embeddings,
    query=label_embeddings[76],
    distance_function="euclidean",
    sort_best=True,
)

KeyboardInterrupt: 

In [88]:
# Adding a trailing axis turns the vector into a single-column matrix.
label_embeddings_relevant[:, np.newaxis].shape

(768, 1)

In [24]:
# Toy data: a is the vector [3, 3, 3]; row i of b is filled with the value i + 1.
a = 3 * np.ones(3)
b = np.arange(1, 6)[:, None] * np.ones((5, 3))

In [25]:
# Subtract the vector from every row of the matrix (broadcast), then display the result.
temp = np.subtract(b, a)
temp

array([[-2., -2., -2.],
       [-1., -1., -1.],
       [ 0.,  0.,  0.],
       [ 1.,  1.,  1.],
       [ 2.,  2.,  2.]])

In [26]:
# Norm taken along axis 1 of the toy difference matrix: one value per row.
np.linalg.norm(temp, axis=1)

array([3.46410162, 1.73205081, 0.        , 1.73205081, 3.46410162])