Here we have the output of an embedding model

In [5]:
import torch
import torch.nn.functional as F

# Dimensions of the simulated model output: `batch_size` embeddings,
# each with `embedding_size` components.
batch_size, embedding_size = 10, 100

# Stand-in for real model output: entries drawn uniformly from [0, 1).
embedding_space = torch.rand(batch_size, embedding_size)

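Then we L2-normalize the embeddings and take their pairwise inner products, which gives the cosine-similarity matrix.
The similarity matrix is symmetric and its diagonal is 1, because every normalized embedding has an inner product of 1 with itself.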

In [15]:
# Scale every row (one embedding per row) to unit L2 norm so that the inner
# products below are cosine similarities. `squeeze(0)` only drops a leading
# dimension of size 1, if there is one.
features = F.normalize(embedding_space.squeeze(0), p=2.0, dim=1)

# Entry (i, j) is the inner product between embeddings i and j.
similarity = features @ features.T
print(similarity.shape)
similarity

torch.Size([10, 10])


tensor([[1.0000, 0.7674, 0.7821, 0.8081, 0.7755, 0.7740, 0.7402, 0.7749, 0.7610,
         0.7838],
        [0.7674, 1.0000, 0.7260, 0.7622, 0.7053, 0.6826, 0.6855, 0.7223, 0.7672,
         0.7654],
        [0.7821, 0.7260, 1.0000, 0.7568, 0.7278, 0.7126, 0.7317, 0.7328, 0.7626,
         0.7712],
        [0.8081, 0.7622, 0.7568, 1.0000, 0.7517, 0.7402, 0.7336, 0.7313, 0.7973,
         0.7999],
        [0.7755, 0.7053, 0.7278, 0.7517, 1.0000, 0.7178, 0.6905, 0.7414, 0.7574,
         0.7679],
        [0.7740, 0.6826, 0.7126, 0.7402, 0.7178, 1.0000, 0.7513, 0.7723, 0.7536,
         0.7369],
        [0.7402, 0.6855, 0.7317, 0.7336, 0.6905, 0.7513, 1.0000, 0.7373, 0.7435,
         0.7381],
        [0.7749, 0.7223, 0.7328, 0.7313, 0.7414, 0.7723, 0.7373, 1.0000, 0.7680,
         0.7218],
        [0.7610, 0.7672, 0.7626, 0.7973, 0.7574, 0.7536, 0.7435, 0.7680, 1.0000,
         0.7881],
        [0.7838, 0.7654, 0.7712, 0.7999, 0.7679, 0.7369, 0.7381, 0.7218, 0.7881,
         1.0000]])

To apply the InfoNCE loss, you need to transform the problem into a classification problem, so we will mask the diagonal and apply a softmax.

In [22]:
# Boolean matrix that is False on the diagonal (self-pairs) and True elsewhere.
mask = torch.eye(similarity.shape[0], dtype=torch.bool).logical_not()

# NOTE(review): multiplying by the mask sets the self-similarities to 0 rather
# than removing them, so the softmax below still assigns every sample a
# non-zero probability of matching itself. Filling the diagonal with a very
# negative value (e.g. -inf) before the softmax would exclude it completely.
similarity = similarity * mask

# Turn every row into a probability distribution over the samples.
probabilities = F.softmax(similarity, dim=1)
print(sum(probabilities[0, :]))  # sanity check: a row sums to 1
probabilities

tensor(1.)


tensor([[0.0487, 0.1050, 0.1065, 0.1093, 0.1058, 0.1057, 0.1022, 0.1058, 0.1043,
         0.1067],
        [0.1093, 0.0507, 0.1048, 0.1087, 0.1027, 0.1004, 0.1007, 0.1045, 0.1092,
         0.1090],
        [0.1095, 0.1035, 0.0501, 0.1068, 0.1037, 0.1022, 0.1041, 0.1043, 0.1074,
         0.1083],
        [0.1103, 0.1054, 0.1048, 0.0492, 0.1043, 0.1031, 0.1024, 0.1021, 0.1091,
         0.1094],
        [0.1096, 0.1022, 0.1045, 0.1070, 0.0505, 0.1034, 0.1006, 0.1059, 0.1076,
         0.1087],
        [0.1093, 0.0998, 0.1028, 0.1057, 0.1034, 0.0504, 0.1069, 0.1092, 0.1071,
         0.1054],
        [0.1067, 0.1010, 0.1058, 0.1060, 0.1015, 0.1079, 0.0509, 0.1064, 0.1071,
         0.1065],
        [0.1088, 0.1032, 0.1043, 0.1041, 0.1052, 0.1085, 0.1048, 0.0501, 0.1080,
         0.1031],
        [0.1051, 0.1057, 0.1052, 0.1090, 0.1047, 0.1043, 0.1032, 0.1058, 0.0491,
         0.1079],
        [0.1078, 0.1058, 0.1064, 0.1095, 0.1061, 0.1028, 0.1029, 0.1013, 0.1082,
         0.0492]])

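Once you have the output probabilities, you just need to apply the cross-entropy loss against the labels. Note that `torch.nn.CrossEntropyLoss` applies its own log-softmax to its input, so in practice it is normally given the raw similarity scores (logits) rather than probabilities.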

In [20]:
# One-hot target rows: take the identity matrix and shuffle its rows, so each
# sample is assigned a randomly chosen target column.
labels = torch.eye(batch_size)[torch.randperm(batch_size)]

# NOTE(review): CrossEntropyLoss applies log-softmax to its input, so passing
# values that already went through a softmax normalizes them twice; it is
# normally fed raw logits. Confirm this is intended.
loss = torch.nn.CrossEntropyLoss()
loss(probabilities, labels)

tensor(2.3042)

Here is a function with the full implementation: it computes the cosine-similarity matrix, treats each row as a classification problem, and returns the resulting probabilities.

In [None]:
def compute_similarity_matrix(
    output_embedding_spaces: torch.Tensor
) -> torch.Tensor:
    """Return row-wise softmax probabilities over pairwise cosine similarities.

    Every embedding is L2-normalized, all pairwise inner products are taken,
    and the self-similarity on the diagonal is excluded before the softmax,
    so each row is a distribution over the *other* samples only.

    Args:
        output_embedding_spaces: Embeddings with one sample per row,
            ``(N, D)``. A leading dimension of size 1 is squeezed away first.

    Returns:
        An ``(N, N)`` tensor whose rows sum to 1. For ``N > 1`` the diagonal
        is 0; for ``N == 1`` the single entry is 1 because there is no other
        sample to distribute probability over.
    """
    features = F.normalize(output_embedding_spaces.squeeze(0), dim=1)
    similarity = torch.matmul(features, features.T)

    # Build the mask on the same device as the data so GPU inputs work.
    self_pairs = torch.eye(
        similarity.shape[0], dtype=torch.bool, device=similarity.device
    )

    # Multiplying by a 0/1 mask would only set the diagonal score to 0, which
    # still receives exp(0) weight in the softmax. Filling it with the most
    # negative finite value drives its probability to 0 without producing NaN
    # when a row has no other entry.
    similarity = similarity.masked_fill(
        self_pairs, torch.finfo(similarity.dtype).min
    )

    return F.softmax(similarity, dim=1)