# Playground for Developing the Contrastive Learning Model

### Setting up dataloaders

The dataloaders have to support the different possible pair setups: a contrastive learning model can be trained with positive pairs only, or with both positive and negative pairs. Each pair must carry an annotation (label) stating which kind of pair it is.

In [37]:
import pandas as pd
import torch

# Pickled training split, located relative to the notebook's working directory.
# pd.read_pickle can execute arbitrary code while unpickling, so only load
# files that come from a trusted source.
TRAIN_SET_PATH = '../data/04a_Train_Set.pkl'
train_data = pd.read_pickle(TRAIN_SET_PATH)

In [38]:
# Rich display of the loaded training frame.
# NOTE(review): the rendered output contains author e-mail addresses; clear it
# before sharing the notebook.
train_data

Unnamed: 0,message,author_email,project,label
0,calcs/hazard/event_based/post_processing:\n\nM...,Lars.Butler@gmail.com,gem_oq-engine,0.0
1,added javadoc heading to hdf5 util class\n\n\n...,Lars.Butler@gmail.com,gem_oq-engine,0.0
2,added missing imports in db_tests/__init__.py ...,Lars.Butler@gmail.com,gem_oq-engine,0.0
3,"Fixed up a longer-running test, added slow attr",Lars.Butler@gmail.com,gem_oq-engine,0.0
4,calculators/hazard/event_based/core_next:\n\nR...,Lars.Butler@gmail.com,gem_oq-engine,0.0
...,...,...,...,...
47433,"Fixed ""is a"" op with Ident",tj@vision-media.ca,stylus_stylus,40.0
47434,removed old dynamic helper logic from the view...,tj@vision-media.ca,expressjs_express,40.0
47435,fixed property error due to parser not being p...,tj@vision-media.ca,stylus_stylus,40.0
47436,Fixed connect middleware for <I>.x,tj@vision-media.ca,stylus_stylus,40.0


In [39]:
import math

group_sizes = []
training_pairs = []

# One group per author; the group key itself is not needed, only its rows.
for _, author_commits in train_data.groupby("author_email"):
    messages = author_commits['message'].tolist()
    group_sizes.append(len(messages))
    # Consecutive, non-overlapping pairs: (m0, m1), (m2, m3), ...
    # The trailing message of an odd-sized group has no partner and is dropped.
    for start in range(0, len(messages) - 1, 2):
        training_pairs.append(messages[start:start + 2])

# Number of distinct same-author pairs that could be formed in total
# (n choose 2 per author). This is far more than the non-overlapping pairs
# collected in `training_pairs` above.
number_of_training_pairs = sum(math.comb(size, 2) for size in group_sizes)

In [40]:
# Number of possible same-author message pairs (sum of C(n, 2) over the author groups).
number_of_training_pairs

51268322

### Trying a Model with the Sentence-Transformers framework

In [41]:
# Pretrained sentence-embedding checkpoint used by the cells in this notebook.
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [42]:
from sentence_transformers import SentenceTransformer

# Two toy inputs used to sanity-check the embedding pipeline.
sentences = ["This is a sentence", "This is another sentence"]

# Load the pretrained checkpoint named by MODEL through sentence-transformers.
model = SentenceTransformer(MODEL)
# convert_to_numpy=False keeps torch tensors instead of numpy arrays
# (presumably one tensor per sentence, since the result is stacked below).
embedding = model.encode(sentences, convert_to_numpy=False)
# Combine the per-sentence results into a single tensor along a new leading dimension.
embeddings = torch.stack(embedding)

In [43]:
# Inspect the stacked sentence embeddings.
embeddings

tensor([[ 5.0483e-02,  8.8006e-02,  4.8748e-03,  3.6269e-02, -1.0183e-03,
          1.9154e-02,  1.4232e-02, -1.8787e-02,  8.1525e-02,  3.7219e-02,
          7.0416e-02, -5.8707e-02, -1.6880e-02, -6.2914e-03, -6.7617e-03,
          1.6698e-02,  4.8856e-02, -1.9675e-02, -8.7432e-02, -4.0718e-03,
         -2.6633e-02,  9.2473e-02, -4.5293e-03,  4.6394e-04, -1.5114e-02,
          9.1224e-03, -4.3226e-02,  6.4198e-02,  6.7900e-02, -2.0826e-02,
         -5.0175e-02, -1.1977e-02,  7.0468e-02,  4.7732e-02,  2.3713e-02,
         -1.3724e-02, -5.7130e-03,  4.2292e-02,  1.3804e-02, -1.2827e-02,
          1.0733e-02, -3.8728e-02,  1.7663e-02,  4.5973e-03,  9.4009e-03,
         -6.4760e-03, -1.2174e-02, -1.4917e-02,  3.7665e-03,  6.7884e-03,
         -1.1023e-01, -8.5607e-02, -3.1882e-02, -4.4356e-02,  7.6891e-03,
          5.6485e-02,  5.1927e-02,  2.5315e-02,  4.8374e-02, -2.6516e-03,
         -2.3762e-03, -8.6608e-03, -8.9494e-02,  2.5110e-02,  1.4982e-01,
          4.8533e-02, -9.1607e-03, -5.

In [44]:
# Print the module layout of the loaded SentenceTransformer pipeline.
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)


### Do the Tokenization Yourself

This requires bypassing the sentence-transformers framework and using the underlying Hugging Face tokenizer and model directly.

In [45]:
# Tutorial on Using the model without sentence transformers
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

from transformers import AutoTokenizer

# Load the tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def tokenize_function(examples):
    """Tokenize text with the module-level ``tokenizer``.

    Args:
        examples: Text input forwarded unchanged to the tokenizer
            (the notebook passes a list of sentences).

    Returns:
        The tokenizer's output, requested as PyTorch tensors with padding
        and truncation enabled.
    """
    tokenizer_options = dict(padding=True, truncation=True, return_tensors='pt')
    return tokenizer(examples, **tokenizer_options)

In [46]:
from transformers import AutoModel

# Load the bare transformer for the same MODEL checkpoint.
# NOTE(review): this rebinds `model`, which earlier referred to the
# SentenceTransformer instance.
model = AutoModel.from_pretrained(MODEL)

In [50]:
# Tokenize the example sentences into a padded batch of PyTorch tensors.
encoding = tokenize_function(sentences)

# Forward pass through the transformer. Gradient tracking is not disabled here,
# so the returned tensors stay attached to the autograd graph.
embedding = model(**encoding)

In [48]:
# Inspect the raw model output (token-level hidden states and pooler output).
embedding

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0647, -0.0155,  0.1525,  ..., -0.0533, -0.1189, -0.3003],
         [ 0.6825,  0.4957,  0.0291,  ...,  0.0711,  0.1228,  0.4107],
         [ 0.0888,  0.0129,  0.0130,  ...,  0.0775,  0.8945, -0.1673],
         [-0.2899,  0.4032, -0.1168,  ..., -0.4846, -0.0872,  0.2883],
         [ 0.2138,  0.8209,  0.2821,  ..., -0.2332,  0.4246, -1.8285],
         [ 0.8544,  0.8716, -0.2164,  ..., -0.2195,  0.0575, -1.4919]],

        [[-0.1049, -0.0547,  0.1308,  ..., -0.0340, -0.1849, -0.2871],
         [ 0.5272,  0.3465,  0.1185,  ...,  0.0901,  0.2462,  0.4893],
         [ 0.0279, -0.2499,  0.1144,  ..., -0.3339,  0.7503, -0.2253],
         [-0.0900, -0.3181, -0.2485,  ..., -0.3548,  0.5346,  0.7864],
         [ 0.3286,  0.5703,  0.3674,  ..., -0.3386,  0.3716, -1.6157],
         [ 0.7286,  0.6006, -0.1332,  ..., -0.6608,  0.0972, -1.4084]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 1.5477e-03,  

In [49]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Model output whose first element holds the token
            embeddings (read as ``model_output[0]``).
        attention_mask: Mask with one entry per token. It is expanded over the
            embedding dimension and used as the averaging weight.

    Returns:
        Tensor holding one mask-weighted mean vector per sequence.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the shape of the embeddings so it can weight them.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = torch.sum(hidden_states * weights, 1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_counts


# Pool the token embeddings into one vector per sentence, then scale every
# vector to unit L2 norm.
sentence_embeddings = torch.nn.functional.normalize(
    mean_pooling(embedding, encoding['attention_mask']),
    p=2,
    dim=1,
)

# Header line followed by the tensor, each on its own line.
print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
tensor([[ 5.0483e-02,  8.8006e-02,  4.8748e-03,  3.6269e-02, -1.0183e-03,
          1.9154e-02,  1.4232e-02, -1.8787e-02,  8.1525e-02,  3.7219e-02,
          7.0416e-02, -5.8707e-02, -1.6880e-02, -6.2914e-03, -6.7617e-03,
          1.6698e-02,  4.8856e-02, -1.9675e-02, -8.7432e-02, -4.0718e-03,
         -2.6633e-02,  9.2473e-02, -4.5293e-03,  4.6394e-04, -1.5114e-02,
          9.1224e-03, -4.3226e-02,  6.4198e-02,  6.7900e-02, -2.0826e-02,
         -5.0175e-02, -1.1977e-02,  7.0468e-02,  4.7732e-02,  2.3713e-02,
         -1.3724e-02, -5.7130e-03,  4.2292e-02,  1.3804e-02, -1.2827e-02,
          1.0733e-02, -3.8728e-02,  1.7663e-02,  4.5973e-03,  9.4009e-03,
         -6.4760e-03, -1.2174e-02, -1.4917e-02,  3.7665e-03,  6.7884e-03,
         -1.1023e-01, -8.5607e-02, -3.1882e-02, -4.4356e-02,  7.6891e-03,
          5.6485e-02,  5.1927e-02,  2.5315e-02,  4.8374e-02, -2.6516e-03,
         -2.3762e-03, -8.6608e-03, -8.9494e-02,  2.5110e-02,  1.4982e-01,
          4.8533e

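The embeddings are identical regardless of whether the sentence-transformers library is used. The only difference is that the tensor computed without sentence-transformers ends with grad_fn=\<DivBackward0\>, i.e. it is still attached to the autograd graph because the manual forward pass was run with gradient tracking enabled.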

# Cosine Distance Experiments

Here you can experiment with which target value is needed for which input tensors, in order to find the right loss setup for training your model.

In [63]:
# CosineEmbeddingLoss yields 1 - cos(x1, x2) for positive pairs (target 1) and
# max(0, cos(x1, x2) - margin) for negative pairs (target -1); margin defaults to 0.
loss_fn = torch.nn.CosineEmbeddingLoss()

# Float tensors, like real embeddings (integer tensors cannot carry gradients).
# These two vectors are orthogonal, i.e. their cosine similarity is 0.
x1 = torch.tensor([1.0, 0.0])
x2 = torch.tensor([0.0, 1.0])
# 1 if positive pair and -1 if negative pair
target = torch.tensor(-1)

loss_fn(x1, x2, target)

tensor(0.)

The loss should be minimized. Two completely different (orthogonal) input tensors have a cosine similarity of 0, so they should produce a loss of 1 if the corresponding training pair belongs together (= positive pair, target 1) and a loss of 0 if it does not (= negative pair, target -1).

In [74]:
# With a margin, a negative pair (target -1) is only penalised once its cosine
# similarity exceeds the margin: loss = max(0, cos(x1, x2) - margin).
loss_fn = torch.nn.CosineEmbeddingLoss(margin=0.9)

# Float tensors, like real embeddings (integer tensors cannot carry gradients).
# cos(x1, x2) = 1 / sqrt(2) ~ 0.707, which is below the margin of 0.9.
x1 = torch.tensor([1.0, 1.0])
x2 = torch.tensor([0.0, 1.0])
# 1 if positive pair and -1 if negative pair
target = torch.tensor(-1)

loss_fn(x1, x2, target)

tensor(0.)

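A margin can be applied: it only affects the negative training pairs, whose loss becomes max(0, cos(x1, x2) - margin). The cosine similarity is reduced by the margin amount and clipped at 0, so negative pairs with a similarity below the margin contribute no loss.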