# [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://aclanthology.org/2021.emnlp-main.552.pdf)
- unsupervised SimCSE simply predicts the input sentence itself with only dropout
- obtain two different embeddings as “positive pairs”. Then we take other sentences in the same mini-batch as “negatives”
- the model predicts the positive one among negatives

<img src="./figures/simcse_architecture.png" width="700" height="300">

### Reference Code
- https://github.com/bhuvanakundumani/SimCSE_unsupervised/tree/main
- https://github.com/hppRC/simple-simcse/blob/main

In [1]:
import os
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.auto import tqdm

# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Unsupervised SimCSE

## 1. Preprocessing

[Download wiki1m_for_simcse.txt](https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse)

In [21]:
# from datasets import load_dataset

# dataset = load_dataset("princeton-nlp/datasets-for-simcse")

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

csv_path = './wiki1m_for_simcse.txt'
# Assuming you have already read the CSV file and created the data list
dataset_df = pd.read_csv(csv_path, header=None, names=['text'], sep='\t')
source_texts = dataset_df["text"].values
target_texts = dataset_df["text"].values
data = list(zip(source_texts, target_texts))

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.15, random_state=42, shuffle=False)

# Create datasets
train_dataset = Dataset.from_pandas(
    pd.DataFrame(train_data, columns=['source_text', 'target_text']))
val_dataset = Dataset.from_pandas(
    pd.DataFrame(val_data, columns=['source_text', 'target_text']))

train_dataset.shape, val_dataset.shape

((846129, 2), (149318, 2))

In [3]:
# Create a DatasetDict
raw_dataset = DatasetDict({
    'train': train_dataset.select(list(range(10000))),
    'validation': val_dataset.select(list(range(1000)))
})

raw_dataset
# Now you can access your training and validation sets using dataset_dict['train'] and dataset_dict['validation']

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 1000
    })
})

In [4]:
raw_dataset['train'][0]

{'source_text': 'YMCA in South Australia',
 'target_text': 'YMCA in South Australia'}

## 2. Model & Tokenizer

In [5]:
from transformers import AutoConfig, AutoTokenizer, BertModel

model_name_or_path = 'bert-base-uncased'

# Loading tokenizer and config
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# start from a pretrained bert-base-uncased model
model = BertModel.from_pretrained(model_name_or_path)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [6]:
max_seq_length = 32
batch_size = 32 

def preprocess_function(examples):
    source_texts = tokenizer(
        examples['source_text'], max_length=max_seq_length, padding="max_length", truncation=True)
    target_texts = tokenizer(
        examples['target_text'], max_length=max_seq_length, padding="max_length", truncation=True)

    return {
        "source_input_ids": source_texts["input_ids"],
        "source_attention_mask": source_texts["attention_mask"],
        "target_input_ids": target_texts["input_ids"],
        "target_attention_mask": target_texts["attention_mask"],
    }

In [7]:
tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['source_text','target_text'])
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'],
        num_rows: 1000
    })
})

## 3. Dataloader

In [9]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Initialize the DataLoader with DataCollatorForLanguageModeling
batch_size = 16

train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
)

eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=batch_size,
    collate_fn=default_data_collator,
)

In [10]:
for batch in train_dataloader:
    break

In [11]:
batch.keys()

dict_keys(['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'])

## 4. Training

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [12]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

In [13]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

num_train_epochs = 1
gradient_accumulation_steps = 1
learning_rate = 1e-5
adam_epsilon = 1e-8
warmup_proportion = 0.1
weight_decay = 0.01

num_train_optimization_steps = int(len(train_data) / batch_size / gradient_accumulation_steps) * num_train_epochs

param_optimizer = list(model.named_parameters())
no_decay = ['bias','LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

warmup_steps = int(warmup_proportion * num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters,lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_train_optimization_steps)

## Cosine Similarity 

$$
\text{sim}(h_1, h_2) = \frac{{h_1 \cdot h_2}}{{\|h_1\| \cdot \|h_2\|}}
$$

## Contrastive Learning


$$
\ell_i = -\log \frac{e^{\text{sim}(\mathbf{h}^{z_i}, \mathbf{h}^{z'_i}) / \tau}}{\sum_{j=1}^{N} e^{\text{sim}(\mathbf{h}^{z_i}, \mathbf{h}^{z'_j}) / \tau}}
$$



In [22]:
import torch
import torch.nn.functional as F

def contrastive_loss(h_z, h_z_prime, temperature=1.0):
    # Compute similarity scores for positive pair (i.e., h_z and h_z_prime)
    sim_positive = F.cosine_similarity(h_z, h_z_prime, dim=-1) / temperature #Positive instance
    # Compute similarity scores for negative pairs (i.e., h_z and other h_z')
    sim_negative = F.cosine_similarity(h_z.unsqueeze(1), h_z_prime, dim=-1) / temperature #Negative instance
    # Compute log probabilities
    log_probs = F.log_softmax(torch.cat([sim_positive.unsqueeze(1), sim_negative], dim=1), dim=1)
    # Negative log likelihood loss for the positive pair
    loss = -log_probs[:, 0].mean()
    return loss

In [27]:
#testing
temperature = 1
batch_size, embedding_dim = 16, 768
h_z = torch.rand((batch_size, embedding_dim))
h_z_prime = torch.rand((batch_size, embedding_dim))

In [25]:
h_z.shape, h_z_prime.shape

(torch.Size([16, 768]), torch.Size([16, 768]))

In [28]:
F.cosine_similarity(h_z, h_z_prime, dim=-1) / temperature

tensor([0.7401, 0.7479, 0.7318, 0.7543, 0.7533, 0.7572, 0.7647, 0.7592, 0.7450,
        0.7379, 0.7322, 0.7749, 0.7625, 0.7619, 0.7343, 0.7602])

In [30]:
h_z.unsqueeze(1).shape

torch.Size([16, 1, 768])

In [32]:
F.cosine_similarity(h_z.unsqueeze(1), h_z_prime, dim=-1) / temperature

tensor([[0.7401, 0.7551, 0.7343, 0.7484, 0.7522, 0.7621, 0.7389, 0.7435, 0.7422,
         0.7484, 0.7330, 0.7532, 0.7458, 0.7675, 0.7463, 0.7539],
        [0.7262, 0.7479, 0.7479, 0.7314, 0.7464, 0.7559, 0.7359, 0.7458, 0.7501,
         0.7480, 0.7161, 0.7475, 0.7586, 0.7508, 0.7470, 0.7393],
        [0.7446, 0.7332, 0.7318, 0.7468, 0.7445, 0.7429, 0.7447, 0.7427, 0.7430,
         0.7331, 0.7407, 0.7499, 0.7416, 0.7418, 0.7334, 0.7383],
        [0.7558, 0.7581, 0.7564, 0.7543, 0.7336, 0.7449, 0.7607, 0.7518, 0.7391,
         0.7497, 0.7418, 0.7560, 0.7460, 0.7446, 0.7490, 0.7444],
        [0.7542, 0.7483, 0.7345, 0.7429, 0.7533, 0.7488, 0.7483, 0.7523, 0.7333,
         0.7582, 0.7431, 0.7406, 0.7584, 0.7435, 0.7573, 0.7576],
        [0.7455, 0.7646, 0.7556, 0.7287, 0.7575, 0.7572, 0.7527, 0.7582, 0.7414,
         0.7540, 0.7239, 0.7357, 0.7489, 0.7520, 0.7441, 0.7466],
        [0.7478, 0.7592, 0.7608, 0.7549, 0.7878, 0.7397, 0.7647, 0.7597, 0.7397,
         0.7574, 0.7474, 0.7619, 0.77

In [33]:
loss = contrastive_loss(h_z, h_z_prime)
print(f"Contrastive Loss: {loss.item()}")

Contrastive Loss: 2.8297362327575684


In [16]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['source_input_ids'].to(device)
        inputs_ids_b = batch['target_input_ids'].to(device)
        attention_a = batch['source_attention_mask'].to(device)
        attention_b = batch['target_attention_mask'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model(inputs_ids_a, attention_mask=attention_a)  
        v = model(inputs_ids_b, attention_mask=attention_b)  

        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim

        loss = contrastive_loss(u_mean_pool, u_mean_pool)
        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        scheduler.step() # update learning rate scheduler
        
    print(f'Epoch: {epoch+1} | loss = {loss.item():.6f}')

  0%|          | 0/625 [00:00<?, ?it/s]

Epoch: 1 | loss = 2.033902


  0%|          | 0/625 [00:00<?, ?it/s]

Epoch: 2 | loss = 2.023149


  0%|          | 0/625 [00:00<?, ?it/s]

Epoch: 3 | loss = 2.033118


  0%|          | 0/625 [00:00<?, ?it/s]

Epoch: 4 | loss = 2.020890


  0%|          | 0/625 [00:00<?, ?it/s]

Epoch: 5 | loss = 2.020044


In [18]:
import torch.nn as nn
import torch

class Similarity(nn.Module):
    """ Dot product or cosine similarity """
    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        return self.cos(x, y) / self.temp
    
cosine_similarity = Similarity(temp = 1)

In [19]:
total_similarity = 0
# from sklearn.metrics.pairwise import cosine_similarity

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # prepare batches and more all to the active device
        inputs_ids_a = batch['source_input_ids'].to(device)
        inputs_ids_b = batch['target_input_ids'].to(device)
        attention_a = batch['source_attention_mask'].to(device)
        attention_b = batch['target_attention_mask'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model(inputs_ids_a, attention_mask=attention_a)  
        v = model(inputs_ids_b, attention_mask=attention_b)  

        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim

        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool).mean()
        total_similarity += similarity_score
    
average_similarity = total_similarity / len(eval_dataloader)
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 0.9051


## 5. Inference

In [34]:
import torch
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer(sentence_a, return_tensors='pt', truncation=True, padding=True).to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', truncation=True, padding=True).to(device)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids']
    attention_a = inputs_a['attention_mask']
    inputs_ids_b = inputs_b['input_ids']
    attention_b = inputs_b['attention_mask']

    # Extract token embeddings from BERT
    u = model(inputs_ids_a, attention_mask=attention_a)[0]  # all token embeddings A = batch_size, seq_len, hidden_dim
    v = model(inputs_ids_b, attention_mask=attention_b)[0]  # all token embeddings B = batch_size, seq_len, hidden_dim

    # Get the mean-pooled vectors
    u = mean_pool(u, attention_a).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim
    v = mean_pool(v, attention_b).detach().cpu().numpy().reshape(-1)  # batch_size, hidden_dim

    # Calculate cosine similarity
    similarity_score = cosine_similarity(u.reshape(1, -1), v.reshape(1, -1))[0, 0]

    return similarity_score

# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.5799


In [37]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.8270
