# [SimCSE: Simple Contrastive Learning of Sentence Embeddings](https://aclanthology.org/2021.emnlp-main.552.pdf)


### Dataset
wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/wiki1m_for_simcse.txt

### Reference Code
- https://github.com/bhuvanakundumani/SimCSE_unsupervised/tree/main
- https://github.com/hppRC/simple-simcse/blob/main

<img src="../figures/simcse_architecture.png" width="700" height="300">

In [2]:
import os
import math
import re
from random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm.auto import tqdm

# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Unsupervised SimCSE

## 1. Preprocessing

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset

csv_path = './wiki1m_for_simcse.txt'
# Assuming you have already read the CSV file and created the data list
dataset_df = pd.read_csv(csv_path, header=None, names=['text'], sep='\t')
source_texts = dataset_df["text"].values
target_texts = dataset_df["text"].values
data = list(zip(source_texts, target_texts))

# Split the data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.15, random_state=42, shuffle=False)

# Create datasets
train_dataset = Dataset.from_pandas(
    pd.DataFrame(train_data, columns=['source_text', 'target_text']))
val_dataset = Dataset.from_pandas(
    pd.DataFrame(val_data, columns=['source_text', 'target_text']))

# Create a DatasetDict
raw_dataset = DatasetDict({
    'train': train_dataset.select(list(range(1000))),
    'validation': val_dataset.select(list(range(100)))
})

raw_dataset
# Now you can access your training and validation sets using dataset_dict['train'] and dataset_dict['validation']

DatasetDict({
    train: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['source_text', 'target_text'],
        num_rows: 100
    })
})

## 2. Model & Tokenizer

In [3]:
from transformers import AutoConfig, AutoTokenizer, BertModel

model_name_or_path = 'bert-base-uncased'

# Loading tokenizer and config
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# start from a pretrained bert-base-uncased model
model = BertModel.from_pretrained(model_name_or_path)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
max_seq_length = 32
batch_size = 32 

def preprocess_function(examples):
    source_texts = tokenizer(
        examples['source_text'], max_length=max_seq_length, padding="max_length", truncation=True)
    target_texts = tokenizer(
        examples['target_text'], max_length=max_seq_length, padding="max_length", truncation=True)

    return {
        "source_input_ids": source_texts["input_ids"],
        "source_attention_mask": source_texts["attention_mask"],
        "target_input_ids": target_texts["input_ids"],
        "target_attention_mask": target_texts["attention_mask"],
    }

In [5]:
tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['source_text','target_text'])
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'],
        num_rows: 100
    })
})

## 3. Dataloader

In [7]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
# Initialize the DataLoader with DataCollatorForLanguageModeling
batch_size = 16
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size=batch_size,
    collate_fn=default_data_collator,
)

In [8]:
for batch in train_dataloader:
    break

In [9]:
batch.keys()

dict_keys(['source_input_ids', 'source_attention_mask', 'target_input_ids', 'target_attention_mask'])

## 4. Training

### Pooling
SBERT adds a pooling operation to the output of BERT / RoBERTa to derive a fixed sized sentence embedding

In [10]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

In [11]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

num_train_epochs = 1
gradient_accumulation_steps = 1
learning_rate = 3e-5 
adam_epsilon = 1e-8
warmup_proportion = 0.1
weight_decay = 0.01

num_train_optimization_steps = int(len(train_data) / batch_size / gradient_accumulation_steps) * num_train_epochs

param_optimizer = list(model.named_parameters())
no_decay = ['bias','LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

warmup_steps = int(warmup_proportion * num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters,lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_train_optimization_steps)

## Cosine Similarity 

$$
\text{Cosine Similarity}(A, B) = \frac{{A \cdot B}}{{\|A\| \cdot \|B\|}}
$$

## Contrastive Learning


$$
\ell_i = -\log \frac{e^{\text{sim}(\mathbf{h}^{z_i}, \mathbf{h}^{z'_i}) / \tau}}{\sum_{j=1}^{N} e^{\text{sim}(\mathbf{h}^{z_i}, \mathbf{h}^{z'_j}) / \tau}}
$$



In [18]:
class Similarity(nn.Module):
    """ Dot product or cosine similarity """
    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        return self.cos(x, y) / self.temp
    
cosine_similarity = Similarity(temp = 1)
loss_fct = nn.CrossEntropyLoss()

In [17]:
from tqdm.auto import tqdm

num_epoch = 5
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()  
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        
        # prepare batches and more all to the active device
        inputs_ids_a = batch['source_input_ids'].to(device)
        inputs_ids_b = batch['target_input_ids'].to(device)
        attention_a = batch['source_attention_mask'].to(device)
        attention_b = batch['target_attention_mask'].to(device)
        
        # extract token embeddings from BERT at last_hidden_state
        u = model(
            inputs_ids_a, attention_mask=attention_a
        )  
        v = model(
            inputs_ids_b, attention_mask=attention_b
        )  

        u_last_hidden_state = u.last_hidden_state # all token embeddings A = batch_size, seq_len, hidden_dim
        v_last_hidden_state = v.last_hidden_state # all token embeddings B = batch_size, seq_len, hidden_dim

         # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim
        
        similarity_score = cosine_similarity(u_mean_pool, v_mean_pool)
        labels = torch.arange(similarity_score.size(0)).float().to(device)
        loss = loss_fct(similarity_score, labels)

        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        scheduler.step() # update learning rate scheduler
        
    print(f'Epoch: {epoch} | loss = {loss.item():.6f}')

  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 0 | loss = 58.146614


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 1 | loss = 57.848351


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 2 | loss = 58.293335


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 3 | loss = 58.184685


  0%|          | 0/63 [00:00<?, ?it/s]

Epoch: 4 | loss = 57.945744
