In [1]:
import random
import transformers
import pandas as pd
import numpy as np
import os
import functools
import torch
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import transformers
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, dataloader
from transformers import AutoConfig, AutoTokenizer
from torch.utils.data import DataLoader,SequentialSampler,RandomSampler

# Mô hình

In [2]:
class MLPLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = self.activation(x)
        return x

In [3]:
class Similarity(nn.Module):
    def __init__(self, temp):
        super().__init__()
        self.temp = temp
        self.cos = nn.CosineSimilarity(dim=-1)

    def forward(self, x, y):
        return self.cos(x, y) / self.temp

In [4]:
class Pooler(nn.Module):
    def __init__(self, pooler_type):
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type

    def forward(self, attention_mask, outputs):
        last_hidden = outputs.last_hidden_state
        pooler_output = outputs.pooler_output
        hidden_states = outputs.hidden_states

        if self.pooler_type in ['cls_before_pooler', 'cls']:
            return last_hidden[:, 0]
        else:
            raise NotImplementedError

In [5]:
class ArgumentsModel():
    def __init__(self): 
        self.temp = 0.05 # Temperature for softmax.
        self.pooler_type = 'cls' #What kind of pooler to use 
        # Number of sentences in one instance
        # 2: pair instance; 3: pair instance with a hard negative
        self.num_sent = 2

args_model = ArgumentsModel()

In [6]:
def cl_init(cls, config):
    cls.pooler_type = args_model.pooler_type
    cls.pooler = Pooler(args_model.pooler_type)
    cls.mlp = MLPLayer(config)
    cls.sim = Similarity(temp=args_model.temp)
    cls.init_weights()

In [7]:
def cl_forward(cls,encoder,input_ids=None,attention_mask=None,token_type_ids=None,position_ids=None,
    head_mask=None,inputs_embeds=None,labels=None,output_attentions=None,output_hidden_states=None,
    return_dict=None,
):
    #import ipdb; ipdb.set_trace();
    return_dict = return_dict if return_dict is not None else cls.config.use_return_dict
    batch_size = int(input_ids.size(0)/2)
    
    # Get raw embeddings
    outputs = encoder(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=False if args_model.pooler_type == 'cls' else True,
        return_dict=True,
    )

    # Pooling
    pooler_output = cls.pooler(attention_mask, outputs)
    pooler_output = pooler_output.view((batch_size, args_model.num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden)

    # If using "cls", we add an extra MLP layer
    # (same as BERT's original implementation) over the representation.
    if cls.pooler_type == "cls":
        pooler_output = cls.mlp(pooler_output)

    # Separate representation
    z1, z2 = pooler_output[:,0], pooler_output[:,1]
    cos_sim = cls.sim(z1.unsqueeze(1), z2.unsqueeze(0))
    labels = torch.arange(cos_sim.size(0)).long().to(cls.device)
    loss_fct = nn.CrossEntropyLoss()

    loss = loss_fct(cos_sim, labels)

    if not return_dict:
        output = (cos_sim,) + outputs[2:]
        return ((loss,) + output) if loss is not None else output
    return SequenceClassifierOutput(
        loss=loss,
        logits=cos_sim,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )

In [8]:
def sentemb_forward(cls,encoder,input_ids=None,attention_mask=None,token_type_ids=None,position_ids=None,
    head_mask=None,inputs_embeds=None,labels=None,output_attentions=None,output_hidden_states=None,
    return_dict=None,
):
    return_dict = return_dict if return_dict is not None else cls.config.use_return_dict

    outputs = encoder(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=True if cls.pooler_type in ['avg_top2', 'avg_first_last'] else False,
        return_dict=True,
    )

    pooler_output = cls.pooler(attention_mask, outputs)
    if cls.pooler_type == "cls":
        pooler_output = cls.mlp(pooler_output)

    if not return_dict:
        return (outputs[0], pooler_output) + outputs[2:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        pooler_output=pooler_output,
        last_hidden_state=outputs.last_hidden_state,
        hidden_states=outputs.hidden_states,
    )

In [9]:
class BertForCL(BertPreTrainedModel):
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        #self.model_args = model_kargs["model_args"]
        self.bert = BertModel(config)
        cl_init(self, config)

    def forward(self,input_ids=None,attention_mask=None,token_type_ids=None,position_ids=None,
        head_mask=None,inputs_embeds=None,labels=None,output_attentions=None,output_hidden_states=None,
        return_dict=None,sent_emb=False,
    ):
        if sent_emb:
            return sentemb_forward(self, self.bert,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        else:
            return cl_forward(self, self.bert,
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                labels=labels,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

# Dữ liệu: 
/kaggle/input/wiki1m/wiki1m_for_simcse.txt

In [10]:
class wiki1mData(Dataset):
    def __init__(self, csv_path, training=True, full=False):
        df = pd.read_csv(csv_path, sep='\t', names=["text"])
        df.dropna(inplace=True)
        source_texts = df["text"].values
        target_texts = df["text"].values
        data = list(zip(source_texts, target_texts))
        if full:
            self.data = data
        else: 
            train_data, val_data = train_test_split(data,test_size=0.15,random_state=42,shuffle=False)
            self.data = train_data if training else val_data
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]

In [11]:
class Arguments():
  def __init__(self):
    self.model_name_or_path = 'bert-base-uncased'
    self.max_seq_length = 32
    self.learning_rate = 3e-5 
    self.adam_epsilon = 1e-8
    self.warmup_proportion = 0.1
    self.weight_decay = 0.01
    self.num_train_epochs = 1
    self.gradient_accumulation_steps = 1
    self.pad_to_max_length = True
    self.batch_size = 32 
    self.output_dir = '/kaggle/working/model_outputs'
    self.overwrite = True
    self.local_rank = -1
    self.no_cuda = False

args = Arguments()

In [12]:
def process_batch(txt_list,tokenizer,max_len=args.max_seq_length):
    source_ls = [source for source,target in txt_list]
    target_ls = [target for source,target in txt_list]

    source_tokens = tokenizer(source_ls,truncation=True,padding="max_length",max_length=args.max_seq_length)
    target_tokens = tokenizer(target_ls,truncation=True,padding="max_length",max_length=args.max_seq_length)

    input_ids = []
    attention_mask = []
    token_type_ids = []

    for i in range(len(source_tokens["input_ids"])):
        input_ids.append(source_tokens["input_ids"][i])
        input_ids.append(target_tokens["input_ids"][i])
        attention_mask.append(source_tokens["attention_mask"][i])
        attention_mask.append(target_tokens["attention_mask"][i])
        token_type_ids.append(source_tokens["token_type_ids"][i])
        token_type_ids.append(target_tokens["token_type_ids"][i])

    return torch.tensor(input_ids),torch.tensor(attention_mask),torch.tensor(token_type_ids)

In [13]:
def train_dataloader(train_dataset):
    train_sampler = SequentialSampler(train_dataset)
    model_collate_fn = functools.partial(
                        process_batch,
                        tokenizer=tokenizer,
                        max_len=args.max_seq_length
                        )
    train_dataloader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              sampler=train_sampler,
                              collate_fn=model_collate_fn)
    return train_dataloader

In [14]:
train_data = wiki1mData("/kaggle/input/wiki1m/wiki1m_for_simcse.txt", full=True)

In [15]:
train_data

<__main__.wiki1mData at 0x7f8f0da478b0>

In [16]:
# Loading tokenizer and config
config = AutoConfig.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
train_dataloader = train_dataloader(train_data)

In [18]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f8e45bb8c70>

In [19]:
num_train_optimization_steps = int(len(train_data) / args.batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

In [20]:
num_train_optimization_steps

31107

In [21]:
model = BertForCL.from_pretrained(args.model_name_or_path,config=config)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForCL were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['mlp.dense.bias', 'mlp.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
model.to(device)

BertForCL(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [23]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias','LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
optimizer = AdamW(optimizer_grouped_parameters,lr=args.learning_rate,eps=args.adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_optimization_steps)

In [24]:
for epoch in range(args.num_train_epochs):
  model.train()
  running_loss = 0.0
  for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
  #for input_ids,attention_mask,token_type_ids in train_dataloader:
    batch = tuple(t.to(device) for t in batch)
    input_ids,attention_mask,token_type_ids = batch
    #import ipdb; ipdb.set_trace();
    # zero the parameter gradients
    optimizer.zero_grad()
    outputs = model(input_ids,attention_mask,token_type_ids)
    loss = outputs["loss"]

    if args.gradient_accumulation_steps > 1:
      loss = loss / args.gradient_accumulation_steps
    loss.backward()
    running_loss += loss.item()
    if (step + 1) % args.gradient_accumulation_steps == 0:
      optimizer.step()
      scheduler.step()  # Update learning rate schedule
      model.zero_grad()
    if (step + 1) % 1000 == 0:  # Change the frequency as needed
            print(f"Epoch [{epoch + 1}/{args.num_train_epochs}], Step [{step + 1}/{len(train_dataloader)}], Loss: {running_loss / (step + 1):.4f}")
            
            

Iteration:   3%|▎         | 1001/31108 [03:11<1:35:22,  5.26it/s]

Epoch [1/1], Step [1000/31108], Loss: 0.2067


Iteration:   6%|▋         | 2001/31108 [06:21<1:32:03,  5.27it/s]

Epoch [1/1], Step [2000/31108], Loss: 0.1182


Iteration:  10%|▉         | 3001/31108 [09:31<1:28:53,  5.27it/s]

Epoch [1/1], Step [3000/31108], Loss: 0.0872


Iteration:  13%|█▎        | 4001/31108 [12:41<1:25:51,  5.26it/s]

Epoch [1/1], Step [4000/31108], Loss: 0.0726


Iteration:  16%|█▌        | 5001/31108 [15:51<1:22:37,  5.27it/s]

Epoch [1/1], Step [5000/31108], Loss: 0.0605


Iteration:  19%|█▉        | 6001/31108 [19:01<1:19:36,  5.26it/s]

Epoch [1/1], Step [6000/31108], Loss: 0.0562


Iteration:  23%|██▎       | 7001/31108 [22:11<1:16:28,  5.25it/s]

Epoch [1/1], Step [7000/31108], Loss: 0.0499


Iteration:  26%|██▌       | 8001/31108 [25:22<1:13:14,  5.26it/s]

Epoch [1/1], Step [8000/31108], Loss: 0.0477


Iteration:  29%|██▉       | 9001/31108 [28:32<1:09:59,  5.26it/s]

Epoch [1/1], Step [9000/31108], Loss: 0.0445


Iteration:  32%|███▏      | 10001/31108 [31:42<1:06:58,  5.25it/s]

Epoch [1/1], Step [10000/31108], Loss: 0.0411


Iteration:  35%|███▌      | 11001/31108 [34:52<1:03:41,  5.26it/s]

Epoch [1/1], Step [11000/31108], Loss: 0.0388


Iteration:  39%|███▊      | 12001/31108 [38:02<1:00:34,  5.26it/s]

Epoch [1/1], Step [12000/31108], Loss: 0.0370


Iteration:  42%|████▏     | 13001/31108 [41:13<57:19,  5.26it/s]

Epoch [1/1], Step [13000/31108], Loss: 0.0354


Iteration:  45%|████▌     | 14001/31108 [44:23<54:15,  5.25it/s]

Epoch [1/1], Step [14000/31108], Loss: 0.0349


Iteration:  48%|████▊     | 15001/31108 [47:33<50:58,  5.27it/s]

Epoch [1/1], Step [15000/31108], Loss: 0.0335


Iteration:  51%|█████▏    | 16001/31108 [50:43<47:51,  5.26it/s]

Epoch [1/1], Step [16000/31108], Loss: 0.0320


Iteration:  55%|█████▍    | 17001/31108 [53:53<44:35,  5.27it/s]

Epoch [1/1], Step [17000/31108], Loss: 0.0307


Iteration:  58%|█████▊    | 18001/31108 [57:03<41:26,  5.27it/s]

Epoch [1/1], Step [18000/31108], Loss: 0.0293


Iteration:  61%|██████    | 19001/31108 [1:00:13<38:17,  5.27it/s]

Epoch [1/1], Step [19000/31108], Loss: 0.0279


Iteration:  64%|██████▍   | 20001/31108 [1:03:23<35:12,  5.26it/s]

Epoch [1/1], Step [20000/31108], Loss: 0.0267


Iteration:  68%|██████▊   | 21001/31108 [1:06:34<32:02,  5.26it/s]

Epoch [1/1], Step [21000/31108], Loss: 0.0255


Iteration:  71%|███████   | 22001/31108 [1:09:44<28:51,  5.26it/s]

Epoch [1/1], Step [22000/31108], Loss: 0.0247


Iteration:  74%|███████▍  | 23001/31108 [1:12:54<25:41,  5.26it/s]

Epoch [1/1], Step [23000/31108], Loss: 0.0238


Iteration:  77%|███████▋  | 24001/31108 [1:16:04<22:33,  5.25it/s]

Epoch [1/1], Step [24000/31108], Loss: 0.0229


Iteration:  80%|████████  | 25001/31108 [1:19:15<19:20,  5.26it/s]

Epoch [1/1], Step [25000/31108], Loss: 0.0223


Iteration:  84%|████████▎ | 26001/31108 [1:22:25<16:09,  5.27it/s]

Epoch [1/1], Step [26000/31108], Loss: 0.0217


Iteration:  87%|████████▋ | 27001/31108 [1:25:35<13:01,  5.25it/s]

Epoch [1/1], Step [27000/31108], Loss: 0.0209


Iteration:  90%|█████████ | 28001/31108 [1:28:45<09:50,  5.26it/s]

Epoch [1/1], Step [28000/31108], Loss: 0.0203


Iteration:  93%|█████████▎| 29001/31108 [1:31:55<06:40,  5.27it/s]

Epoch [1/1], Step [29000/31108], Loss: 0.0197


Iteration:  96%|█████████▋| 30001/31108 [1:35:06<03:30,  5.26it/s]

Epoch [1/1], Step [30000/31108], Loss: 0.0192


Iteration: 100%|█████████▉| 31001/31108 [1:38:16<00:20,  5.26it/s]

Epoch [1/1], Step [31000/31108], Loss: 0.0186


Iteration: 100%|██████████| 31108/31108 [1:38:36<00:00,  5.26it/s]


In [25]:
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)
model.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)

('/kaggle/working/model_outputs/tokenizer_config.json',
 '/kaggle/working/model_outputs/special_tokens_map.json',
 '/kaggle/working/model_outputs/vocab.txt',
 '/kaggle/working/model_outputs/added_tokens.json',
 '/kaggle/working/model_outputs/tokenizer.json')

In [26]:
# Đặt mô hình vào chế độ đánh giá
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForCL(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [27]:
def get_sentence_embedding(sentence, model, tokenizer, device):
    # Tokenize sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding='max_length', max_length=args.max_seq_length)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Get embedding
    with torch.no_grad():
        outputs = model(sent_emb=True, **inputs)
        sentence_embedding = outputs.pooler_output

    return sentence_embedding

In [28]:
# Example usage of get_sentence_embedding
sentence = "This is a test sentence."
embedding = get_sentence_embedding(sentence, model, tokenizer, device)
print(embedding)

tensor([[-2.0885e-01, -8.1746e-02, -4.7719e-01, -1.9839e-01,  9.6258e-02,
         -1.6265e-01, -5.9050e-01,  1.4705e-01, -1.4234e-01,  1.2036e-01,
         -3.2306e-02,  2.5323e-01, -2.5249e-01,  2.8687e-01, -2.0828e-01,
          1.2084e-01, -3.6191e-01,  2.1067e-01,  3.9963e-03,  2.8630e-01,
          1.4539e-01, -1.8763e-01, -1.0709e-01, -8.1587e-03,  1.8506e-03,
          1.0824e-01, -2.5082e-02, -4.5106e-01,  1.6739e-01, -9.3042e-02,
          1.4792e-01,  2.5533e-01,  1.8882e-02,  6.1786e-01, -4.7192e-01,
         -1.0533e-01,  2.1480e-01,  1.8915e-01, -4.2078e-01,  1.7893e-01,
         -3.9554e-01,  9.7841e-02, -3.3106e-02, -8.3382e-02,  3.4222e-01,
         -1.4916e-01,  4.1210e-01, -1.5587e-01,  3.4349e-01,  1.8986e-01,
         -1.3811e-01,  5.6191e-02, -1.8167e-01,  2.6947e-01, -1.8516e-01,
          2.1178e-01,  5.1079e-01,  1.8339e-01,  8.0980e-02,  2.2657e-01,
         -7.0998e-02, -7.4644e-03,  1.6776e-01, -1.7474e-01,  2.5915e-01,
          7.4296e-02, -1.3381e-01, -3.

In [29]:
from torch.nn.functional import cosine_similarity

# Các câu cần tính toán độ tương đồng
sentence1 = "I love chocolates"
sentence2 = "chocolates are my favourite items."

# Lấy embedding của các câu
embedding1 = get_sentence_embedding(sentence1, model, tokenizer, device)
embedding2 = get_sentence_embedding(sentence2, model, tokenizer, device)

# Tính toán độ tương đồng cosin
similarity = cosine_similarity(embedding1, embedding2)
print(f"Similarity: {similarity.item()}")

Similarity: 0.5812699794769287
