In [None]:
pip install --upgrade datasets fsspec

In [None]:
from datasets import load_dataset
from huggingface_hub import login
from transformers import BertTokenizer, BertForMaskedLM

import os

# Authenticate against the Hugging Face Hub only when a token is configured.
# The token VALUE must be passed (not the literal string "hf_api_key"), and
# skipping the call when HF_TOKEN is unset avoids an interactive prompt that
# would block "Restart & Run All".
hf_api_key = os.getenv("HF_TOKEN")
if hf_api_key:
    login(token=hf_api_key)

# STS benchmark sentence pairs plus the pretrained BERT checkpoint and its tokenizer.
ds = load_dataset("mteb/stsbenchmark-sts")
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# Hyper-parameters handed to myBertForMaskedLM further down. They have to fit the
# 'bert-base-uncased' checkpoint whose state_dict is loaded into that model.
# NOTE: the spelling `embeding_dim` is kept because other cells reference this name.
vocab_size = 30522    # rows of the word-embedding table
block_size = 512      # rows of the position-embedding table (longest supported sequence)
n_layer = 12          # number of stacked BertLayer blocks in the encoder
embeding_dim = 768    # width of every embedding / hidden vector


class myBertForMaskedLM(nn.Module):
    """Sentence encoder: a BERT backbone followed by a pooling layer.

    Despite its name this module has no masked-LM head; ``forward`` returns
    whatever ``BertPooling`` produces from the backbone's token states.
    """

    def __init__(self, vocab_size, embedding_dim, block_size, n_layer):
        super().__init__()
        # Attribute name `bert` matters: it is the prefix of the state_dict keys.
        self.bert = BertModel(vocab_size, embedding_dim, block_size, n_layer)
        self.pooling = BertPooling()

    def forward(self, x, attention_mask):
        """Encode token ids ``x`` and pool the token states using ``attention_mask``."""
        token_states = self.bert(x, attention_mask)
        return self.pooling(token_states, attention_mask)

class BertModel(nn.Module):
    """BERT backbone: input embeddings followed by the stack of encoder layers.

    Args:
        vocab_size: number of entries in the word-embedding table.
        embedding_dim: width of the embeddings and of every hidden state.
        block_size: number of entries in the position-embedding table.
        n_layer: number of encoder layers.
    """

    def __init__(self, vocab_size, embedding_dim, block_size, n_layer):
        super().__init__()
        self.embeddings = BertEmbeddings(vocab_size, embedding_dim, block_size)
        # Use the constructor argument here; the previous code read a
        # module-level global, so the encoder width ignored `embedding_dim`.
        self.encoder = BertEncoder(n_layer, embedding_dim)

    def forward(self, x, attention_mask):
        """Return the encoder output for token ids ``x``.

        ``attention_mask`` is forwarded unchanged to the encoder.
        """
        hidden = self.embeddings(x)
        return self.encoder(hidden, attention_mask)

class BertEmbeddings(nn.Module):
    """Input embeddings: word + position + token-type, then LayerNorm and dropout.

    Args:
        vocab_size: number of rows of the word-embedding table.
        embedding_dim: width of every embedding vector.
        block_size: number of rows of the position-embedding table, i.e. the
            longest sequence that can be embedded.
        type_vocab_size: number of distinct token-type (segment) ids.
    """

    def __init__(self, vocab_size, embedding_dim, block_size, type_vocab_size=2):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.position_embeddings = nn.Embedding(block_size, embedding_dim)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, embedding_dim)
        self.LayerNorm = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, token_type_ids=None):
        """Embed a batch of token ids.

        Args:
            input_ids: integer tensor of shape (B, T).
            token_type_ids: optional integer tensor of shape (B, T); when
                omitted every token is assigned type 0.

        Returns:
            Tensor of shape (B, T, embedding_dim).
        """
        B, T = input_ids.size()
        if token_type_ids is None:
            token_type_ids = torch.zeros((B, T), dtype=torch.long, device=input_ids.device)

        word_emb = self.word_embeddings(input_ids)
        pos_ids = torch.arange(T, dtype=torch.long, device=input_ids.device).unsqueeze(0).expand(B, T)
        pos_emb = self.position_embeddings(pos_ids)
        token_type_emb = self.token_type_embeddings(token_type_ids)

        # The token-type embedding was previously computed but dropped from the
        # sum, so the learned segment embedding never reached the encoder.
        emb = word_emb + pos_emb + token_type_emb
        emb = self.LayerNorm(emb)
        return self.dropout(emb)

class BertEncoder(nn.Module):
    """A stack of ``n_layer`` BertLayer blocks applied one after the other."""

    def __init__(self, n_layer, n_embed):
        super().__init__()
        # Attribute name `layer` matters: it is part of the state_dict keys.
        blocks = [BertLayer(n_embed) for _ in range(n_layer)]
        self.layer = nn.ModuleList(blocks)

    def forward(self, x, attention_mask):
        """Run ``x`` through every block in order, passing the same mask to each."""
        hidden = x
        for block in self.layer:
            hidden = block(hidden, attention_mask)
        return hidden


class BertLayer(nn.Module):
    """One transformer encoder block: self-attention, then a feed-forward network.

    Each of the two sub-layers is followed by a residual connection and a
    LayerNorm. Those residuals were missing before, so every layer discarded
    its own input.

    Args:
        n_embed: width of the hidden states.
        n_head: number of attention heads (12 for the BERT-base checkpoints).
    """

    def __init__(self, n_embed, n_head=12):
        super().__init__()
        self.attention = BertAttention(n_embed, n_head)
        self.intermediate = BertIntermediate(n_embed)
        self.output = BertOutput(n_embed)

    def forward(self, x, attention_mask):
        """Transform hidden states ``x`` (B, T, n_embed); padding is given by ``attention_mask`` (B, T)."""
        attn_out = self.attention(x, attention_mask)
        expanded = self.intermediate(attn_out)
        # The feed-forward residual is taken around the attention output.
        return self.output(expanded, attn_out)


class BertAttention(nn.Module):
    """Self-attention followed by its output projection, residual and LayerNorm."""

    def __init__(self, n_embed, n_head=12):
        super().__init__()
        self.self = BertSdpaSelfAttention(n_embed, n_head)
        self.output = BertSelfOutput(n_embed)

    def forward(self, x, attention_mask):
        """Return the attended hidden states, same shape as ``x``."""
        context = self.self(x, attention_mask)
        # The residual is taken around the layer input `x`.
        return self.output(context, x)


class BertSdpaSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention with a key padding mask.

    The hidden vector is split into ``n_head`` heads of size
    ``n_embed // n_head``; scores are scaled by the square root of the head
    size. The previous single-head version (scaled by sqrt(n_embed)) could not
    reproduce a checkpoint that was trained with several heads.

    Args:
        n_embed: width of the hidden states.
        n_head: number of attention heads; must divide ``n_embed``.

    Raises:
        ValueError: if ``n_embed`` is not a multiple of ``n_head``.
    """

    def __init__(self, n_embed, n_head=12):
        super().__init__()
        if n_embed % n_head != 0:
            raise ValueError(
                f"n_embed ({n_embed}) must be a multiple of n_head ({n_head})"
            )
        self.query = nn.Linear(n_embed, n_embed)
        self.key = nn.Linear(n_embed, n_embed)
        self.value = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(p=0.1)
        self.n_embed = n_embed
        self.n_head = n_head
        self.head_dim = n_embed // n_head

    def _split_heads(self, t):
        """Reshape (B, T, n_embed) into (B, n_head, T, head_dim)."""
        B, T, _ = t.shape
        return t.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

    def forward(self, x, attention_mask):
        """Attend over the sequence.

        Args:
            x: hidden states of shape (B, T, n_embed).
            attention_mask: tensor of shape (B, T); positions equal to 0 are
                padding and are never attended to.

        Returns:
            Tensor of shape (B, T, n_embed).
        """
        B, T, _ = x.shape
        q = self._split_heads(self.query(x))
        k = self._split_heads(self.key(x))
        v = self._split_heads(self.value(x))

        att = q @ k.transpose(-2, -1) / math.sqrt(self.head_dim)  # (B, H, T, T)

        # Broadcast the key mask over heads and query positions. The most
        # negative finite value is used instead of -inf so that a fully padded
        # sequence yields finite weights rather than NaN.
        key_is_pad = attention_mask[:, None, None, :] == 0  # (B, 1, 1, T)
        att = att.masked_fill(key_is_pad, torch.finfo(att.dtype).min)
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)

        out = att @ v  # (B, H, T, head_dim)
        # Merge the heads back into a single hidden vector per token.
        return out.transpose(1, 2).reshape(B, T, self.n_embed)


class BertSelfOutput(nn.Module):
    """Output projection of the attention sub-layer: dense, dropout, residual, LayerNorm."""

    def __init__(self, n_embed):
        super().__init__()
        self.dense = nn.Linear(n_embed, n_embed)
        self.LayerNorm = nn.LayerNorm(n_embed)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x, input_tensor=None):
        """Project ``x`` and normalise it.

        Args:
            x: attention context, shape (..., n_embed).
            input_tensor: optional residual added before the LayerNorm; when
                omitted no residual is added.
        """
        out = self.dropout(self.dense(x))
        if input_tensor is not None:
            out = out + input_tensor
        return self.LayerNorm(out)


class BertIntermediate(nn.Module):
    """First half of the feed-forward network: expand to 4 * n_embed and apply GELU."""

    def __init__(self, n_embed):
        super().__init__()
        self.dense = nn.Linear(n_embed, 4 * n_embed)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, x):
        """Return the activated expansion of ``x``, shape (..., 4 * n_embed)."""
        return self.intermediate_act_fn(self.dense(x))


class BertOutput(nn.Module):
    """Second half of the feed-forward network: project back, dropout, residual, LayerNorm."""

    def __init__(self, n_embed):
        super().__init__()
        self.dense = nn.Linear(4 * n_embed, n_embed)
        self.LayerNorm = nn.LayerNorm(n_embed)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x, input_tensor=None):
        """Project ``x`` back to n_embed and normalise it.

        Args:
            x: expanded activations, shape (..., 4 * n_embed).
            input_tensor: optional residual added before the LayerNorm; when
                omitted no residual is added.
        """
        x = self.dropout(self.dense(x))
        if input_tensor is not None:
            x = x + input_tensor
        return self.LayerNorm(x)

# The crux of this notebook: mean pooling over the non-padded tokens. :)
class BertPooling(nn.Module):
    """Masked mean pooling: average the token vectors whose mask entry is non-zero."""

    def __init__(self) -> None:
        super().__init__()

    def forward(self, x, attention_mask):
        """Pool token states into one vector per sequence.

        Args:
            x: token states of shape (B, T, C).
            attention_mask: tensor of shape (B, T); entries multiply the token
                vectors, so padded positions (0) do not contribute.

        Returns:
            Tensor of shape (B, C).
        """
        keep = attention_mask.unsqueeze(-1)           # (B, T, 1)
        # Clamp so that a fully masked sequence divides by 1 instead of 0.
        token_count = keep.sum(dim=1).clamp(min=1)    # (B, 1)
        summed = (x * keep).sum(dim=1)                # (B, C)
        return summed / token_count


# We reuse the weights of the pretrained BERT, so we don't need to train the model from scratch :)

mymodel = myBertForMaskedLM(vocab_size=vocab_size, embedding_dim=embeding_dim, block_size=block_size, n_layer=n_layer)

# strict=False tolerates checkpoint entries that this model does not define
# (e.g. the masked-LM head of `model`). It would, however, also hide parameters
# of `mymodel` that received no pretrained value, so check that explicitly.
load_result = mymodel.load_state_dict(model.state_dict(), strict=False)
if load_result.missing_keys:
    raise RuntimeError(
        f"Parameters not found in the pretrained checkpoint: {load_result.missing_keys}"
    )

In [None]:
# Sanity check: every parameter of my model must equal the pretrained BERT parameter of the same name.
# Parameters are matched by name (not by position), so a parameter without a
# pretrained counterpart is reported instead of being skipped silently.

pretrained_params = dict(model.named_parameters())
for name, param in mymodel.named_parameters():
    reference = pretrained_params.get(name)
    if reference is None:
        status = "MISSING"
    elif reference.shape != param.shape:
        status = "SHAPE MISMATCH"
    else:
        status = "OK" if torch.allclose(reference, param, atol=1e-6) else "DIFF"
    print(f"{name}: {status}")

In [None]:
# Data loader used for fine-tuning. From this dataset we only need sentence1, sentence2 and score, so each batch is returned as three items: batch_st1, batch_st2, y.

class Dataloader:
    """Builds tokenized sentence-pair batches with their similarity targets.

    Args:
        B: number of sentence pairs per batch.
        T: length every sentence is padded / truncated to.
        dataset: mapping from split name to a split that supports slicing and
            returns a dict of columns ("sentence1", "sentence2", "score").
            Defaults to the notebook-level ``ds``.
        text_tokenizer: callable with the Hugging Face tokenizer call
            signature. Defaults to the notebook-level ``tokenizer``.
    """

    def __init__(self, B, T, dataset=None, text_tokenizer=None):
        self.B = B
        self.T = T
        # Fall back to the notebook globals only when nothing is injected.
        self.dataset = ds if dataset is None else dataset
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

    def _tokenize(self, sentences):
        """Tokenize a list of sentences to fixed-length PyTorch tensors."""
        return self.tokenizer(
            sentences,
            padding='max_length',
            max_length=self.T,
            truncation=True,
            return_tensors="pt",
        )

    def _generate_batch(self, idx, mode='train'):
        """Return the batch made of rows ``idx .. idx + B - 1`` of split ``mode``.

        Args:
            idx: index of the first row of the batch.
            mode: name of the dataset split to read from.

        Returns:
            ``(batch_1, batch_2, y)``: the tokenized first sentences, the
            tokenized second sentences and the scores divided by 5.0.
        """
        # One slice for all three columns keeps them aligned. The slice covers
        # exactly B rows (the previous bounds returned B + 1), and both
        # sentences use `idx` (the second one previously read an undefined `i`).
        rows = self.dataset[mode][idx: idx + self.B]

        batch_1 = self._tokenize(rows["sentence1"])
        batch_2 = self._tokenize(rows["sentence2"])
        y = torch.tensor(rows["score"], dtype=torch.float32) / 5.0

        return batch_1, batch_2, y


In [None]:
import torch.nn.functional as F

sentences = ["The cat slept peacefully while the car crashed loudly into the wall", "Stock markets are experriencing a major downturn"]
inputs = tokenizer(sentences, padding='longest', return_tensors='pt')

# Per-token vectors (B, T, C) are needed here, so call the backbone directly:
# mymodel(...) would return the pooled sentence vectors (B, C).
# Dropout is switched off for the probe and the previous mode is restored afterwards.
was_training = mymodel.training
mymodel.eval()
with torch.no_grad():
    res = mymodel.bert(inputs.input_ids, inputs.attention_mask)
mymodel.train(was_training)

# Fetch the contextual embedding of two tokens in order to compare them.

token_2 = tokenizer.convert_tokens_to_ids("peacefully")
token_1 = tokenizer.convert_tokens_to_ids("crashed")


def _find_token(input_ids, token_id):
    """Return (batch index, position) of the first occurrence of ``token_id``."""
    hits = (input_ids == token_id).nonzero()
    if hits.shape[0] == 0:
        raise ValueError(f"token id {token_id} does not occur in the batch")
    return tuple(hits[0].tolist())


coor_1 = _find_token(inputs.input_ids, token_1)
coor_2 = _find_token(inputs.input_ids, token_2)
embedding_token_1 = res[coor_1[0], coor_1[1], :]
embedding_token_2 = res[coor_2[0], coor_2[1], :]

F.cosine_similarity(embedding_token_1, embedding_token_2, dim=0)

#NOTE: I'm quite surprised to see that semantically similar tokens are not actually close in the embedding space.

In [None]:
import torch
import torch.nn
import torch.nn.functional as F

d = Dataloader(B=16, T=32)
optimizer = torch.optim.AdamW(mymodel.parameters(), lr=1e-4)
loss_obj = nn.MSELoss()

n_epochs = 10
batches_per_epoch = 32

# Cosine decay over the whole run. T_max equals the number of scheduler steps
# actually taken (a shorter T_max makes the learning rate rise again), and
# eta_min is below the initial lr (with eta_min == lr the schedule is constant).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=n_epochs * batches_per_epoch,
    eta_min=1e-6
)

# Make sure dropout is active, whatever mode an earlier cell left the model in.
mymodel.train()

for epoch in range(n_epochs):
    # Random start indices of this epoch's batches.
    idxs = torch.randint(0, len(ds['train']) - d.B, (batches_per_epoch,))
    for i in idxs:
        optimizer.zero_grad(set_to_none=True)

        batch_1, batch_2, y = d._generate_batch(idx=i.item())

        emb_1 = mymodel(batch_1.input_ids, batch_1.attention_mask)
        emb_2 = mymodel(batch_2.input_ids, batch_2.attention_mask)

        # Regress the cosine similarity of the two sentence embeddings onto the target.
        sim = F.cosine_similarity(emb_1, emb_2, dim=-1)
        loss = loss_obj(sim, y)
        print(f"Loss: {loss.item():.4f}")

        loss.backward()
        optimizer.step()
        scheduler.step()





EVALUATION

In [None]:
# Example sentence pair
sentence1 = "A cat is sleeping on the couch."
sentence2 = "A dog is resting on the sofa."

# Tokenization
inputs1 = tokenizer(sentence1, return_tensors="pt", padding='max_length', truncation=True, max_length=32)
inputs2 = tokenizer(sentence2, return_tensors="pt", padding='max_length', truncation=True, max_length=32)

# Compute the embeddings in eval mode: torch.no_grad() alone does not disable
# dropout, so without eval() the result would change from run to run.
mymodel.eval()
with torch.no_grad():
    emb1 = mymodel(inputs1["input_ids"], inputs1["attention_mask"])  # [1, C]
    emb2 = mymodel(inputs2["input_ids"], inputs2["attention_mask"])  # [1, C]

# Cosine similarity
sim = F.cosine_similarity(emb1, emb2, dim=-1)  # [1]
print(f"Cosine similarity: {sim.item():.4f}")