# Sentence transformers

In [18]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

from typing import Iterable

import numpy as np
import torch
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader

import transformer_document_embedding as tde

In [3]:
sentences = ["Hi, how are you", "I am fine thanks"]

In [4]:
model = SentenceTransformer("all-distilroberta-v1")
task = tde.tasks.IMDBClassification()

  return torch._C._cuda_getDeviceCount() > 0


In [5]:
class TorchSoftmaxHead(torch.nn.Module):
    def __init__(
        self, input_dim: int, hidden: int, num_classes: int, activation: str
    ) -> None:
        super().__init__()
        self._fc_1 = torch.nn.Linear(input_dim, hidden)
        self._fc_2 = torch.nn.Linear(hidden, num_classes)
        self._activation = torch.nn.ReLU()

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        hidden = self._activation(self._fc_1(inputs))
        outputs = self._fc_2(hidden)
        return outputs


cls_head = TorchSoftmaxHead(model.get_sentence_embedding_dimension(), 32, 2, "asdf")

In [6]:
embeddings = model.encode(sentences)
class_ = cls_head.forward(torch.Tensor(embeddings))

In [7]:
print(class_)

tensor([[0.1288, 0.0815],
        [0.1276, 0.0899]], grad_fn=<AddmmBackward0>)


In [8]:
def ds_head(iterable: Iterable) -> None:
    for i, item in enumerate(iterable):
        print(item)
        print()
        if i > 10:
            break

In [9]:
ds_head(task.train.filter(lambda doc: doc["label"] >= 0))

Found cached dataset imdb (/home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/25 [00:00<?, ?ba/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [10]:


class IMDBTorchDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset: tde.tasks.imdb.IMDBData) -> None:
        self._hf_dataset = hf_dataset.filter(lambda doc: doc["label"] >= 0).with_format(
            "torch"
        )

    def __len__(self) -> int:
        return len(self._hf_dataset)

    def __getitem__(self, index: int) -> tuple[str, torch.Tensor]:
        return self._transform(self._hf_dataset[index])

    def _transform(self, doc: dict) -> tuple[str, torch.Tensor]:
        return doc["text"], doc["label"]


data_loader = DataLoader(IMDBTorchDataset(task.train), batch_size=16, shuffle=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

In [11]:
ds_head(data_loader)

[('My wife and I found this film to be highly unsatisfying. While the plot keeps you interested and busy wondering just what is going on, when you leave the theater, there are just too many loose ends that make no sense at all. (SPOILERS AHEAD) Christopher Plummer, enormously wealthy head of a NY bank has a terrible hidden secret. Profiting from WW II deals with the Nazis and hiding loot stolen from Jews, he keeps the evidence (including diamonds and documents with the Nazi swastika) in a safety deposit box in his bank. Why? If he wants this never to be revealed, why did he not burn and destroy the documents years ago? And the diamonds? Obviously, he does not need them - why keep them rather than dispose of them? How did the bank robbers find out his secret? How did they know to zero in on this very safety deposit box #232? Ace detective Denzel Washington also discovers bank records show SD Boxes No\'s 231 and 233, but no #232. Curious. He meticulously found time somehow to do an exhau

## SBertIMDB

In [12]:
class SBertIMDB(tde.models.ExperimentalModel, torch.nn.Module):
    def __init__(self, log_dir: str, epochs: int) -> None:
        super().__init__()
        self._base_model = SentenceTransformer("all-distilroberta-v1")
        self._cls_head = TorchSoftmaxHead(
            self._base_model.get_sentence_embedding_dimension(),
            32,
            2,
            "",
        )
        self._epochs = epochs
        self._loss = torch.nn.CrossEntropyLoss()
        self._optimizer = torch.optim.Adam(self.parameters())

    # TODO: Rename train to fit? No collision with torch API?
    def train(self, training_data: tde.tasks.imdb.IMDBData) -> None:
        data_loader = DataLoader(
            IMDBTorchDataset(training_data), batch_size=16, shuffle=True
        )
        for epoch in range(self._epochs):
            for inputs, true_outputs in data_loader:
                # print(f"inputs: {inputs}")
                pred_outputs = self.forward(inputs)
                loss = self._loss(pred_outputs, true_outputs)

                loss.backward()
                self._optimizer.step()
                self._optimizer.zero_grad()
                print(f"{epoch}: {loss}")

    def forward(self, inputs: list[str]) -> torch.Tensor:
        embeddings = self._base_model.encode(inputs)
        logits = self._cls_head(embeddings)
        return logits

    def predict(self, inputs: tde.tasks.imdb.IMDBData) -> np.ndarray:
        pass

    def save(self, dir_path: str) -> None:
        pass

    def load(self, dir_path: str) -> None:
        pass


# TODO: The above is crazy, lets create softmax module, pass it to sentence
# transformers, create InputExamples and train it that way

In [13]:
sbert = SBertIMDB("", 10)
sbert.train(task.train)

Loading cached processed dataset at /home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-21856f5aa059f864.arrow


TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [19]:
bert = models.Transformer("bert-base-uncased", max_seq_length=512)
embed_dim = bert.get_word_embedding_dimension()
pooling = models.Pooling(embed_dim)
cls_head = models.Dense(embed_dim, 1, activation_function=torch.nn.Sigmoid())

model = SentenceTransformer(modules=[bert, pooling, cls_head])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
model.encode(sentences)

array([[0.4969878],
       [0.4548001]], dtype=float32)

In [21]:
from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader

In [50]:
training_data = [
    InputExample(texts=[sentence], label=i % 2) for i, sentence in enumerate(sentences)
]
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=2)
loss = torch.nn.CrossEntropyLoss()

In [51]:
class MyLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self._model = model
        self.loss_fct = torch.nn.BCELoss()

    def forward(
        self, sentence_features: Iterable[dict[str, torch.Tensor]], labels: torch.Tensor
    ):
        print(sentence_features)
        rep = self._model(sentence_features[0])
        print(rep)
        rep = rep["sentence_embedding"][:, 0]
        return self.loss_fct(rep, labels.float())

In [52]:

model.fit(
    train_objectives=[(train_dataloader, MyLoss(model))], epochs=1, warmup_steps=0
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

[{'input_ids': tensor([[ 101, 1045, 2572, 2986, 4283,  102,    0],
        [ 101, 7632, 1010, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}]
{'input_ids': tensor([[ 101, 1045, 2572, 2986, 4283,  102,    0],
        [ 101, 7632, 1010, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'token_embeddings': tensor([[[ 0.3015, -0.2576,  0.3002,  ..., -0.0560,  0.5451,  0.3539],
         [ 0.0992, -0.5967,  0.3512,  ..., -0.5013,  0.1805,  0.8264],
         [-0.3740, -0.1559,  0.3085,  ..., -0.5238, -0.0316,  0.5428],
         ...,
         [ 0.1484, -0.9344,  0.4032,  ...,  0.6869,  0.2041,  0.5290],
         [ 0.7704, -0.2393, -0.0937,  ...,  0.1604, -0.6814, -0.1194],
         [-0.0139, -0.1846,  0.4313,  ...