# Sentence transformers

In [21]:
import os

from transformer_document_embedding.tasks.imdb import IMDBClassification

os.environ["CUDA_VISIBLE_DEVICES"] = ""

from typing import Iterable

import numpy as np
import torch
from sentence_transformers import SentenceTransformer, models
from torch.utils.data import DataLoader

import transformer_document_embedding as tde

In [2]:
sentences = ["Hi, how are you", "I am fine thanks"]

In [3]:
model = SentenceTransformer("all-distilroberta-v1")
task = tde.tasks.IMDBClassification()

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [4]:
class TorchSoftmaxHead(torch.nn.Module):
    def __init__(
        self, input_dim: int, hidden: int, num_classes: int, activation: str
    ) -> None:
        super().__init__()
        self._fc_1 = torch.nn.Linear(input_dim, hidden)
        self._fc_2 = torch.nn.Linear(hidden, num_classes)
        self._activation = torch.nn.ReLU()

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        hidden = self._activation(self._fc_1(inputs))
        outputs = self._fc_2(hidden)
        return outputs


cls_head = TorchSoftmaxHead(model.get_sentence_embedding_dimension(), 32, 2, "asdf")

In [5]:
embeddings = model.encode(sentences)
class_ = cls_head.forward(torch.Tensor(embeddings))

In [6]:
print(class_)

tensor([[-0.0665,  0.1158],
        [-0.0679,  0.1188]], grad_fn=<AddmmBackward0>)


In [7]:
def ds_head(iterable: Iterable) -> None:
    for i, item in enumerate(iterable):
        print(item)
        print()
        if i > 10:
            break

In [8]:
ds_head(task.train.filter(lambda doc: doc["label"] >= 0))

Downloading and preparing dataset imdb/plain_text to /home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/24999 [00:00<?, ?ex/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [9]:


class IMDBTorchDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset: tde.tasks.imdb.IMDBData) -> None:
        self._hf_dataset = hf_dataset.filter(lambda doc: doc["label"] >= 0).with_format(
            "torch"
        )

    def __len__(self) -> int:
        return len(self._hf_dataset)

    def __getitem__(self, index: int) -> tuple[str, torch.Tensor]:
        return self._transform(self._hf_dataset[index])

    def _transform(self, doc: dict) -> tuple[str, torch.Tensor]:
        return doc["text"], doc["label"]


data_loader = DataLoader(IMDBTorchDataset(task.train), batch_size=16, shuffle=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

In [10]:
ds_head(data_loader)

[('After a long period in the space, looking for the remains of planet Krypton, Superman (Brandon Routh) returns to Earth. He misses Lois Lane (Kate Bosworth), who got married and has a son with Richard White (James Marsden). Meanwhile, Lex Luthor (Kevin Spacey) plots an evil plan, using crystals he stole from the Fortress of Solitude, to create a new land and submerge the USA.<br /><br />After so many delightful movies of Superman with the unforgettable Christopher Reeve, or TV shows like "Lois and Clark" (and Teri Hatcher) or "Smallville", a great expectation was created for the return of Superman in this Bryan Singer\'s version. Unfortunately, the awful story is too long and boring, with many unnecessary parts, lack of emotion and overrated in IMDb. In addition, the romance between Lois Lane and Superman is something shamefully ridiculous. The twenty-two years old actress Kate Bosworth is wrongly miscast, playing the role of a mature reporter and experienced mother of a five years o

## SBertIMDB

In [11]:
class SBertIMDB(tde.models.ExperimentalModel, torch.nn.Module):
    def __init__(self, log_dir: str, epochs: int) -> None:
        super().__init__()
        self._base_model = SentenceTransformer("all-distilroberta-v1")
        self._cls_head = TorchSoftmaxHead(
            self._base_model.get_sentence_embedding_dimension(),
            32,
            2,
            "",
        )
        self._epochs = epochs
        self._loss = torch.nn.CrossEntropyLoss()
        self._optimizer = torch.optim.Adam(self.parameters())

    # TODO: Rename train to fit? No collision with torch API?
    def train(self, training_data: tde.tasks.imdb.IMDBData) -> None:
        data_loader = DataLoader(
            IMDBTorchDataset(training_data), batch_size=16, shuffle=True
        )
        for epoch in range(self._epochs):
            for inputs, true_outputs in data_loader:
                # print(f"inputs: {inputs}")
                pred_outputs = self.forward(inputs)
                loss = self._loss(pred_outputs, true_outputs)

                loss.backward()
                self._optimizer.step()
                self._optimizer.zero_grad()
                print(f"{epoch}: {loss}")

    def forward(self, inputs: list[str]) -> torch.Tensor:
        embeddings = self._base_model.encode(inputs)
        logits = self._cls_head(embeddings)
        return logits

    def predict(self, inputs: tde.tasks.imdb.IMDBData) -> np.ndarray:
        pass

    def save(self, dir_path: str) -> None:
        pass

    def load(self, dir_path: str) -> None:
        pass


# TODO: The above is crazy, lets create softmax module, pass it to sentence
# transformers, create InputExamples and train it that way

In [12]:
sbert = SBertIMDB("", 10)
sbert.train(task.train)

Loading cached processed dataset at /home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-b0c62f1d4623f0a4.arrow


TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [13]:
bert = models.Transformer("bert-base-uncased", max_seq_length=512)
embed_dim = bert.get_word_embedding_dimension()
pooling = models.Pooling(embed_dim)
cls_head = models.Dense(embed_dim, 1, activation_function=torch.nn.Sigmoid())

model = SentenceTransformer(modules=[bert, pooling, cls_head])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
model.encode(sentences)

array([[0.5916969],
       [0.5706852]], dtype=float32)

In [15]:
from sentence_transformers import InputExample, losses
from torch.utils.data import DataLoader

In [16]:
training_data = [
    InputExample(texts=[sentence], label=i % 2) for i, sentence in enumerate(sentences)
]
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=2)
loss = torch.nn.CrossEntropyLoss()

In [17]:
class MyLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self._model = model
        self.loss_fct = torch.nn.BCELoss()

    def forward(
        self, sentence_features: Iterable[dict[str, torch.Tensor]], labels: torch.Tensor
    ):
        print(sentence_features)
        rep = self._model(sentence_features[0])
        print(rep)
        rep = rep["sentence_embedding"][:, 0]
        return self.loss_fct(rep, labels.float())

In [18]:

model.fit(
    train_objectives=[(train_dataloader, MyLoss(model))], epochs=1, warmup_steps=0
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

[{'input_ids': tensor([[ 101, 1045, 2572, 2986, 4283,  102,    0],
        [ 101, 7632, 1010, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]])}]
{'input_ids': tensor([[ 101, 1045, 2572, 2986, 4283,  102,    0],
        [ 101, 7632, 1010, 2129, 2024, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'token_embeddings': tensor([[[ 1.6087e-01,  2.1743e-01,  3.3955e-02,  ..., -5.3160e-02,
           1.5297e-01,  3.1409e-01],
         [-4.1646e-02,  1.8957e-01,  4.2093e-01,  ..., -1.2765e-02,
           2.7832e-01,  7.2491e-01],
         [-5.3346e-02, -3.5291e-01,  6.6916e-02,  ...,  4.2143e-03,
           3.5807e-01,  4.9660e-01],
         ...,
         [ 1.6283e-01, -8.3761e-01,  5.9700e-02,  ...,  7.3287e-01,
           4.

In [22]:
task = IMDBClassification()
predict_data = task.test["text"]

Found cached dataset imdb (/home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Loading cached processed dataset at /home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-f549350f5baa041a.arrow
Found cached dataset imdb (/home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/49999 [00:00<?, ?ex/s]

Found cached dataset imdb (/home/dburian/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/24999 [00:00<?, ?ex/s]

In [23]:
ds_head(predict_data)

I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to alway

In [None]:
model.encode(predict_data)