In [None]:
!pip install datasets -q
!pip install transformers -q
!pip install sentence-transformers -q

---

In [2]:
# load tokenizer from dir

from transformers import AutoTokenizer
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses

from datasets import load_dataset
from sentence_transformers import InputExample
from tqdm.auto import tqdm
from torch.utils.data import DataLoader

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Tokenizer")

------

In [None]:
data = load_dataset('xnli', 'sw')
snli = load_dataset('snli', split='train')

In [None]:
train_samples = []
for row in tqdm(data['train']):
    train_samples.append(InputExample(
        texts=[row['premise'], row['hypothesis']],
        label=row['label']
    )
)

In [None]:
batch_size = 256        # originally 16

loader = DataLoader(
    train_samples, shuffle=True, batch_size=batch_size)

In [None]:
bert = models.Transformer('bert-base-uncased')
pooler = models.Pooling(
    bert.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True
)

model = SentenceTransformer(modules=[bert, pooler])

In [None]:
loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=3)  # XNLI-SW dataset has ['entailment', 'neutral', 'contradiction'] labels

In [None]:
# check gpu memory
!nvidia-smi

In [None]:
epochs = 1
warmup_steps = int(len(loader) * epochs * 0.1)

model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='./sbert_test_b',
    show_progress_bar=False,
)

----

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertModel
from transformers import AutoTokenizer
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses

from datasets import load_dataset
from sentence_transformers import InputExample
from tqdm.auto import tqdm
from torch.utils.data import DataLoader


tokenizer = AutoTokenizer.from_pretrained("Tokenizer")
model = BertModel.from_pretrained('model')



In [8]:
# sample data:

sentences = [
    "mahali pazuri zaidi katika vega kwa ajili ya kiamsha kinywa chunguza tu kikao au jua",
    "Baada ya kulivuta gari langu ingoje kwa dakika nyingine kabla ya kukubaliwa",
    "valentines zenye furaha",
    "alama yangu ya ni zaidi kwa njama",
    "Ambaye hutoa mali yake kwa ajili ya kujitakasa"
]

-  now we can calculate the similarity between the all sentences and plot a 2d  graph


In [9]:
# encode sentences

import torch

input_ids = []
attention_mask = []

for sent in sentences:
    encoded_sent = tokenizer.encode_plus(
        text=sent,
        add_special_tokens=True,
        max_length=128,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    input_ids.append(encoded_sent['input_ids'])
    attention_mask.append(encoded_sent['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)         # this ensures that the input_ids are in the same tensor
attention_mask = torch.cat(attention_mask, dim=0)

# get the embeddings

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

features = last_hidden_states[0][:, 0, :].numpy()

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [10]:
# plot the embeddings

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

X_embedded = TSNE(n_components=2).fit_transform(features)
X_embedded.shape

: 

: 