In [None]:
# https://www.sbert.net/examples/applications/computing-embeddings/README.html

# PART 1

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    #print(token_embeddings.shape)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    #print(input_mask_expanded)
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # print(sum_embeddings.shape)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    # print(sum_embeddings / sum_mask)
    return sum_embeddings / sum_mask

In [None]:
#Load AutoModel from huggingface model repository
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
text1="""Vaisampayana said, 'Having offered oblations, of water unto all their friends and kinsmen, the sons of Pandu,
and Vidura, and Dhritarashtra, and all the Bharata ladies, continued to dwell there (on the banks of the sacred stream).
The high-souled sons of Pandu desired to pass the period of mourning,  which extended for a month, outside the Kuru city.
After king Yudhishthira the just had performed the water-rites, many high-souled sages crowned with ascetic success and many
foremost of regenerate Rishis came there to see the monarch. Among them were the Island-born (Vyasa),
and Narada, and the great Rishi Devala, and Devasthana, and Kanwa. They were all accompanied by best of their pupils.
Many other members of the regenerate order, possessed of wisdom and accomplished in the Vedas, leading lives of domesticity or
belonging to the Snataka class, came to behold the Kuru king. Those high-souled ones, as they came, were duly worshipped by Yudhishthira. Bat, Mat. This is cat but not a rat"""

In [None]:
#Sentences we want sentence embeddings for
# text1='BERT (and other transformer networks) output for each token in our input text an embedding. In order to create a fixed-sized sentence embedding out of this, the model applies mean pooling, i.e., the output embeddings for all tokens are averaged to yield a fixed-sized vector.'

sentences = [text1,]

# sentences = ['This framework generates embeddings for each input sentence',
#              'Sentences are passed as a list of string.',
#              'The quick brown fox jumps over the lazy dog.']



#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

#Perform pooling. In this case, mean pooling
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

torch.Size([1, 384])


In [None]:
s1=sentence_embeddings

In [None]:
s1==sentence_embeddings

In [None]:
sentence_embeddings

In [None]:
# model_output[1]
# tokenizer.decode(encoded_input['input_ids'])
len(encoded_input['input_ids'][0])

260

In [None]:
encoded_input['input_ids'][0]

In [None]:
st=[]
for i in range(len(encoded_input['input_ids'][0])):
  st.append( tokenizer.decode(encoded_input['input_ids'][0][i]))

" ".join(st).replace(" ##","")


"[CLS] vaisampayana said , ' having offered oblations , of water unto all their friends and kinsmen , the sons of pandu , and vidura , and dhritarashtra , and all the bharata ladies , continued to dwell there ( on the banks of the sacred stream ) . the high - souled sons of pandu desired to pass the period of mourning , which extended for a month , outside the kuru city . after king yudhishthira the just had performed the water - rites , many high - souled sages crowned with ascetic success and many foremost of regenerate rishis came there to see the monarch . among them were the island - born ( vyasa ) , and narada , and the great rishi devala , and devasthana , and kanwa . they were all accompanied by best of their pupils . many other members of the regenerate order , possessed of wisdom and accomplished in the vedas , leading lives of domesticity or belonging to the snataka class , came to behold the kuru king . those high - souled ones , as they came , were duly worshipped by yudhi

In [None]:
len(encoded_input['input_ids'][0])

260

In [None]:
encoded_input['input_ids'].shape

torch.Size([1, 128])

In [None]:
sentence_embeddings[0]==sentence_embeddings[1]

# PART 2
https://www.sbert.net/docs/training/overview.html

In [1]:
!pip install -qU sentence_transformers

In [2]:
from sentence_transformers import SentenceTransformer, models

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256) # 440MB Model
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
from sentence_transformers import SentenceTransformer, models
from torch import nn

word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [4]:
model.tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [5]:
model.modules

<bound method Module.modules of SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Dense({'in_features': 768, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
)>

In [20]:
# functions in sentencebert model
i=0
for m in dir(model):
  if m[0:1]!="_" :
    if i%6==0:
      sep="\n"
    else:
      sep=", "
    print(m, end=sep)
    i+=1

T_destination
add_module, append, apply, bfloat16, buffers, call_super_init
children, cpu, cuda, device, double, dump_patches
encode, encode_multi_process, eval, evaluate, extend, extra_repr
fit, float, forward, get_buffer, get_extra_state, get_max_seq_length
get_parameter, get_sentence_embedding_dimension, get_sentence_features, get_submodule, half, insert
ipu, load, load_state_dict, max_seq_length, modules, named_buffers
named_children, named_modules, named_parameters, parameters, pop, register_backward_hook
register_buffer, register_forward_hook, register_forward_pre_hook, register_full_backward_hook, register_full_backward_pre_hook, register_load_state_dict_post_hook
register_module, register_parameter, register_state_dict_pre_hook, requires_grad_, save, save_to_hub
set_extra_state, share_memory, smart_batching_collate, start_multi_process_pool, state_dict, stop_multi_process_pool
to, to_empty, tokenize, tokenizer, train, training
type, xpu, zero_grad, 

In [21]:
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader

model = SentenceTransformer('distilbert-base-nli-mean-tokens') # 265MB Model
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
   InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [3]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

#Define the model. Either from scratch of by loading a pre-trained model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

#Define your train examples. You need more than just two examples...
train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
                  InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)]

#Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

#Tune the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=2, warmup_steps=50)

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

fit(
  - train_objectives: typing.Iterable[typing.Tuple[torch.utils.data.dataloader.DataLoader, torch.nn.modules.module.Module]],
  - evaluator: typing.Optional[sentence_transformers.evaluation.SentenceEvaluator.SentenceEvaluator] = None,
  - epochs: int = 1,
  - steps_per_epoch=None,
  - scheduler: str = 'WarmupLinear',
  - warmup_steps: int = 10000,
  - optimizer_class: typing.Type[torch.optim.optimizer.Optimizer] = <class 'torch.optim.adamw.AdamW'>,
  - optimizer_params: typing.Dict[str, object] = {'lr': 2e-05},
  - weight_decay: float = 0.01,
  - evaluation_steps: int = 0,
  - output_path: typing.Optional[str] = None,
  - save_best_model: bool = True,
  - max_grad_norm: float = 1,
  - use_amp: bool = False,
  - callback: typing.Optional[typing.Callable[[float, int, int], None]] = None,
  - show_progress_bar: bool = True,
  - checkpoint_path: typing.Optional[str] = None,
  - checkpoint_save_steps: int = 500,
  - checkpoint_save_total_limit: int = 0)

In [24]:
from sentence_transformers import evaluation
sentences1 = ['This list contains the first column', 'With your sentences', 'You want your model to evaluate on']
sentences2 = ['Sentences contains the other column', 'The evaluator matches sentences1[i] with sentences2[i]', 'Compute the cosine similarity and compares it to scores[i]']
scores = [0.3, 0.6, 0.2]

evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# ... Your other code to load training data

model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100, evaluator=evaluator, evaluation_steps=500)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1 [00:00<?, ?it/s]

In [35]:
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses
from sentence_transformers.readers import InputExample
train_batch_size=2
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [InputExample(texts=['Sentence from class 0'], label=0), InputExample(texts=['Another sentence from class 0'], label=0),
    InputExample(texts=['Sentence from class 1'], label=1), InputExample(texts=['Sentence from class 2'], label=2)]
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.BatchHardSoftMarginTripletLoss(model=model)

model.fit([(train_dataloader, train_loss)], show_progress_bar=True)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
# for x in train_dataloader:
#   print(x.texts)
for x in train_dataloader.dataset.examples:
  print (x.texts)

['Sentence from class 0']
['Another sentence from class 0']
['Sentence from class 1']
['Sentence from class 2']


In [34]:
train_loss.float

<bound method Module.float of BatchHardSoftMarginTripletLoss(
  (sentence_embedder): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)>

In [36]:
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import InputExample

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [InputExample(texts=['First pair, sent A', 'First pair, sent B'], label=0),
    InputExample(texts=['Second Pair, sent A', 'Second Pair, sent B'], label=3)]
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=2)
train_loss.modules

<bound method Module.modules of SoftmaxLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
  (classifier): Linear(in_features=2304, out_features=2, bias=True)
  (loss_fct): CrossEntropyLoss()
)>

In [37]:
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses
from sentence_transformers.readers import InputExample

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
train_examples = [InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1']),
    InputExample(texts=['Anchor 2', 'Positive 2', 'Negative 2'])]
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)
train_loss.modules

<bound method Module.modules of TripletLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)>

In [39]:
from sentence_transformers import SentenceTransformer, models
word_embedding_model = models.Transformer('bert-base-uncased')

tokens = ["[DOC]", "[QRY]"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 30524. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [40]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/mpnet-base")
model = AutoModelForMaskedLM.from_pretrained("microsoft/mpnet-base") #532MB

Downloading (…)lve/main/config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/472k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/532M [00:00<?, ?B/s]

In [41]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Hello I'm a [MASK] model.")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.10731075704097748,
  'token': 4827,
  'token_str': 'fashion',
  'sequence': "hello i'm a fashion model."},
 {'score': 0.08774492889642715,
  'token': 2535,
  'token_str': 'role',
  'sequence': "hello i'm a role model."},
 {'score': 0.05338392034173012,
  'token': 2047,
  'token_str': 'new',
  'sequence': "hello i'm a new model."},
 {'score': 0.04667224735021591,
  'token': 3565,
  'token_str': 'super',
  'sequence': "hello i'm a super model."},
 {'score': 0.027095869183540344,
  'token': 2986,
  'token_str': 'fine',
  'sequence': "hello i'm a fine model."}]