In [104]:
# CASL conda env
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import numpy as np

In [2]:
tokenizer = AutoTokenizer.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco")

Downloading:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

In [3]:
tokenizer

PreTrainedTokenizer(name_or_path='amberoad/bert-multilingual-passage-reranking-msmarco', vocab_size=105879, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
model = AutoModelForSequenceClassification.from_pretrained("amberoad/bert-multilingual-passage-reranking-msmarco")

Downloading:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [5]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [67]:
queries = ['what is stock market?', 'what is stock market?', 'what is stock market?']

docs = ['machine learning is an area of artificial intelligence',
       'stock market is a place to buy and sell company shares',
       'S&P 500 is the biggest index for stock market in US']

In [68]:
encodings = tokenizer(queries, docs, padding = True, return_tensors= 'pt')

In [69]:
encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [70]:
encodings['input_ids']

tensor([[  101, 11523, 10127, 16913, 15200,   136,   102, 14338, 19956, 10127,
         10144, 10793, 10108, 30621, 19334,   102,     0,     0,     0,     0,
             0],
        [  101, 11523, 10127, 16913, 15200,   136,   102, 16913, 15200, 10127,
           143, 11125, 10114, 35172, 10110, 32282, 11062, 43102,   102,     0,
             0],
        [  101, 11523, 10127, 16913, 15200,   136,   102,   161,   111,   158,
         10755, 10127, 10103, 31575, 10534, 10139, 16913, 15200, 10104, 10763,
           102]])

In [71]:
encodings['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [72]:
encodings['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [73]:
for tokens in encodings['input_ids']:
    print(tokenizer.convert_ids_to_tokens(tokens))

['[CLS]', 'what', 'is', 'stock', 'market', '?', '[SEP]', 'machine', 'learning', 'is', 'an', 'area', 'of', 'artificial', 'intelligence', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'what', 'is', 'stock', 'market', '?', '[SEP]', 'stock', 'market', 'is', 'a', 'place', 'to', 'buy', 'and', 'sell', 'company', 'shares', '[SEP]', '[PAD]', '[PAD]']
['[CLS]', 'what', 'is', 'stock', 'market', '?', '[SEP]', 's', '&', 'p', '500', 'is', 'the', 'biggest', 'index', 'for', 'stock', 'market', 'in', 'us', '[SEP]']


In [74]:
model.eval()
with torch.no_grad():
    scores = model(**encodings)
    print(scores)

(tensor([[ 5.9359, -5.1967],
        [-3.6326,  3.4526],
        [ 5.4076, -4.3516]]),)


In [75]:
pt_predictions = F.softmax(scores[0], dim=-1)
pt_predictions

tensor([[9.9999e-01, 1.4628e-05],
        [8.3671e-04, 9.9916e-01],
        [9.9994e-01, 5.7756e-05]])

In [66]:
F.softmax(scores[0], dim=0)

tensor([[0.5674, 0.0920],
        [0.0981, 0.6937],
        [0.3345, 0.2143]])

### Using SentenceTransformers

In [111]:
from sentence_transformers.cross_encoder import CrossEncoder
model = CrossEncoder('amberoad/bert-multilingual-passage-reranking-msmarco', max_length=512)
# model = CrossEncoder('cross-encoder/distilroberta-base-stsb')

In [112]:
query = 'what is stock market?'

docs = ['machine learning is an area of artificial intelligence',
                 'stock market is a place to buy and sell company shares',
                 'S&P 500 is the biggest index for stock market in US']

In [113]:
sentence_combinations = [[query, doc] for doc in docs]

In [114]:
similarity_scores = model.predict(sentence_combinations)
similarity_scores

array([[ 5.935896 , -5.1966953],
       [-3.6325831,  3.4526143],
       [ 5.407645 , -4.3515887]], dtype=float32)

In [115]:
sim_scores_argsort = reversed(np.argsort(similarity_scores))
sim_scores_argsort

<reversed at 0x219a4d52fd0>

In [116]:
list(sim_scores_argsort)

[array([1, 0], dtype=int64),
 array([0, 1], dtype=int64),
 array([1, 0], dtype=int64)]

In [117]:
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], docs[idx]))

Query: what is stock market?
