In [1]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
model1 = SentenceTransformer("./all-MiniLM-L6-v2")
model1

No sentence-transformers model found with name ./all-MiniLM-L6-v2. Creating a new one with MEAN pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [4]:
model2 = SentenceTransformer("./bge-large-en-v1.5")
model2

No sentence-transformers model found with name ./bge-large-en-v1.5. Creating a new one with MEAN pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [6]:
def test1(model):
    query_embedding = model.encode("How big is London")
    passage_embedding = model.encode([
        "London has 9,787,426 inhabitants at the 2011 census",
        "London is known for its finacial district",
    ])

    print("Similarity:", util.dot_score(query_embedding, passage_embedding))

test1(model1)
test1(model2)


Similarity: tensor([[18.7014, 19.2733]])
Similarity: tensor([[211.7341, 179.6893]])


In [7]:
def test2(model):
    # Two lists of sentences
    sentences1 = [
        "The cat sits outside",
        "A man is playing guitar",
        "The new movie is awesome",
    ]
    
    sentences2 = [
        "The dog plays in the garden",
        "A woman watches TV",
        "The new movie is so great",
    ]
    
    # Compute embedding for both lists
    embeddings1 = model.encode(sentences1, convert_to_tensor=True)
    embeddings2 = model.encode(sentences2, convert_to_tensor=True)
    
    # Compute cosine-similarities
    cosine_scores = util.cos_sim(embeddings1, embeddings2)
    
    # Output the pairs with their score
    for i in range(len(sentences1)):
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences1[i], sentences2[i], cosine_scores[i][i]
        ))

test2(model1)
test2(model2)

The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The cat sits outside 		 The dog plays in the garden 		 Score: 0.5644
A man is playing guitar 		 A woman watches TV 		 Score: 0.4045
The new movie is awesome 		 The new movie is so great 		 Score: 0.9627


In [9]:
def test3(model):
    # Single list of sentences
    sentences = [
        "The cat sits outside",
        "A man is playing guitar",
        "I love pasta",
        "The new movie is awesome",
        "The cat plays in the garden",
        "A woman watches TV",
        "The new movie is so great",
        "Do you like pizza?",
    ]
    
    # Compute embeddings
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    # Compute cosine-similarities for each sentence with each other sentence
    cosine_scores = util.cos_sim(embeddings, embeddings)
    
    # Find the pairs with the highest cosine similarity scores
    pairs = []
    for i in range(cosine_scores.shape[0]):
        for j in range(cosine_scores.shape[1]):
            pairs.append({"index": [i, j], "score": cosine_scores[i][j]})
    
    # Sort scores in decreasing order
    pairs = sorted(pairs, key=lambda x: x["score"], reverse=True)
    
    for pair in pairs[0:10]:
        i, j = pair["index"]
        print("{} \t\t {} \t\t Score: {:.4f}".format(
            sentences[i], sentences[j], pair["score"]
        ))

test3(model1)
print("====================================================================================================================")
test3(model2)

I love pasta 		 I love pasta 		 Score: 1.0000
The cat sits outside 		 The cat sits outside 		 Score: 1.0000
The cat plays in the garden 		 The cat plays in the garden 		 Score: 1.0000
A woman watches TV 		 A woman watches TV 		 Score: 1.0000
The new movie is so great 		 The new movie is so great 		 Score: 1.0000
Do you like pizza? 		 Do you like pizza? 		 Score: 1.0000
A man is playing guitar 		 A man is playing guitar 		 Score: 1.0000
The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939
The new movie is so great 		 The new movie is awesome 		 Score: 0.8939
I love pasta 		 I love pasta 		 Score: 1.0000
A woman watches TV 		 A woman watches TV 		 Score: 1.0000
The new movie is so great 		 The new movie is so great 		 Score: 1.0000
A man is playing guitar 		 A man is playing guitar 		 Score: 1.0000
The cat sits outside 		 The cat sits outside 		 Score: 1.0000
The cat plays in the garden 		 The cat pla

In [10]:
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""


import torch

def test4(embedder):

    
    # Corpus with example sentences
    corpus = [
        "A man is eating food.",
        "A man is eating a piece of bread.",
        "The girl is carrying a baby.",
        "A man is riding a horse.",
        "A woman is playing violin.",
        "Two men pushed carts through the woods.",
        "A man is riding a white horse on an enclosed ground.",
        "A monkey is playing drums.",
        "A cheetah is running behind its prey.",
    ]
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
    
    # Query sentences:
    queries = [
        "A man is eating pasta.",
        "Someone in a gorilla costume is playing a set of drums.",
        "A cheetah chases prey on across a field.",
    ]
    
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    top_k = min(5, len(corpus))
    for query in queries:
        query_embedding = embedder.encode(query, convert_to_tensor=True)
    
        # We use cosine-similarity and torch.topk to find the highest 5 scores
        cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
        top_results = torch.topk(cos_scores, k=top_k)
    
        print("\n\n======================\n\n")
        print("Query:", query)
        print("\nTop 5 most similar sentences in corpus:")
    
        for score, idx in zip(top_results[0], top_results[1]):
            print(corpus[idx], "(Score: {:.4f})".format(score))
    
        # """
        # Alternatively, we can also use util.semantic_search to perform cosine similarty + topk
        hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5)
        hits = hits[0]      #Get the hits for the first query
        for hit in hits:
            print(corpus[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))
        # """

test4(model1)
print("\n\n")
test4(model2)





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating food. (Score: 0.7035)
A man is eating a piece of bread. (Score: 0.5272)
A man is riding a horse. (Score: 0.1889)
A man is riding a white horse on an enclosed ground. (Score: 0.1047)
A cheetah is running behind its prey. (Score: 0.0980)
A man is eating food. (Score: 0.7035)
A man is eating a piece of bread. (Score: 0.5272)
A man is riding a horse. (Score: 0.1889)
A man is riding a white horse on an enclosed ground. (Score: 0.1047)
A cheetah is running behind its prey. (Score: 0.0980)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.6433)
A woman is playing violin. (Score: 0.2564)
A man is riding a horse. (Score: 0.1389)
A man is riding a white horse on an enclosed ground. (Score: 0.1191)
A cheetah is running behind its prey. (Score: 0.1080)
A monkey is playing drums. (Score: 0.6433)
A woman is playing v

In [20]:

def test5(model):

    docs = [
        "I love python",
        "Python is one of the most popular language in AI development",
        "Python is better than Java.",
        "My first paragraph. That contains information",
        "Python is a programming language.",
    ]
    document_embeddings = model.encode(docs, convert_to_tensor=True)
    
    query = "What is Python?"
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, document_embeddings)[0]
    top_k = min(5, len(docs))
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(docs[idx], "(Score: {:.4f})".format(score))

test5(model1)
test5(model2)





Query: What is Python?

Top 5 most similar sentences in corpus:
Python is a programming language. (Score: 0.9118)
I love python (Score: 0.7800)
Python is one of the most popular language in AI development (Score: 0.6519)
Python is better than Java. (Score: 0.5992)
My first paragraph. That contains information (Score: 0.0640)




Query: What is Python?

Top 5 most similar sentences in corpus:
Python is a programming language. (Score: 0.8517)
I love python (Score: 0.7958)
Python is better than Java. (Score: 0.7570)
Python is one of the most popular language in AI development (Score: 0.7310)
My first paragraph. That contains information (Score: 0.5170)


In [5]:
from sentence_transformers.cross_encoder import CrossEncoder

model = CrossEncoder("./all-MiniLM-L6-v2")

scores = model.predict([["My first", "sentence pair"], ["Second text", "pair"]])
scores = model.predict(['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'])


In [1]:
from FlagEmbedding import FlagReranker
reranker = FlagReranker('./bge-reranker-large', use_fp16=False) # Setting use_fp16 to True speeds up computation with a slight performance degradation

score = reranker.compute_score(['query', 'passage'])
print(score)

scores = reranker.compute_score([['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']])
print(scores)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./bge-reranker-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-0.055359140038490295
[-0.22660410404205322, 0.17735664546489716]


In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('./bge-reranker-large')
model = AutoModelForSequenceClassification.from_pretrained('./bge-reranker-large')
model.eval()

pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print(scores)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./bge-reranker-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([ 0.2681, -0.0465])


In [6]:
print(scores)

[-0.0400845  -0.00561158]


In [22]:
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It output then the most similar sentences for the given query.
"""

# from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# # Pre-trained cross encoder
model = CrossEncoder("cross-encoder/stsb-distilroberta-base")

In [23]:
# We want to compute the similarity between the query sentence
query = "A man is eating pasta."

# With all sentences in the corpus
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

# 1. We rank all sentences in the corpus for the query
ranks = model.rank(query, corpus)

# Print the scores
print("Query:", query)
for rank in ranks:
    print(f"{rank['score']:.2f}\t{corpus[rank['corpus_id']]}")

# 2. Alternatively, you can also manually compute the score between two sentences
sentence_combinations = [[query, sentence] for sentence in corpus]
scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order to get the corpus indices
ranked_indices = np.argsort(scores)[::-1]
print("scores:", scores)
print("indices:", ranked_indices)

Query: A man is eating pasta.
0.67	A man is eating food.
0.34	A man is eating a piece of bread.
0.08	A man is riding a horse.
0.07	A man is riding a white horse on an enclosed ground.
0.01	The girl is carrying a baby.
0.01	Two men pushed carts through the woods.
0.01	A monkey is playing drums.
0.01	A woman is playing violin.
0.01	A cheetah is running behind its prey.
scores: [0.67323726 0.34102532 0.00542465 0.07569354 0.00525378 0.00536814
 0.06676241 0.00534824 0.00516718]
indices: [0 1 3 6 2 5 7 4 8]
