In [2]:
#pip install -U sentence-transformers

In [1]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the all-mpnet-base-v2 sentence-embedding checkpoint
# (model card: https://huggingface.co/sentence-transformers/all-mpnet-base-v2).
# NOTE(review): cache_folder is a machine-specific absolute Windows path;
# it must be adjusted before running this notebook on another machine.
model = SentenceTransformer(
    model_name_or_path="all-mpnet-base-v2",
    cache_folder=r'D:\AI-DATASETS\07-Hugging-Face-Data\sentence-transformers',
)



In [3]:
# Embed three short sentences: two about the weather and one unrelated,
# so the similarity matrix computed afterwards has a clear contrast.
embeddings = model.encode(
    sentences=[
        "The weather is lovely today.",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ],
)

In [4]:
# Compare every embedding with every other one. Passing the same embeddings
# twice yields a square matrix (3 x 3 here) whose diagonal is 1.0, i.e. each
# sentence compared with itself.
# NOTE(review): the similarity function is whatever the loaded model is
# configured with (presumably cosine) - confirm if it matters.
similarities = model.similarity(embeddings, embeddings)
similarities

tensor([[1.0000, 0.6817, 0.0492],
        [0.6817, 1.0000, 0.0421],
        [0.0492, 0.0421, 1.0000]])

#### Ex 02

In [5]:
# Swap in the 'multi-qa-mpnet-base-cos-v1' checkpoint for this example.
# This rebinds `model`, replacing the all-mpnet-base-v2 instance loaded earlier.
# NOTE(review): cache_folder is a machine-specific absolute Windows path.
model = SentenceTransformer(
    model_name_or_path='sentence-transformers/multi-qa-mpnet-base-cos-v1',
    cache_folder=r'D:\AI-DATASETS\07-Hugging-Face-Data\sentence-transformers',
)

In [6]:
# Example corpus: two short statements about London to be embedded
docs = list((
    "Around 9 million people live in London",
    "London is known for its financial district",
))

In [7]:
# Turn every string in `docs` into a vector embedding
doc_emb = model.encode(
    docs,                    # the texts to embed (an iterable of strings)
    show_progress_bar=True,  # show a progress bar while batches are processed
    batch_size=32,           # number of texts embedded per batch
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
# Inspect the shape of the embeddings: with the 2 documents above it is
# (2, 768), i.e. one row per document and 768 values per embedding.

doc_emb.shape

(2, 768)

#### Ex 03

**Cross Encoders**

`Similarity Calculation`: Cross Encoders calculate similarity scores by taking a pair of texts as input and directly computing their similarity. Unlike bi-encoders (Sentence Transformer models), which encode each text separately into its own embedding, Cross Encoders consider both texts simultaneously.

`Performance`: Cross Encoders generally provide superior performance in terms of accuracy because they capture the interactions between the two texts more effectively. This makes them especially good for tasks where precise semantic matching is crucial.

`Computational Cost`: Cross Encoders are often slower than Sentence Transformers because they need to process each pair of texts individually. This results in higher computational costs, especially when comparing many pairs of texts.

`Use Case` - Re-ranking: Due to their higher computational cost, Cross Encoders are commonly used to re-rank the top-k results from a Sentence Transformer model. The typical workflow involves:

- Using a Sentence Transformer to encode a large corpus and quickly retrieve the top-k most similar texts.
- Applying a Cross Encoder to the top-k pairs to re-rank them more accurately.

In [11]:
from sentence_transformers import CrossEncoder

In [13]:
# 1. Instantiate the pre-trained cross-encoder checkpoint.
#    This rebinds `model`, which previously held a SentenceTransformer.
model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [14]:
# 2. Score (question, passage) pairs: the same question is paired with a
#    passage that answers it and one that does not, to contrast the scores.
scores = model.predict([
    ("How many people live in Berlin?", passage)
    for passage in (
        "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
        "Berlin is well known for its museums.",
    )
])

scores

array([ 8.6071415, -4.320076 ], dtype=float32)

In [15]:
# 3. Inputs for the ranking demo: ten candidate passages (mostly about Berlin,
#    plus one about Paris as a distractor) and the question to rank them for.
passages = [
    "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
    "Berlin is well known for its museums.",
    "In 2014, the city state Berlin had 37,368 live births (+6.6%), a record number since 1991.",
    "The urban area of Berlin comprised about 4.1 million people in 2014, making it the seventh most populous urban area in the European Union.",
    "The city of Paris had a population of 2,165,423 people within its administrative city limits as of January 1, 2019",
    "An estimated 300,000-420,000 Muslims reside in Berlin, making up about 8-11 percent of the population.",
    "Berlin is subdivided into 12 boroughs or districts (Bezirke).",
    "In 2015, the total labour force in Berlin was 1.85 million.",
    "In 2013 around 600,000 Berliners were registered in one of the more than 2,300 sport and fitness clubs.",
    "Berlin has a yearly total of about 135 million day visitors, which puts it in third place among the most-visited city destinations in the European Union.",
]

query = "How many people live in Berlin?"

In [16]:
# Score every passage against the query with the cross-encoder. Each returned
# item exposes a 'corpus_id' (an index into `passages`) and a 'score', which is
# how the printing cell consumes it.
# NOTE(review): the printed output appears to be ordered best-first - confirm
# against the library docs if the ordering is relied upon.
ranks = model.rank(query, passages)

In [18]:
# Show the query, then one line per ranked passage: score (2 decimals), tab, text
print("Query:", query)
for hit in ranks:
    hit_score = hit['score']
    hit_text = passages[hit['corpus_id']]
    print(f"{hit_score:.2f}\t{hit_text}")

Query: How many people live in Berlin?
8.92	The urban area of Berlin comprised about 4.1 million people in 2014, making it the seventh most populous urban area in the European Union.
8.61	Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.
8.24	An estimated 300,000-420,000 Muslims reside in Berlin, making up about 8-11 percent of the population.
7.60	In 2014, the city state Berlin had 37,368 live births (+6.6%), a record number since 1991.
6.35	In 2013 around 600,000 Berliners were registered in one of the more than 2,300 sport and fitness clubs.
5.42	Berlin has a yearly total of about 135 million day visitors, which puts it in third place among the most-visited city destinations in the European Union.
3.45	In 2015, the total labour force in Berlin was 1.85 million.
0.33	Berlin is subdivided into 12 boroughs or districts (Bezirke).
-4.24	The city of Paris had a population of 2,165,423 people within its administrative city limits as of Jan

#### Ex 04

- Cross Encoder for re-ranking

In [19]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

In [20]:
# Bi-encoder used for the fast first-stage retrieval (embeds texts independently)
bi_encoder = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Load the cross-encoder used for the re-ranking stage; it is later given
# [query, sentence] pairs via `predict` to produce one score per pair.
# The checkpoint files are downloaded on first use (see the progress bars in
# this cell's output).
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
# Toy inputs for the retrieve-then-re-rank demo: the question being asked
# and the small corpus of candidate sentences to search.
query = "What is this sentence?"
sentences = [
    "This is a sentence.",
    "This is another sentence.",
    "Yet another sentence.",
]

In [23]:
# Embed the query and the candidate sentences with the bi-encoder.
# The two calls are independent of each other.
query_embedding = bi_encoder.encode(query)
sentence_embeddings = bi_encoder.encode(sentences)

In [24]:
# Cosine similarity of the query against every sentence embedding.
# The helper returns a matrix with one row per query; row 0 is taken because
# there is a single query, leaving one score per sentence.
cosine_scores = util.cos_sim(query_embedding, sentence_embeddings)[0]
cosine_scores

tensor([0.6087, 0.6072, 0.5031])

In [25]:
# Keep at most three candidates (fewer if the corpus is smaller) and extract
# the positions of the highest-scoring sentences.
top_k = min(len(sentences), 3)
top_k_indices = cosine_scores.topk(k=top_k).indices
top_k_indices

tensor([0, 1, 2])

In [26]:
# Build one [query, candidate sentence] pair per retrieved index for the
# cross-encoder. The tensor of indices is converted to plain Python ints so
# the list lookup does not rely on implicit tensor-to-index conversion.
pairs = [[query, sentences[position]] for position in top_k_indices.tolist()]

In [27]:
# Score each [query, sentence] pair with the cross-encoder.
# The scores are later zipped with `top_k_indices`, so they are expected to
# come back in the same order as `pairs`.
cross_scores = cross_encoder.predict(pairs)

In [28]:
# Attach each candidate's index to its cross-encoder score, then order the
# (index, score) tuples from highest to lowest score.
re_ranked_results = list(zip(top_k_indices, cross_scores))
re_ranked_results.sort(key=lambda candidate: candidate[1], reverse=True)

print("Re-ranked results:")
for sentence_idx, cross_score in re_ranked_results:
    print(f"Sentence: {sentences[sentence_idx]}, Score: {cross_score}")

Re-ranked results:
Sentence: This is a sentence., Score: 3.8629605770111084
Sentence: This is another sentence., Score: 1.9956350326538086
Sentence: Yet another sentence., Score: -2.280318260192871


Cross Encoders provide a powerful method for computing precise similarity scores between pairs of texts by leveraging the joint processing capabilities of transformer models. They offer higher accuracy for tasks requiring detailed interaction analysis but come with higher computational costs. 

Combining Cross Encoders with bi-encoders allows for efficient and effective retrieval and ranking, making them a valuable tool for various advanced NLP applications.

In [29]:
# Sentence pairs to score directly with the cross-encoder (no retrieval step)
sentence_pairs = [
    ("This is a good book.", "This book is really good."),        # near paraphrase
    ("The weather is nice today.", "It is raining heavily."),     # conflicting statements
    ("I love playing football.", "Soccer is my favorite sport."), # same topic, different wording
    ("She enjoys reading novels.", "He likes watching movies."),  # different subjects and activities
]

In [30]:
# Score every pair in one call; results line up with `sentence_pairs` by position.
# Note the scores are raw, unbounded values (the output below contains both
# large positive and negative numbers), not values in [0, 1].
similarity_scores = cross_encoder.predict(sentence_pairs)

# Report each pair followed by its score, with a blank line between entries
for text_pair, pair_score in zip(sentence_pairs, similarity_scores):
    print(f"Sentence Pair: {text_pair}\nSimilarity Score: {pair_score}\n")

Sentence Pair: ('This is a good book.', 'This book is really good.')
Similarity Score: 7.161910057067871

Sentence Pair: ('The weather is nice today.', 'It is raining heavily.')
Similarity Score: -5.041144847869873

Sentence Pair: ('I love playing football.', 'Soccer is my favorite sport.')
Similarity Score: 3.345524549484253

Sentence Pair: ('She enjoys reading novels.', 'He likes watching movies.')
Similarity Score: -7.894623756408691

