In [None]:
# Compute the cosine similarity between each query and every entry in the corpus.
# Embedding techniques represent words/text mathematically as numeric vectors; the vectors are produced by an encoder,
# ranging from simple schemes such as one-hot encoding to transformer models.
# SentenceTransformer('all-MiniLM-L6-v2') selects which pretrained embedding model we want to use.
# In this example, we load all-MiniLM-L6-v2, which is a MiniLM model fine-tuned on a large dataset of over 1 billion training
# pairs.
# !pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer, util
import torch

# Load the pretrained sentence-embedding model used throughout the notebook.
encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Small example corpus to search over.
corpus = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'The girl is carrying a baby.',
    'A man is riding a horse.',
    'A woman is playing violin.',
    'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'A cheetah is running behind its prey.',
]
# Encode the whole corpus in a single call; the result is returned as a tensor.
embeddings = encoder.encode(corpus, convert_to_tensor=True)

# Sentences we want to look up in the corpus.
queries = [
    'A man is eating pasta.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah chases prey on across a field.',
]

# Find the closest sentences of the corpus for each query sentence based on cosine similarity.
# torch.topk raises if k exceeds the number of scores, so cap k at the corpus size
# (at most 5 results, fewer when the corpus is shorter than 5).
top_k = min(5, len(corpus))
for q in queries:
    query_embedding = encoder.encode(q, convert_to_tensor=True)

    # cos_sim returns a (1 x len(corpus)) score matrix for a single query; take its only row,
    # then let torch.topk pick the highest-scoring entries (returned in descending order).
    cos_scores = util.cos_sim(query_embedding, embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("Query:", q)
    print("\nTop {} most similar sentences in corpus:".format(top_k))
    print("\n")

    # top_results.values holds the scores, top_results.indices the matching corpus positions.
    for score, idx in zip(top_results.values, top_results.indices):
        print(corpus[int(idx)], "(Score: {:.4f})".format(score.item()))

In [None]:
# We can use util.semantic_search instead.
# util.semantic_search performs a cosine similarity search between a list of query embeddings and a list of corpus embeddings.
# It can be used for Information Retrieval / Semantic Search for corpora up to about 1 million entries.
# By default, up to 100 queries are processed in parallel.
# Further, the corpus is split into chunks of up to 500k entries.
# You can increase query_chunk_size and corpus_chunk_size, which leads to increased speed for large corpora,
# but also increases the memory requirement.
# It returns a list with one entry for each query.
# Each entry is a list of dictionaries with the keys 'corpus_id' and 'score',
# sorted by decreasing cosine similarity score.
# Embed every query separately and collect the resulting tensors in a list.
query_embeddings = [
    encoder.encode(query_text, convert_to_tensor=True)
    for query_text in queries
]

# One result list per query, each holding the 5 best-matching corpus entries.
sim_scores = util.semantic_search(query_embeddings, embeddings, top_k=5)
print(sim_scores)

In [None]:
# Text summarization on congress bills (draft laws) dataset
import pandas as pd
# Load the bills table; the 'summary' column contains a text summary of each bill.
WH_legis = pd.read_csv('house_legislation_116.csv')

# Peek at the first summary before any filtering.
print(WH_legis['summary'][0])

# Keep only the bills that have a summary (rows whose summary is NaN are dropped).
WH_legis = WH_legis.dropna(subset=['summary'])

In [None]:
# Renumber the rows 0..n-1 so positions and index labels agree after the NaN rows were dropped.
# The result is assigned back (instead of inplace=True) and the old index is discarded, which keeps
# this cell idempotent: the in-place form adds a new index column on each re-run and eventually raises
# once the generated column name already exists.
WH_legis = WH_legis.reset_index(drop=True)
summary = WH_legis['summary']

queries = ['medication pricing', 'foreigner residence in united states', 'climate change']
# One embedding tensor per query, collected in a list.
query_embeddings = [encoder.encode(q, convert_to_tensor=True) for q in queries]

# Hand the encoder a plain list of strings so encoding does not depend on the Series' index labels.
corpus_embeddings = encoder.encode(summary.tolist(), convert_to_tensor=True)  # it takes some time

# For each query, the 10 bill summaries with the highest cosine similarity.
sim_cos_scores = util.semantic_search(query_embeddings, corpus_embeddings, top_k=10)

print(sim_cos_scores)

In [None]:
# Approximate Nearest Neighbor (ANN) search can be helpful, since the data is partitioned into smaller fractions of similar embeddings.
# That is, the vectors are first grouped by similarity into an index (Annoy, used below, builds a forest of trees for this).
# The index forest can be searched efficiently, and the embeddings with the highest similarity (the nearest neighbors) can be retrieved within milliseconds,
# even if you have millions of vectors. The main disadvantage is that some vectors with high similarity may be missed; that's
# why this is called Approximate Nearest Neighbor.
# For all ANN methods, there are usually one or more parameters to tune that determine the recall-speed trade-off.
# If you want the highest speed, you have a high chance of missing hits. If you want high recall, the search speed decreases.
# AnnoyIndex() takes an argument representing the embedding size (the number of features in an indexed vector); get the min embedding length.

# install annoy
!pip install annoy
from annoy import AnnoyIndex
# Annoy needs the vector length up front. Read it from the embeddings themselves
# (384 for all-MiniLM-L6-v2) instead of hard-coding it, so a different model still works.
embedding_size = corpus_embeddings.shape[1]
n_tree = 200  # number of trees in the forest: more trees give better recall but a larger index and slower build
annoy_index = AnnoyIndex(embedding_size, 'angular')

# Copy the embeddings to a host-side numpy array once; handing Annoy torch tensor rows
# forces an element-by-element conversion for every vector.
corpus_vectors = corpus_embeddings.cpu().numpy()
for i, vector in enumerate(corpus_vectors):
    annoy_index.add_item(i, vector)  # item id i == row position in corpus_embeddings

annoy_index.build(n_tree)  # build the forest of index trees (200 trees)
#annoy_index.save(annoy_index_path)# to save the ANN model to a file

In [None]:
top_k_hits = 5
for q in queries:
    query_embedding = encoder.encode(q, convert_to_tensor=True)
    # Annoy works on plain float vectors, so hand it a host-side numpy copy of the query tensor.
    query_vector = query_embedding.cpu().numpy()

    # Search the top_k_hits closest items.
    # include_distances=True makes Annoy return the distance of every hit alongside its id.
    corpus_ids, distances = annoy_index.get_nns_by_vector(query_vector, top_k_hits, include_distances=True)

    # With the 'angular' metric, Annoy reports the Euclidean distance d between the normalized vectors,
    # d = sqrt(2 * (1 - cos)). Convert it back to cosine similarity, cos = 1 - d^2 / 2, so the scores
    # are comparable with the other cosine-similarity methods.
    hits = [
        {'corpus_id': corpus_id, 'score': 1 - ((distance ** 2) / 2)}
        for corpus_id, distance in zip(corpus_ids, distances)
    ]

    print("\n Input question:", q)
    for hit in hits:
        # Annoy ids are row positions of the encoded summaries, so look them up positionally.
        print("\t{:.3f}\t{}".format(hit['score'], summary.iloc[hit['corpus_id']]))