In [None]:
# sbert.net, SentenceTransformers: for sentence, image embeddings
# python 3.6 or above
!pip install -U sentence-transformers

In [3]:
# use cases
# sentence embedding: nlp process to map sentences to vectors of real numbers
# sentence similarity
# sentence search
# clustering
from sentence_transformers import SentenceTransformer, util

##  Generate embeddings

In [13]:
# all-MiniLM-L6-v2: maps sentences & paragraphs to a 384 dimensional
# dene vector space and can be used for tasks like clustering or 
# semantic search
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [14]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [15]:
sentences = [
    "This framework generates embeddings for each input sentence", 
    "Sentences are passed as a list of string.",
    ]
embeddings = model.encode(sentencces)
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173515e-02 -4.28515263e-02 -1.56286191e-02  1.40537461e-02
  3.95537652e-02  1.21796273e-01  2.94333976e-02 -3.17524374e-02
  3.54959629e-02 -7.93140009e-02  1.75878499e-02 -4.04369719e-02
  4.97259013e-02  2.54912358e-02 -7.18700811e-02  8.14968571e-02
  1.47071795e-03  4.79627065e-02 -4.50336374e-02 -9.92175341e-02
 -2.81769596e-02  6.45046309e-02  4.44670394e-02 -4.76217121e-02
 -3.52952704e-02  4.38671745e-02 -5.28566167e-02  4.33056121e-04
  1.01921491e-01  1.64072365e-02  3.26996744e-02 -3.45986672e-02
  1.21339224e-02  7.94870928e-02  4.58344305e-03  1.57778151e-02
 -9.68209282e-03  2.87625547e-02 -5.05805984e-02 -1.55793717e-02
 -2.87906677e-02 -9.62280110e-03  3.15556712e-02  2.27348469e-02
  8.71449336e-02 -3.85027304e-02 -8.84718299e-02 -8.75501614e-03
 -2.12343130e-02  2.08923612e-02 -9.02077407e-02 -5.25731780e-02
 -1.05638774e-02  2.88310535e-02 -1.61455069e-02  6.17838791e-03
 -1.23234

## Cosine-similarity

In [17]:
emb1 = model.encode("To be or not to be, this is a question")
emb2 = model.encode("What is the meaning of life?")
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity: ", cos_sim)

Cosine-Similarity:  tensor([[0.3170]])


In [24]:
emb1 = model.encode("elastic kubernetes service")
emb2 = model.encode("kubernetes")
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity: ", cos_sim)

Cosine-Similarity:  tensor([[0.7611]])


In [66]:
# Define the sentences and punctuation
sentences = [
    "From fairest creatures we desire increase,",
    "That thereby beauty's rose might never die,",
    "But as the riper should by time decease,",
    "His tender heir might bear his memory:",
    "But thou contracted to thine own bright eyes,",
    "Feed'st thy light's flame with self-substantial fuel,",
    "Making a famine where abundance lies,",
    "Thy self thy foe, to thy sweet self too cruel:",
    "Thou art thy mother's glass and she in thee",
    "Calls back the lovely April of her prime,",
    "So thou through windows of thine age shalt see,",
    "Despite of wrinkles this thy golden time.",
    "I ate a breakfast",
    "I have food in the morning"
    
]

# encode 
embeddings = model.encode(sentences)

# compute cosine similarity between all pairs 
cos_sim = util.cos_sim(embeddings, embeddings)

cos_sim

tensor([[ 1.0000,  0.2138,  0.1923,  0.0860,  0.2770,  0.1321,  0.3439,  0.3416,
          0.1560,  0.0824,  0.2171,  0.1350,  0.0687,  0.0982],
        [ 0.2138,  1.0000,  0.2704,  0.2730,  0.3632,  0.1993,  0.1861,  0.3520,
          0.2934,  0.3038,  0.2178,  0.3459,  0.0764, -0.0202],
        [ 0.1923,  0.2704,  1.0000,  0.1779,  0.2300,  0.0971,  0.2947,  0.1151,
          0.1267,  0.1234,  0.2544,  0.3219,  0.1100,  0.2012],
        [ 0.0860,  0.2730,  0.1779,  1.0000,  0.3036,  0.0751,  0.0682,  0.2294,
          0.1932,  0.2045,  0.1986,  0.2721,  0.0542, -0.0148],
        [ 0.2770,  0.3632,  0.2300,  0.3036,  1.0000,  0.2682,  0.1339,  0.4942,
          0.4714,  0.1898,  0.5292,  0.4168,  0.1413,  0.0709],
        [ 0.1321,  0.1993,  0.0971,  0.0751,  0.2682,  1.0000,  0.1854,  0.3150,
          0.2787,  0.1005,  0.1885,  0.1707,  0.0885,  0.0895],
        [ 0.3439,  0.1861,  0.2947,  0.0682,  0.1339,  0.1854,  1.0000,  0.2619,
          0.0824,  0.1079,  0.1198,  0.1217,  0.1

In [58]:
# add all pairs to a list with their cosine similarity score
all_sentence_combinations = [] 
for i in range(len(cos_sim)-1):
    for j in range(i+1, len(cos_sim)):
        all_sentence_combinations.append((cos_sim[i][j], i, j))

all_sentence_combinations

[(tensor(0.2138), 0, 1),
 (tensor(0.1923), 0, 2),
 (tensor(0.0860), 0, 3),
 (tensor(0.2770), 0, 4),
 (tensor(0.1321), 0, 5),
 (tensor(0.3439), 0, 6),
 (tensor(0.3416), 0, 7),
 (tensor(0.1560), 0, 8),
 (tensor(0.0824), 0, 9),
 (tensor(0.2171), 0, 10),
 (tensor(0.1350), 0, 11),
 (tensor(0.0687), 0, 12),
 (tensor(0.0982), 0, 13),
 (tensor(0.2704), 1, 2),
 (tensor(0.2730), 1, 3),
 (tensor(0.3632), 1, 4),
 (tensor(0.1993), 1, 5),
 (tensor(0.1861), 1, 6),
 (tensor(0.3520), 1, 7),
 (tensor(0.2934), 1, 8),
 (tensor(0.3038), 1, 9),
 (tensor(0.2178), 1, 10),
 (tensor(0.3459), 1, 11),
 (tensor(0.0764), 1, 12),
 (tensor(-0.0202), 1, 13),
 (tensor(0.1779), 2, 3),
 (tensor(0.2300), 2, 4),
 (tensor(0.0971), 2, 5),
 (tensor(0.2947), 2, 6),
 (tensor(0.1151), 2, 7),
 (tensor(0.1267), 2, 8),
 (tensor(0.1234), 2, 9),
 (tensor(0.2544), 2, 10),
 (tensor(0.3219), 2, 11),
 (tensor(0.1100), 2, 12),
 (tensor(0.2012), 2, 13),
 (tensor(0.3036), 3, 4),
 (tensor(0.0751), 3, 5),
 (tensor(0.0682), 3, 6),
 (tensor(0.2

In [76]:
### sort list by the highest cosine similarity score 
all_sentence_combinations = sorted(all_sentence_combinations, 
                                   key=lambda x: x[0], reverse=True)
print("Top-5 most similar pairs:")
print(all_sentence_combinations[0:2])
for score, i, j in all_sentence_combinations[0:5]:
    #print(score, i, j)
    print(f'{sentences[i]}\t',f'{sentences[j]}\t', f'{cos_sim[i][j]:.4f}')

Top-5 most similar pairs:
[(tensor(0.6596), 12, 13), (tensor(0.5292), 4, 10)]
I ate a breakfast	 I have food in the morning	 0.6596
But thou contracted to thine own bright eyes,	 So thou through windows of thine age shalt see,	 0.5292
Thou art thy mother's glass and she in thee	 So thou through windows of thine age shalt see,	 0.5073
But thou contracted to thine own bright eyes,	 Thy self thy foe, to thy sweet self too cruel:	 0.4942
But thou contracted to thine own bright eyes,	 Thou art thy mother's glass and she in thee	 0.4714


## semantic search
A query search aims to not only find keywords, but to determine the intent and contextual meaning of the words for search

In [81]:
# multilingual faq retrieval model trained on the MFAQ dataset, 
# it ranks candidate answers according to a given question.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('clips/mfaq')



In [82]:
question = "What is the meaning of life?"
answer_1 = "The meaning of life is to fullfill self dreams."
answer_2 = "Carpe Diem"
answer_3 = "After year of AI winter, AI spring comes in 2023."


query_embedding = model.encode(question)
corpus_embeddings = model.encode([answer_1, answer_2, answer_3])
print(util.semantic_search(query_embedding, corpus_embeddings))

[[{'corpus_id': 0, 'score': 0.7034963369369507}, {'corpus_id': 1, 'score': 0.5959230661392212}, {'corpus_id': 2, 'score': 0.591738224029541}]]


## QA model

In [84]:
from transformers import pipeline 
qa_model = pipeline("question-answering")
question = "How many models can I host in huggingface?"
context =  "Alll plans come with unlimited private models and datasets."
qa_model(question = question,  context = context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.7601079344749451, 'start': 21, 'end': 30, 'answer': 'unlimited'}

## Sentences clustering

In [31]:
from sklearn.cluster import KMeans
import numpy as np 

embedder = SentenceTransformer('all-MiniLM-l6-v2')

In [46]:
corpus = [
    'An animal on the tree.',
    'A man is eating food.',
    'He is eating a piece of bread.',
    'The woman is carrying a baby.',
    'The girl is walking with her sister.',
    'A monkey sits on the tree branch.',
    'There is a gorilla on the tree.'
]
corpus_embeddings = embedder.encode(corpus)

# normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [47]:
# corpus_embeddings[0]

In [48]:
clustering_model = KMeans(n_clusters=3)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

[1 2 2 0 0 1 1]


  super()._check_params_vs_input(X, default_n_init=10)


In [49]:
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])
clustered_sentences


{1: ['An animal on the tree.',
  'A monkey sits on the tree branch.',
  'There is a gorilla on the tree.'],
 2: ['A man is eating food.', 'He is eating a piece of bread.'],
 0: ['The woman is carrying a baby.', 'The girl is walking with her sister.']}