Using SBERT to find similar sentences

references: 

https://medium.com/mlearning-ai/nice-classification-recommendation-using-sentence-bert-b1af32d0131e

https://www.sbert.net/docs/usage/semantic_textual_similarity.html

https://www.sbert.net/examples/applications/paraphrase-mining/README.html


In [12]:
# %pip install sentence-transformers
# %pip install pandas 
# %pip install numpy
# %pip install iProgress
# %pip install pyarrow

In [17]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import os

In [13]:
# load the dataframe 
title_number = "12"
df = pd.read_parquet(f"../dataframe/{title_number}.parquet")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117306 entries, 0 to 117305
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   p_id          117306 non-null  object
 1   text          117306 non-null  object
 2   child_ids     117306 non-null  object
 3   cfr_links     117306 non-null  object
 4   other_links   117306 non-null  object
 5   link_targets  117306 non-null  object
dtypes: object(6)
memory usage: 5.4+ MB


In [38]:
# Rather than use the whole title, let's just use a sample
# sample_size = 1000
# df_sample = df[:sample_size]
# df_sample.info()

In [14]:
#Load the SBERT model
#Takes 26 minutes to train on my non-GPU notebook 
#12th Gen Intel i7-1260P (16) @ 4.700GHz, 32MB RAM
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 128

#Create the sentence embeddings, but load if it already exists
try:
    sentence_embeddings = np.load(f"../embeddings/{title_number}_sentence_embeddings.npz")['arr_0']
    loaded_from_file = True
    print("Loaded embeddings from file")
except:
    loaded_from_file = False
    sentence_embeddings = sbert_model.encode(df['text'], batch_size=batch_size, show_progress_bar=True)
print("Shape of embeddings = ", sentence_embeddings.shape)
# sentence_embeddings[0]

Batches: 100%|██████████| 917/917 [13:36<00:00,  1.12it/s]


Shape of embeddings =  (117306, 384)


In [18]:
# save the embeddings
if not loaded_from_file:
    os.makedirs("../embeddings", exist_ok=True)
    np.savez_compressed(f"../embeddings/{title_number}_sentence_embeddings.npz", sentence_embeddings)

In [19]:

#Create the sentence embeddings for the example product description

query = 'Transfer of fund via electronic methods such as ATM, POS, internet banking, mobile banking, etc.'
# query = 'personal trading of securities and other financial instruments for own account'
query_vec = sbert_model.encode([query])[0]
print("Shape of query embeddings = ", query_vec.shape)

Shape of query embeddings =  (384,)


In [20]:

#Calculate the similarity of the query to each text block in the corpus
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

cfr_sim = []
for cfr in sentence_embeddings:
    cfr_sim.append(cosine(query_vec, cfr))

df['similarity'] = cfr_sim

#Display the top matches
df.sort_values(by=['similarity'], ascending=False).head(5).style.set_properties(subset=['text'], **{'width-min': '50px'})

Unnamed: 0,p_id,text,child_ids,cfr_links,other_links,link_targets,similarity
18333,205.3(b),(b)Electronic fund transfer—,['205.3(b)(1)' '205.3(b)(1)(i)' '205.3(b)(1)(ii)' '205.3(b)(1)(iii)'  '205.3(b)(1)(iv)' '205.3(b)(1)(v)' '205.3(b)(2)' '205.3(b)(2)(i)'  '205.3(b)(2)(ii)' '205.3(b)(2)(iii)' '205.3(b)(2)(iv)' '205.3(b)(3)'  '205.3(b)(3)(i)' '205.3(b)(3)(ii)' '205.3(b)(3)(iii)'],['/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)(iii)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)(iii)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(3)(ii)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(3)(i)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(3)(ii)'],[],['205.3(b)(2)(iii)' '205.3(b)(2)(iii)' '205.3(b)(2)' '205.3(b)(3)(ii)'  '205.3(b)(3)(i)' '205.3(b)(3)(ii)'],0.738358
84092,1005.3(b),(b)Electronic fund transfer—,['1005.3(b)(1)' '1005.3(b)(1)(i)' '1005.3(b)(1)(ii)' '1005.3(b)(1)(iii)'  '1005.3(b)(1)(iv)' '1005.3(b)(1)(v)' '1005.3(b)(2)' '1005.3(b)(2)(i)'  '1005.3(b)(2)(ii)' '1005.3(b)(2)(iii)' '1005.3(b)(3)' '1005.3(b)(3)(i)'  '1005.3(b)(3)(ii)'],['/on/2023-09-28/title-12/section-1005.3#p-1005.3(b)(2)'  '/on/2023-09-28/title-12/section-1005.3#p-1005.3(b)(3)(ii)'  '/on/2023-09-28/title-12/section-1005.3#p-1005.3(b)(3)(i)'],[],['1005.3(b)(2)' '1005.3(b)(3)(ii)' '1005.3(b)(3)(i)'],0.738358
18340,205.3(b)(2),(2)Electronic fund transfer using information from a check.,['205.3(b)(2)(i)' '205.3(b)(2)(ii)' '205.3(b)(2)(iii)' '205.3(b)(2)(iv)'],['/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)(iii)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)(iii)'  '/on/2023-09-28/title-12/section-205.3#p-205.3(b)(2)'],[],['205.3(b)(2)(iii)' '205.3(b)(2)(iii)' '205.3(b)(2)'],0.731448
84099,1005.3(b)(2),(2)Electronic fund transfer using information from a check.,['1005.3(b)(2)(i)' '1005.3(b)(2)(ii)' '1005.3(b)(2)(iii)'],['/on/2023-09-28/title-12/section-1005.3#p-1005.3(b)(2)'],[],['1005.3(b)(2)'],0.731448
85036,Supplement-I-to-Part-1005 1.,1.Fund transfers covered.The term “electronic fund transfer” includes:,['Supplement-I-to-Part-1005(1.)(i.)' 'Supplement-I-to-Part-1005(1.)(ii.)'  'Supplement-I-to-Part-1005(1.)(iii.)'  'Supplement-I-to-Part-1005(1.)(iv.)' 'Supplement-I-to-Part-1005(1.)(v.)'  'Supplement-I-to-Part-1005(1.)(vi.)'],[],[],[],0.730888


In [62]:
# using util, much simpler!
result = util.semantic_search(query_vec, sentence_embeddings, top_k=5)
result_p_id = [df.iloc[r.get('corpus_id')]['p_id'] for r in result[0]]
result_text = [df.iloc[r.get('corpus_id')]['text'] for r in result[0]]
result_score = [r.get('score') for r in result[0]]
result_df = pd.DataFrame({'p_id': result_p_id, 'text': result_text, 'score': result_score})
result_df.style.set_properties(subset=['text'], **{'width-min': '50px'})


Unnamed: 0,p_id,text,score
0,205.3(b),(b)Electronic fund transfer—,0.738359
1,1005.3(b),(b)Electronic fund transfer—,0.738359
2,1005.3(b)(2),(2)Electronic fund transfer using information from a check.,0.731448
3,205.3(b)(2),(2)Electronic fund transfer using information from a check.,0.731448
4,Supplement-I-to-Part-1005 1.,1.Fund transfers covered.The term “electronic fund transfer” includes:,0.730888


In [23]:
# Now let's use the paraphrase model
sentences = df['text'].tolist()
paraphrases = util.paraphrase_mining(sbert_model, sentences, 
                                     query_chunk_size=32, corpus_chunk_size=len(sentences),
                                     top_k=1, max_pairs=100,
                                     show_progress_bar=True)

for paraphrase in paraphrases[0:10]:
    score, i, j = paraphrase
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], score))

Batches:   2%|▏         | 77/3666 [01:46<1:22:42,  1.38s/it]


KeyboardInterrupt: 