Using SBERT to find similar sentences

references: 

https://medium.com/mlearning-ai/nice-classification-recommendation-using-sentence-bert-b1af32d0131e
https://www.sbert.net/docs/usage/semantic_textual_similarity.html
https://www.sbert.net/examples/applications/paraphrase-mining/README.html


In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

In [27]:

def cosine(u, v):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    u, v : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(u, v) / (||u|| * ||v||)``. When either vector has zero length
        the ratio is undefined; ``0.0`` is returned in that case instead of
        dividing by zero (which would yield NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # A zero vector has no direction, so treat it as "not similar".
        return 0.0
    return np.dot(u, v) / norm_product

# Title to analyse; this identifier selects the parquet file read below and is
# interpolated into the embeddings file name elsewhere in the notebook.
title_number = "12"

# Read the dataframe for that title and print a summary of its columns.
df = pd.read_parquet("../dataframe/" + title_number + ".parquet")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117306 entries, 0 to 117305
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   p_id          117306 non-null  object
 1   text          117306 non-null  object
 2   child_ids     117306 non-null  object
 3   cfr_links     117306 non-null  object
 4   other_links   117306 non-null  object
 5   link_targets  117306 non-null  object
dtypes: object(6)
memory usage: 5.4+ MB


In [38]:
# Optional: to iterate faster, work on a sample instead of the whole title (currently disabled; uncomment to use)
# sample_size = 1000
# df_sample = df[:sample_size]
# df_sample.info()

In [44]:
# Load the pre-trained SBERT model that produces the sentence embeddings.
# Encoding the whole title (no training happens here) took about 26 minutes on
# the author's non-GPU notebook: 12th Gen Intel i7-1260P (16) @ 4.700GHz.
# NOTE(review): the original note said "32MB RAM", presumably meaning 32GB.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 128

# Create the sentence embeddings, but reuse the cached copy if it can be loaded.
try:
    # The context manager closes the .npz archive once the array has been read.
    with np.load(f"../embeddings/{title_number}_sentence_embeddings.npz") as cached:
        sentence_embeddings = cached['arr_0']
    print("Loaded embeddings from file")
except Exception as err:
    # `Exception` rather than a bare `except:` so that interrupting the kernel
    # (KeyboardInterrupt) while loading does not kick off a lengthy re-encode.
    # Any ordinary failure (missing file, unreadable archive, missing key)
    # still falls back to encoding, and the reason is reported.
    print(f"Could not load cached embeddings ({type(err).__name__}: {err}); encoding from scratch")
    # Pass a plain list of strings so the encoder does not depend on the
    # Series' index labels when it reorders sentences internally.
    sentence_embeddings = sbert_model.encode(df['text'].tolist(), batch_size=batch_size, show_progress_bar=True)
print("Shape of embeddings = ", sentence_embeddings.shape)

Loaded embeddings from file
Shape of embeddings =  (117306, 384)


In [42]:
# Persist the embeddings as a compressed .npz archive so later runs can reload
# them instead of re-encoding. The array is stored under the key 'arr_0'.
np.savez_compressed(
    f"../embeddings/{title_number}_sentence_embeddings.npz",
    arr_0=sentence_embeddings,
)

In [47]:

# Embed a free-text query with the loaded SBERT model.
# Alternative query to try:
# query = 'Transfer of fund via electronic methods such as ATM, POS, internet banking, mobile banking, etc.'
query = 'personal trading of securities and other financial instruments for own account'

# Passing a single string (rather than a one-element list) makes encode()
# return the 1-D embedding vector directly.
query_vec = sbert_model.encode(query)
print("Shape of query embeddings = ", query_vec.shape)

Shape of query embeddings =  (384,)


In [48]:

# Calculate the cosine similarity between the query and every paragraph embedding.
# Done for all rows at once instead of a Python loop: one matrix-vector product
# gives the dot products, which are divided by the product of each row's norm
# and the query's norm.
cfr_sim = (sentence_embeddings @ query_vec) / (
    np.linalg.norm(sentence_embeddings, axis=1) * np.linalg.norm(query_vec)
)

# Stored on the dataframe so the rows can be ranked by it.
df['similarity'] = cfr_sim

# Display the top 20 matches.
# 'min-width' is the CSS property name; the previous 'width-min' is not a CSS
# property, so the styling had no effect.
df.sort_values(by=['similarity'], ascending=False).head(20).style.set_properties(subset=['text'], **{'min-width': '50px'})

Unnamed: 0,p_id,text,child_ids,cfr_links,other_links,link_targets,similarity
40713,248.3(b)(2),(2)Trading account application for certain banking entities.,['248.3(b)(2)(i)' '248.3(b)(2)(ii)'],['/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'],[],['248.3(b)(1)(ii)' '248.3(b)(1)(i)' '248.3(b)(1)(ii)' '248.3(b)(1)(ii)'  '248.3(b)(1)(i)'],0.647494
12124,44.3(b)(2),(2)Trading account application for certain banking entities.,['44.3(b)(2)(i)' '44.3(b)(2)(ii)'],['/on/2023-09-28/title-12/section-44.3#p-44.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-44.3#p-44.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-44.3#p-44.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-44.3#p-44.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-44.3#p-44.3(b)(1)(i)'],[],['44.3(b)(1)(ii)' '44.3(b)(1)(i)' '44.3(b)(1)(ii)' '44.3(b)(1)(ii)'  '44.3(b)(1)(i)'],0.647494
61102,351.3(b)(2),(2)Trading account application for certain banking entities.,['351.3(b)(2)(i)' '351.3(b)(2)(ii)'],['/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'],[],['351.3(b)(1)(ii)' '351.3(b)(1)(i)' '351.3(b)(1)(ii)' '351.3(b)(1)(ii)'  '351.3(b)(1)(i)'],0.647494
40706,248.3(b),(b)Definition of trading account—,['248.3(b)(1)' '248.3(b)(1)(i)' '248.3(b)(1)(ii)' '248.3(b)(1)(iii)'  '248.3(b)(1)(iii)(A)' '248.3(b)(1)(iii)(B)' '248.3(b)(2)'  '248.3(b)(2)(i)' '248.3(b)(2)(ii)' '248.3(b)(3)' '248.3(b)(3)(i)'  '248.3(b)(3)(ii)' '248.3(b)(4)'],['/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(2)(ii)'  '/on/2023-09-28/title-12/part-248/subpart-D'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(2)(ii)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-248.3#p-248.3(b)(1)(i)'],[],['248.3(b)(1)(ii)' '248.3(b)(1)(i)' '248.3(b)(1)(ii)' '248.3(b)(1)(ii)'  '248.3(b)(1)(i)' '248.3(b)(2)(ii)' '248.3(b)(2)(ii)' '248.3(b)(1)(i)'  '248.3(b)(1)(i)'],0.615286
61095,351.3(b),(b)Definition of trading account—,['351.3(b)(1)' '351.3(b)(1)(i)' '351.3(b)(1)(ii)' '351.3(b)(1)(iii)'  '351.3(b)(1)(iii)(A)' '351.3(b)(1)(iii)(B)' '351.3(b)(2)'  '351.3(b)(2)(i)' '351.3(b)(2)(ii)' '351.3(b)(3)' '351.3(b)(3)(i)'  '351.3(b)(3)(ii)' '351.3(b)(4)'],['/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(2)(ii)'  '/on/2023-09-28/title-12/part-351/subpart-D'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(2)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(ii)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'  '/on/2023-09-28/title-12/section-351.3#p-351.3(b)(1)(i)'],[],['351.3(b)(1)(ii)' '351.3(b)(1)(i)' '351.3(b)(1)(ii)' '351.3(b)(1)(ii)'  '351.3(b)(1)(i)' '351.3(b)(2)(ii)' '351.3(b)(2)(ii)' '351.3(b)(1)(ii)'  '351.3(b)(1)(i)' '351.3(b)(1)(i)'],0.615286
112048,1270.1 “Securities Intermediary” (2),"(2)A Person (other than an individual, unless such individual is registered as a broker or dealer under the Federal securities laws), including a bank or broker, that in the ordinary course of its business maintains securities accounts for others and is acting in that capacity.",[],[],[],[],0.606972
115496,1511.1 “Securities Intermediary” (2),"(2)A Person (other than an individual, unless such individual is registered as a broker or dealer under the federal securities laws) including a bank or broker, that in the ordinary course of its business maintains securities accounts for others and is acting in that capacity.",[],[],[],[],0.605385
70357,615.5450(r)(2),"(2)A person (other than an individual, unless such individual is registered as a broker or dealer under the Federal securities laws) including a bank or broker, that in the ordinary course of its business maintains securities accounts for others and is acting in that capacity.",[],[],[],[],0.605385
12119,44.3(b)(1)(i),"(i)Any account that is used by a banking entity to purchase or sell one or more financial instruments principally for the purpose of short-term resale, benefitting from actual or expected short-term price movements, realizing short-term arbitrage profits, or hedging one or more of the positions resulting from the purchases or sales of financial instruments described in this paragraph;",[],[],[],[],0.593689
61097,351.3(b)(1)(i),"(i)Any account that is used by a banking entity to purchase or sell one or more financial instruments principally for the purpose of short-term resale, benefitting from actual or expected short-term price movements, realizing short-term arbitrage profits, or hedging one or more of the positions resulting from the purchases or sales of financial instruments described in this paragraph;",[],[],[],[],0.593689
