# Semantic Search

`pip install sentence-transformers`

In [None]:
import json
import os
import pickle

from sentence_transformers import SentenceTransformer, util

(1) Select a Model

In [None]:
# Name of the pretrained sentence-transformers checkpoint to load.
# To try the alternative checkpoint, uncomment the next line and comment out
# the assignment after it.
# MODEL = 'msmarco-distilbert-cos-v5'
MODEL = 'msmarco-distilbert-base-tas-b'
#################################################
# Instantiate the encoder; the same object embeds both documents and queries.
model = SentenceTransformer(MODEL)

(2) Select docs folder and whether to save embeddings for future use

In [None]:
# Directory that is walked recursively for JSON documents. Must be set to a
# real path before running this cell.
FOLDER = None
# Base name of the pickle to write ('<SAVE_AS>.pkl'); set to None or '' to
# skip saving.
SAVE_AS = 'filename'
#################################################
if not FOLDER:
    # None is not a walkable path; fail early with an actionable message
    # instead of an opaque error from inside os.walk / os.path.join.
    raise ValueError('Set FOLDER to the directory containing the JSON documents.')

# Every file found below FOLDER.
paths = [
    os.path.join(root, file)
    for root, _dirs, files in os.walk(FOLDER)
    for file in files
]

# Stripped, non-empty 'bodyText' of each JSON document.
docs = []
for path in paths:
    with open(path, encoding='utf-8') as f:
        # `or ''` also covers an explicit JSON null, which the default
        # argument of .get() would not.
        doc = (json.load(f).get('bodyText') or '').strip()
    if doc:
        docs.append(doc)

# Embed all documents in one call; docs and doc_emb are used together later.
doc_emb = model.encode(docs)

if SAVE_AS:
    # Store texts and embeddings in one file so they can be reloaded together.
    with open(f'{SAVE_AS}.pkl', "wb") as fOut:
        pickle.dump({'docs': docs, 'doc_emb': doc_emb}, fOut, protocol=pickle.HIGHEST_PROTOCOL)


... or, instead of encoding the documents again, load previously saved embeddings from a pickle file

In [None]:
# Path of the pickle file to load. The save step writes '<SAVE_AS>.pkl', so
# the '.pkl' extension has to be part of this name.
PICKLE_FILE = 'filename.pkl'
#################################################
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickle files that you created yourself or otherwise trust.
with open(PICKLE_FILE, "rb") as fIn:
    stored_data = pickle.load(fIn)

# Restore the document texts and their embeddings saved under these keys.
docs = stored_data['docs']
doc_emb = stored_data['doc_emb']

(3) Run query

In [None]:
# Free-text search query.
QUERY = 'password'
# Show top N docs
N = 10
#################################################
query_emb = model.encode(QUERY)
# Score every document against the query. Dot product is used here; if you
# switch MODEL to a cosine-similarity checkpoint, use the cos_sim line instead.
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
# scores = util.cos_sim(query_emb, doc_emb)[0].cpu().tolist()

# Pair each document with its score and rank best-first.
doc_score_pairs = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

# Honour N rather than a hard-coded 10 so the setting above takes effect.
for doc, score in doc_score_pairs[:N]:
    # Normalise carriage returns and drop blank lines for compact output.
    doc = doc.replace('\r', '\n')
    doc = '\n'.join([line for line in doc.split('\n') if line.strip()])
    # print('SCORE: ', score, '\n\n')
    print(doc)
    print('\n----------------------------------------------------------------------\n')