## Build a Semantic Search System with the Haystack Python Framework

In [19]:
# Approach 1: dense sentence embeddings compared by cosine similarity
# pros: more accurate
# cons: slower, more complex

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained sentence-embedding model by its name.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Tiny demo corpus to search over.
sentences = [
    'This is an example sentence',
    'Each sentence is converted into a list of embeddings',
    'These embeddings are then compared for semantic similarity',
]

# Encode the whole corpus in one call.
sentence_embeddings = model.encode(sentences)

query = 'This is a sample query'
# encode() is given a one-item list, so element 0 is the query's own vector.
query_embedding = model.encode([query])[0]

# One query row against every corpus row; row 0 of the result holds the
# query-vs-sentence similarity scores.
cos_scores = cosine_similarity([query_embedding], sentence_embeddings)[0]

# argsort is ascending: reverse it and keep the first five indices, i.e. the
# (at most) five best matches, highest score first.
top_results = cos_scores.argsort()[::-1][:5]

print("\n\n======================\n\n")
print("Query:", query)
print("\nTop 5 most similar sentences in corpus:")

for idx in top_results:
    score = cos_scores[idx]
    print(sentences[idx].strip(), "(Score: %.4f)" % score)





Query: This is a sample query

Top 5 most similar sentences in corpus:
This is an example sentence (Score: 0.7110)
These embeddings are then compared for semantic similarity (Score: 0.6175)
Each sentence is converted into a list of embeddings (Score: 0.5886)


In [26]:
# Approach 2: sparse TF-IDF vectors compared by cosine similarity
# pros: faster, simpler
# cons: less accurate

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Tiny demo corpus to search over.
documents = [
    'This is an example sentence',
    'Each sentence is converted into a list of embeddings',
    'These embeddings are then compared for semantic similarity',
]

# Learn the vocabulary/IDF weights from the corpus and vectorize it.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Vectorize the query with the already-fitted vectorizer so it shares the
# corpus vocabulary; note that `query` holds the vectorized form, not the text.
query = vectorizer.transform(['This is a sample query'])

# One query row against every document row, flattened to a 1-D score array.
cosine_similarities = cosine_similarity(query, X).ravel()

# Keep only the documents whose similarity reaches the 0.5 cut-off.
is_relevant = cosine_similarities >= 0.5
relevant_docs = [doc for doc, keep in zip(documents, is_relevant) if keep]

print(relevant_docs)

['This is an example sentence', 'Each sentence is converted into a list of embeddings']
