# **Goal: Create a code explanation for each cell as text below it.**

**Creating a hybrid search system using:**
* Embeddings for semantic search (sentence_transformers)
* BM25 for keyword ranking (sparse retrieval)
* FAISS as an index.









In [1]:
!pip install sentence-transformers



In [2]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [4]:
import sentence_transformers

In [5]:
import re

import faiss
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer

In [6]:
# Toy corpus for the retrieval demo: six AI-related sentences followed by six
# sentences on unrelated topics that act as distractors.
documents = [
    # --- AI-related ---
    "Artificial Intelligence is changing the world.",
    "Machine Learning is a subset of AI.",
    "Deep Learning is a subset of Machine Learning.",
    "Natural Language Processing involves understanding text.",
    "Computer Vision allows machines to see and understand.",
    "AI includes areas like NLP and Computer Vision.",
    # --- unrelated topics ---
    "The Pyramids of Giza are architectural marvels.",
    "Mozart was a prolific composer during the classical era.",
    "Mount Everest is the tallest mountain on Earth.",
    "The Nile is one of the world's longest rivers.",
    "Van Gogh's Starry Night is a popular piece of art.",
    "Basketball is a sport played with a round ball and two teams.",
]

In [7]:
# Free-text query; later cells score it with BM25 and embed it with the sentence model.
query = "Tell me about AI in text and vision."

In [8]:
tokenized_corpus = [doc.split(" ") for doc in documents]

In [9]:
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [11]:
document_embeddings = model.encode(documents)

In [12]:
index = faiss.IndexFlatL2(document_embeddings.shape[1])

In [13]:
index.add(np.array(document_embeddings).astype('float32'))


In [14]:
top_n =10

In [15]:
bm25_scores = bm25.get_scores(query.split(" "))

In [16]:
# np.argsort orders indices by ascending score, so the last top_n entries are
# the top_n highest-scoring documents (listed from lowest to highest score).
top_docs_indices = np.argsort(bm25_scores)[-top_n:]

In [17]:
# Gather the embeddings of the BM25 candidates, in the same order as top_docs_indices.
top_docs_embeddings = [document_embeddings[i] for i in top_docs_indices]

In [18]:
# Embed the query with the same model used for the documents. Passing a
# one-element list presumably yields a single-row 2-D result — TODO confirm
# against the sentence-transformers documentation.
query_embedding = model.encode([query])

In [19]:
# Second L2 index that will hold only the BM25 candidates; its dimensionality
# is read from the first candidate embedding.
sub_index = faiss.IndexFlatL2(top_docs_embeddings[0].shape[0])

In [20]:
# Vectors are added in candidate order, so position i in sub_index
# corresponds to top_docs_indices[i]; later cells rely on that mapping.
sub_index.add(np.array(top_docs_embeddings).astype('float32'))

In [24]:
# Re-rank the BM25 candidates by L2 distance to the query embedding.
# k is capped at the number of vectors in sub_index: when k exceeds it, FAISS
# pads the result with -1, which the later lookup top_docs_indices[i] would
# silently read as "last candidate" instead of failing.
_, sub_dense_ranked_indices = sub_index.search(
    np.array(query_embedding).astype('float32'),
    min(top_n, sub_index.ntotal),
)

In [25]:
# Inspect the ranking: values are positions within the candidate list
# (top_docs_indices), not positions in `documents`.
sub_dense_ranked_indices


array([[9, 8, 1, 0, 6, 7, 2, 4, 3, 5]])

In [26]:
# Translate positions within the candidate sub-index back to positions in `documents`.
final_ranked_indices = [top_docs_indices[i] for i in sub_dense_ranked_indices[0]]

In [27]:
# Look up the document text for each ranked index, preserving the ranked order.
ranked_docs = [documents[i] for i in final_ranked_indices]

In [28]:
# Display the final ranking: BM25 candidates re-ordered by the dense search.
ranked_docs

['AI includes areas like NLP and Computer Vision.',
 'Computer Vision allows machines to see and understand.',
 'Natural Language Processing involves understanding text.',
 'Deep Learning is a subset of Machine Learning.',
 "Van Gogh's Starry Night is a popular piece of art.",
 'Basketball is a sport played with a round ball and two teams.',
 'Mozart was a prolific composer during the classical era.',
 "The Nile is one of the world's longest rivers.",
 'The Pyramids of Giza are architectural marvels.',
 'Mount Everest is the tallest mountain on Earth.']

# Provide a brief description of the process this code implements.