In [1]:
import json
import numpy as np
import pandas as pd
import torch

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertModel
from tqdm.auto import tqdm

### Document Loader

In [2]:
# Document Loader from the workshop
# response = requests.get("https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json")
# documents = response.json()
# document_df = pd.json_normalize(documents, record_path="documents", meta="course").rename(columns={"text": "answer"}).reindex(columns=["course", "section", "question", "answer"])

# Document Loader from etl.ipynb
with open("data/documents.json") as f:
    documents = json.load(f)

document_df = pd.json_normalize(documents, record_path="documents", meta="course").reindex(columns=["course", "section", "question", "answer"])

### Search

In [3]:
class SemanticSearch:
    def __init__(self, fields):
        self.fields = fields
        self.indexes = {}
        self.pipes = {}
    
    def index(self, document_df, vectorizer_params=None, embedder_params=None, embedder_name=None):
        if vectorizer_params is None: vectorizer_params = {}
        if embedder_params is None: embedder_params = {}

        for field in self.fields:
            vectorizer = TfidfVectorizer(**vectorizer_params)
            embedder = TruncatedSVD(**embedder_params) if embedder_name is None or embedder_name == "svd" else NMF(**embedder_params)
            pipe = Pipeline([("vectorizer", vectorizer), ("embedder", embedder)])
            D = pipe.fit_transform(document_df[field])

            self.indexes[field] = D
            self.pipes[field] = pipe
        
        self.indexes["document"] = document_df

    def search(self, query, k=5, weights=None, filters=None):
        if weights is None: weights = {"question": 3}

        if filters is None: filters = {}
        document_df = self.indexes["document"]
        filtered_document_df = document_df[(document_df[filters.keys()] == pd.Series(filters)).all(axis=1)]

        scores = np.zeros(len(filtered_document_df))

        for field in self.fields:
            w = weights.get(field, 1)
            q = self.pipes[field].transform([query])

            scores += w * cosine_similarity(q, self.indexes[field][filtered_document_df.index.values]).reshape(-1)

        idx = np.argsort(-scores)[:k]
        search_results = filtered_document_df.iloc[idx]

        return search_results

In [4]:
# Indexing: Embedding via SVD
# https://scikit-learn.org/stable/modules/decomposition.html#about-truncated-svd-and-latent-semantic-analysis-(lsa)
engine = SemanticSearch(["section", "question", "answer"])
engine.index(document_df, vectorizer_params={"stop_words": "english", "min_df": 3}, embedder_params={"n_components": 16, "random_state": 42})

In [5]:
# IR: Cosine Similarity
search_results = engine.search(
    query="I just signed up. Is it too late to join the course?",
    k=5,
    filters={"course": "data-engineering-zoomcamp"}
).to_dict(orient="records")

search_results

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start？',
  'answer': "The next cohort starts January 13th 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'answer': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I still joi

In [6]:
# Indexing: Embedding via NMF
# https://scikit-learn.org/stable/modules/decomposition.html#non-negative-matrix-factorization-nmf-or-nnmf
engine = SemanticSearch(["section", "question", "answer"])
engine.index(document_df, vectorizer_params={"stop_words": "english", "min_df": 3}, embedder_params={"n_components": 16, "random_state": 42, "max_iter": 1000}, embedder_name="nmf")

In [7]:
# IR: Cosine Similarity
search_results = engine.search(
    query="I just signed up. Is it too late to join the course?",
    k=5,
    filters={"course": "data-engineering-zoomcamp"}
).to_dict(orient="records")

search_results

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'answer': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start？',
  'answer': "The next cohort starts January 13th 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoint the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'answer': 'Yes, we will keep all the 

In [8]:
# Indexing: Embedding via context-based representation using BERT
# https://huggingface.co/google-bert/bert-base-uncased

In [9]:
def compute_embeddings(texts, batch_size=8):
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")
    model.eval()  # Set the model to evaluation mode if not training
    
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]

    embeddings = []
    for batch in tqdm(batches):
        encoded_inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

        with torch.no_grad(): # Disable gradient calculation for inference
            outputs = model(**encoded_inputs)
            hidden_states = outputs.last_hidden_state
        
            batch_embeddings = hidden_states.mean(dim=1).cpu().numpy() # Mean Pooling: Aggregate the embeddings of all input tokens as sentence embedding representation
            embeddings.append(batch_embeddings)

    return np.concatenate(embeddings)

In [10]:
embeddings = {field: compute_embeddings(document_df[field].tolist()) for field in ["section", "question", "answer"]}

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

  0%|          | 0/136 [00:00<?, ?it/s]

In [11]:
embeddings

{'section': array([[ 0.37748626, -0.1682663 , -0.717946  , ...,  0.32759324,
         -0.12342941,  0.18710016],
        [ 0.37748626, -0.1682663 , -0.717946  , ...,  0.32759324,
         -0.12342941,  0.18710016],
        [ 0.37748626, -0.1682663 , -0.717946  , ...,  0.32759324,
         -0.12342941,  0.18710016],
        ...,
        [ 0.05408813, -0.03563844, -0.26071563, ...,  0.07740071,
         -0.12605353,  0.06296202],
        [ 0.05408813, -0.03563844, -0.26071563, ...,  0.07740071,
         -0.12605353,  0.06296202],
        [ 0.25193617,  0.06506072, -0.14671353, ..., -0.14556682,
         -0.06536502,  0.08461889]], shape=(1086, 768), dtype=float32),
 'question': array([[-0.07698561, -0.43341422,  0.44869146, ..., -0.07128868,
         -0.10150252,  0.05264673],
        [ 0.2285549 ,  0.04207362,  0.20274155, ..., -0.0888651 ,
          0.00049368,  0.00081991],
        [ 0.03474644, -0.27245435,  0.22815742, ...,  0.05961451,
         -0.12863296,  0.1585731 ],
        ..

In [12]:
embeddings["answer"].shape

(1086, 768)

In real world, the most applicable approach to do information retrieval is to combine both keyword search and semantic search together so called *Hybrid Search*. The retrieved results are calculated based on weighted scores on both systems.

Moreover, in practice there are a massive set of documents paired with just a query that we need to compute for similarity scores. We can not do efficient real-time querying despite preparing documents before hand or using good algorithm when retrieving. Therefore, we need to optimize each step so that we can build an effective IR system.

In IR, a task is simply that:
> Given a query q, find a set of relevant documents D.

There are two main steps involved in the system. The first one is **ETL**, which is a process to do data ingestion to build 'external' knowledge (i.e., database). This process is analogous to data engineering process. The most common techniques in this stage for key word search is *inverted index* which is a map of word associated with its posting lists. A posting list can contain various information such as DocID, position in the document and the occurrence of the word. The data structure allows fast retrieval of postings given terms in the query.

Next step is **Searching**. The main goal is to calculate a similarity score for each document-query pair via an algorithm like k-nearest neighbors. This is very inefficient to compute when we do not know the query in advance.

One way to mitigate this is to pre-compute its k-nearest neighbors documents for each term before hand. However, it means we have to store additional data in the database and waste our storage for duplicate data. The more common technique is to use *approximate nearest neighbors (ANN)* specifically for IR, *Locality-Sensitive Hashing (LSH) via random projections*, which is just a hash function for retrieving similar items that in the same bucket.

Note that those are algorithms and data structures implemented in the services that we can apply in our application. We do not have to build it from scratch. For example, in key word search, we can use *Elastic Search* to implement production-ready search system. Also, we have a lot of on-the-shelf vector databases like *Pinecone* or *FAISS* for implementing semantic search or hybrid search.