# RAG - Search


- [Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)


Install packages


In [8]:
!uv pip install -q \
    pandas==2.3.2 \
    pandas-stubs==2.3.2.250827 \
    numpy==2.3.2 \
    matplotlib==3.10.6 \
    scikit-learn==1.7.1 \
    requests==2.32.5

Import packages


In [None]:
import numpy as np
import pandas as pd
import requests
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Get documents


In [None]:
docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of records, copying the course
# name onto every document so it can be used as a filter later. New dicts are
# built so the raw payload is left untouched and the cell can be re-run safely.
documents = [
    {**doc, "course": course["course"]}
    for course in documents_raw
    for doc in course["documents"]
]

documents[0]

Create a DataFrame


In [None]:
# One row per document; the column list fixes both the selection and the order.
df = pd.DataFrame(documents, columns=["course", "section", "question", "text"])
df.head()

Vector spaces

- Turn docs into vectors
- Document-term matrix:
  - rows: documents
  - columns: words/tokens
- Bag of words: counts occurrences of each word in each document

| Document | word1 | word2 | word3 |
| -------- | ----- | ----- | ----- |
| Doc1     | 1     | 0     | 2     |
| Doc2     | 0     | 3     | 1     |


In [None]:
# Bag-of-words counts over the answer text. English stop words are dropped and
# min_df=5 discards tokens that appear in fewer than five documents.
cv = CountVectorizer(min_df=5, stop_words="english")  # Min Document Frequency
X = cv.fit_transform(df.text)
# Token that each column of X stands for, in column order.
column_names = cv.get_feature_names_out()
column_names

Number of distinct tokens


In [None]:
# Shape of the feature-name array, i.e. how many distinct tokens the vectorizer kept.
cv.get_feature_names_out().shape

Tokens by occurrences


In [None]:
# Densify the matrix for display only: one row per document, one column per token.
df_docs = pd.DataFrame(X.toarray(), columns=column_names)
df_docs.head()

Tokens by importance


In [None]:
# Same tokenisation settings as the count model, but weighted with TF-IDF.
# X, column_names and df_docs are rebound to the TF-IDF versions from here on.
tv = TfidfVectorizer(min_df=5, stop_words="english")  # Min Document Frequency
X = tv.fit_transform(df.text)
column_names = tv.get_feature_names_out()
# (A bare `column_names` in the middle of a cell displays nothing, so it was dropped;
# only the final expression of a cell is rendered.)
df_docs = pd.DataFrame(X.toarray(), columns=column_names)
df_docs.head()

## Text Search


In [None]:
query = "I just discovered the course, is it too late to join?"

# X currently holds the TF-IDF matrix produced by `tv`, so the query must be
# encoded by that same fitted vectorizer. Using the CountVectorizer `cv` here
# would compare raw counts against TF-IDF weights.
q = tv.transform([query])
q.toarray()

Relevant documents


In [None]:
# Cosine similarity between every document row of X and the query vector,
# flattened to one score per document.
score = cosine_similarity(X, q).flatten()
# argsort is ascending, so the last five positions are the five best matches
# (the best one comes last).
np.argsort(score)[-5:]

Document example


In [None]:
# Answer text of the first row of the frame.
df.iloc[0].text

Creating matrices


In [None]:
fields = ["section", "question", "text"]

# Fit a separate TF-IDF model per text field, keyed by field name.
# Comprehensions keep the per-field temporaries local, so the notebook-level
# `cv` and `X` defined by earlier cells are not silently replaced by the
# objects belonging to the last field in the list.
vectorizers = {
    field: TfidfVectorizer(stop_words="english", min_df=5) for field in fields
}
matrices = {
    field: vectorizers[field].fit_transform(df[field]) for field in fields
}

Scoring


In [None]:
# Per-field weights; fields that are not listed count with weight 1.0.
boosts = {"question": 3}
n = len(df)
score = np.zeros(n)

# Add up the weighted cosine similarity of the query against each field.
for f in fields:
    X = matrices[f]
    q = vectorizers[f].transform([query])
    boost = boosts.get(f, 1.0)

    f_score = cosine_similarity(X, q).ravel()
    score += boost * f_score

Filtering


In [None]:
filters = {"course": "data-engineering-zoomcamp"}

# Zero the score of every row whose column value differs from the requested one.
for field, value in filters.items():
    mask = df[field].eq(value).to_numpy()
    score = mask * score

Getting most relevant documents


In [None]:
# Row positions of the five highest scores, best first.
idx = np.argsort(score)[::-1][:5]
df.iloc[idx]

Putting all together


In [None]:
class TextSearch:
    """TF-IDF search index over several text fields of a collection of records.

    Each text field gets its own fitted ``TfidfVectorizer``. A query is scored
    against every field with cosine similarity, and the per-field scores are
    combined as a weighted sum.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Args:
            text_fields: Names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index ``records`` by fitting one TF-IDF model per text field.

        Args:
            records: Data accepted by ``pd.DataFrame``, e.g. a list of dicts.
                It must provide every field named in ``text_fields``.
            vectorizer_params: Optional keyword arguments forwarded to each
                ``TfidfVectorizer``. ``None`` means library defaults.
        """
        # None instead of a shared `{}` default, which would be one dict
        # object reused across every call.
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            vectorizer = TfidfVectorizer(**params)
            self.matrices[f] = vectorizer.fit_transform(self.df[f])
            self.vectorizers[f] = vectorizer

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for ``query``, best first.

        Args:
            query: Free-text query string.
            n_results: Maximum number of records to return.
            boost: Optional ``{field: weight}`` mapping; fields that are not
                listed are weighted 1.0.
            filters: Optional ``{column: value}`` mapping; a record is kept
                only if it equals the value in every listed column.

        Returns:
            A list of record dicts ordered by descending score. Records that
            fail a filter are never returned, so the list can be shorter than
            ``n_results``.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + boost.get(f, 1.0) * s

        # Track filter matches separately from the score. Merely zeroing the
        # score leaves non-matching rows tied with zero-score matches, so they
        # leak into the output whenever n_results exceeds the number of matches.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= np.asarray(self.df[field] == value, dtype=bool)

        # Rank everything, then drop filtered-out rows while preserving rank order.
        idx = np.argsort(-score)
        idx = idx[keep[idx]][:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient="records")

Searching


In [None]:
# Build the index over the three text fields of the flattened documents.
index = TextSearch(text_fields=["section", "question", "text"])
index.fit(documents)

# Ask for five results, weighting matches in the question field 3x and
# applying the course filter.
index.search(
    query="I just signed up. Is it too late to join the course?",
    n_results=5,
    boost={"question": 3.0},
    filters={"course": "data-engineering-zoomcamp"},
)

## Vector Search


In [None]:
# Work on the TF-IDF matrix of the answer text and its fitted vectorizer.
X = matrices["text"]
cv = vectorizers["text"]

# TruncatedSVD uses a randomized solver by default; pinning the seed keeps the
# embeddings (and therefore the ranking below) identical across re-runs.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

X_emb.shape

Dense representation of documents (embeddings)


In [None]:
# Reduced vector of the first document.
X_emb[0]

In [None]:
query = "I just signed up. Is it too late to join the course?"

# Encode the query with the text vectorizer, then project it with the fitted
# SVD so it lives in the same space as the document embeddings.
Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

In [None]:
# Inner product of the first document embedding with the query embedding.
X_emb[0] @ Q_emb[0]

In [None]:
# Rank all documents by cosine similarity to the query in embedding space.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions from argsort, so select by position (iloc) rather
# than by index label (loc); the two agree only for a default RangeIndex.
df.iloc[idx]

In [None]:
# Non-negative factorisation of the same TF-IDF matrix. The seed is pinned so
# the factorisation does not depend on the global random state between runs.
# From this cell on, X_emb holds the NMF embeddings instead of the SVD ones.
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

In [None]:
# Encode the query with the text vectorizer, then map it through the fitted NMF
# model so it is comparable with the document embeddings.
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

In [None]:
# Rank all documents by cosine similarity to the query in embedding space.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions from argsort, so select by position (iloc) rather
# than by index label (loc); the two agree only for a default RangeIndex.
df.iloc[idx]