In [1]:
import json
import numpy as np
import pandas as pd
import requests

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Document Loader

In [2]:
# Document Loader from the workshop
# response = requests.get("https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json")
# documents = response.json()
# document_df = pd.json_normalize(documents, record_path="documents", meta="course").rename(columns={"text": "answer"}).reindex(columns=["course", "section", "question", "answer"])

# Document Loader from etl.ipynb
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding (JSON files are expected to be UTF-8).
with open("data/documents.json", encoding="utf-8") as f:
    documents = json.load(f)

# Flatten the nested "documents" records into one row per record, carrying the
# parent "course" value along, then keep only the four columns used below
# (reindex fills any column that is missing from the data with NaN).
document_df = pd.json_normalize(
    documents,
    record_path="documents",
    meta="course",
).reindex(columns=["course", "section", "question", "answer"])

### Search

In [3]:
class KeywordSearch:
    """Keyword search over a DataFrame using one TF-IDF matrix per text field.

    Call `index` once to fit the vectorizers, then `search` to rank rows by a
    weighted sum of per-field cosine similarities to the query.
    """

    def __init__(self, fields):
        """Create an empty engine.

        Args:
            fields: names of the DataFrame columns to vectorize and score.
        """
        self.fields = fields
        self.indexes = {}
        self.vectorizers = {}

    def index(self, document_df, vectorizer_params=None):
        """Fit a `TfidfVectorizer` on every configured field.

        Args:
            document_df: DataFrame holding one column per entry in `self.fields`.
            vectorizer_params: optional keyword arguments forwarded to every
                `TfidfVectorizer`; defaults to no extra arguments.

        The fitted matrices are stored in `self.indexes[field]`, the fitted
        vectorizers in `self.vectorizers[field]`, and the DataFrame itself
        under the reserved key `self.indexes["document"]`.
        """
        if vectorizer_params is None:
            vectorizer_params = {}

        for field in self.fields:
            vectorizer = TfidfVectorizer(**vectorizer_params)
            self.indexes[field] = vectorizer.fit_transform(document_df[field])
            self.vectorizers[field] = vectorizer

        self.indexes["document"] = document_df

    def search(self, query, k=5, weights=None, filters=None):
        """Return the `k` best-matching rows of the indexed DataFrame.

        Args:
            query: query text, transformed with each field's fitted vectorizer.
            k: maximum number of rows to return.
            weights: mapping of field name to score multiplier; fields that are
                absent get weight 1. Defaults to `{"question": 3}`.
            filters: mapping of column name to required value; only rows equal
                to every value are scored. Defaults to no filtering.

        Returns:
            A DataFrame with at most `k` rows, ordered by descending score. It
            is empty when no row passes the filters.
        """
        if weights is None:
            weights = {"question": 3}
        if filters is None:
            filters = {}

        document_df = self.indexes["document"]

        # Build the filter as a positional boolean mask. The TF-IDF matrices are
        # addressed by row position, so index *labels* must not be used to
        # slice them: they only coincide with positions for a default 0..n-1 index.
        keep = np.ones(len(document_df), dtype=bool)
        for column, required_value in filters.items():
            matches = (document_df[column] == required_value).fillna(False)
            keep &= np.asarray(matches, dtype=bool)
        positions = np.flatnonzero(keep)
        filtered_document_df = document_df.iloc[positions]

        # Nothing to rank; also avoids passing a zero-row matrix to
        # cosine_similarity, which scikit-learn's input validation rejects.
        if len(positions) == 0:
            return filtered_document_df

        scores = np.zeros(len(positions))
        for field in self.fields:
            w = weights.get(field, 1)
            q = self.vectorizers[field].transform([query])
            field_matrix = self.indexes[field][positions]
            scores += w * cosine_similarity(q, field_matrix).reshape(-1)

        # A stable sort keeps tied scores in document order, so results are
        # reproducible from run to run.
        idx = np.argsort(-scores, kind="stable")[:k]
        return filtered_document_df.iloc[idx]

Note that other text representations can be used instead of TF-IDF, such as Bag-of-Words, which *scikit-learn* provides as the `CountVectorizer` class.

In [4]:
# Indexing: fit one TF-IDF representation per searchable text field
engine = KeywordSearch(["section", "question", "answer"])
engine.index(
    document_df,
    vectorizer_params=dict(stop_words="english", min_df=3),
)

In [5]:
# Retrieval: rank the indexed documents by cosine similarity to the query,
# restricted to a single course, and convert the hits to a list of dicts
search_results = engine.search(
    "I just signed up. Is it too late to join the course?",
    k=5,
    filters=dict(course="data-engineering-zoomcamp"),
).to_dict(orient="records")

search_results

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'answer': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'answer': 'GitHub - See DE-zoomcamp  prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'answer': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see