In [1]:
import json

In [2]:
with open("./documents.json", "rt") as f_in:
    documents_file = json.load(f_in)

In [3]:
documents = []

for course in documents_file:
    course_name = course["course"]

    for doc in course["documents"]:
        doc["course"] = course_name
        documents.append(doc)


In [4]:
import pandas as pd

df = pd.DataFrame(documents, columns=["course", "section", "question", "text"])
df.head()


Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When does the course start?,Data Engineering Zoomcamp FAQ\nData Engineerin...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,See DE zoomcamp 2025 pre-course Q&A\nTo get th...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,Start by installing and setting up all the dep...


In [8]:
df[df.course == "data-engineering-zoomcamp"].head()
documents[2]


{'text': "Yes, even if you don't register, you're still eligible to submit the homework.\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words="english", min_df=5)
cv.fit(df.text)


In [19]:
cv.get_feature_names_out().shape


(1631,)

In [25]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course",
]
cv = CountVectorizer(stop_words="english")
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs





Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words="english")
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

# We represent the query in the same vector space - i.e. using the same vectorizer
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [39]:
# We can see the words of the query and the words of some document:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.6279137616509933),
 'date': np.float64(0.0),
 'github': np.float64(0.0),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.0),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.7782829228046183),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [40]:
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.5773502691896258),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.5773502691896258),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.5773502691896258),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [43]:
# The more words in common - the better the matching score. Let's calculate it
df_qd = pd.DataFrame([query_dict, doc_dict], index=["query", "doc"]).T

(df_qd["query"] * df_qd["doc"]).sum()

# This is a dot-product. So we can use matrix multiplication to compute the score:
X.dot(q.T).toarray()
# https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/01-intro/08-linear-algebra.md

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [44]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X, q)

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [48]:
# Vectorizing all the documents
fields = ["section", "question", "text"]
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words="english", min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers["text"].get_feature_names_out()
matrices["text"]

# Search
query = "I just signed up. Is it too late to join the course?"

q = transformers["text"].transform([query])
score = cosine_similarity(matrices["text"], q).flatten()
score


array([0.12217419, 0.10007046, 0.        , ..., 0.        , 0.        ,
       0.        ], shape=(1217,))

In [47]:
mask = (df.course == "data-engineering-zoomcamp").values
score = score * mask
score


array([0.12217419, 0.10007046, 0.        , ..., 0.        , 0.        ,
       0.        ], shape=(1217,))

In [49]:
import numpy as np

idx = np.argsort(-score)[:10]
df.iloc[idx].text

14     No, as long as you do the peer-reviewed capsto...
461    Yes, you can. Even though you missed the start...
453    The process is automated now, so you should re...
24     It's up to you which platform and environment ...
460    Here’s how you join in Slack: https://slack.co...
881    Q: When can I expect to receive the confirmati...
17     No, late submissions are not allowed. But if t...
896    To clarify on "Late homework submissions": we ...
824    If you have submitted two projects (and peer-r...
858    Depends on whether the form will still be open...
Name: text, dtype: object

In [50]:
# Search with all the fields & boosting + filtering
boost = {"question": 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

# And add filters (in this case, only one):
filters = {"course": "data-engineering-zoomcamp"}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

# Getting the results:
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient="records")

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'text': "Yes, even if you don't register, you're still eligible to submit the homework.\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute."},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Readme on GitHub'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'te

In [51]:
# Putting it all together
class TextSearch:
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient="records")


In [None]:
index = TextSearch(text_fields=["section", "question", "text"])
index.fit(documents)

index.search(
    query="I just signed up. Is it too late to join the course?",
    n_results=5,
    boost={"question": 3.0},
    filters={"course": "data-engineering-zoomcamp"},
)

[{'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
  'section': '10. Kubernetes and TensorFlow Serving',
  'question': 'Kubernetes-dashboard',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'In Kubernetes resource specifications, such as CPU requests and limits, the "m" stands for milliCPU, which is a unit of computing power. It represents one thousandth of a CPU core.\ncpu: "100m" means the container is requesting 100 milliCPUs, which is equivalent to 0.1 CPU core.\ncpu: "500m" means the container has a CPU limit of 500 milliCPUs, which is equivalent to 0.5 CPU core.\nThese values are specified in milliCPUs to allow fine-grained control over CPU resources. It allows you to express CPU requirements and limits in a more granular way, especially in scenarios where your application might not need a full CPU core.\nAdded by Andrii Larkin',
  'section': '10. Kubernetes and TensorFlow Serving',
  'question': 'Why cpu vals for Kubernetes deployment.yaml look like “100m” and “5

In [54]:
# Embeddings and Vector Search
# Problem with text - only exact matches. How about synonyms?

# What are Embeddings?
# Conversion to Numbers: Embeddings transform different words, sentences and documents into dense vectors (arrays with numbers).
# Capturing Similarity: They ensure similar items have similar numerical vectors, illustrating their closeness in terms of characteristics.
# Dimensionality Reduction: Embeddings reduce complex characteristics into vectors.
# Use in Machine Learning: These numerical vectors are used in machine learning models for tasks such as recommendations, text analysis, and pattern recognition.

# SVD
# Singular Value Decomposition is the simplest way to turn Bag-of-Words representation into embeddings

# This way we still don't preserve the word order (because it wasn't in the Bag-of-Words representation) but we reduce dimensionality and capture synonyms.

# We won't go into mathematics, it's sufficient to know that SVD "compresses" our input vectors in such a way that as much as possible of the original information is retained.

# This compression is lossy compression - meaning that we won't be able to restore the 100% of the original vector, but the result is close enough.
from sklearn.decomposition import TruncatedSVD

X = matrices["text"]
cv = transformers["text"]

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.15899054, -0.13055678, -0.14711878, -0.00738405,  0.0428926 ,
       -0.07430867,  0.2383229 , -0.02376318,  0.02161945,  0.11931707,
        0.00448728, -0.01192337, -0.10081416,  0.10159348, -0.13165635,
        0.04112152])

In [55]:
query = "I just signed up. Is it too late to join the course?"

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]


array([ 0.04637539, -0.04207082, -0.06094551,  0.03049344,  0.02280079,
       -0.09000214,  0.15935008, -0.03017362,  0.00779424,  0.07875426,
       -0.02318851, -0.03753381, -0.00827278,  0.05543424,  0.02244351,
        0.02215487])

In [56]:
np.dot(X_emb[0], Q_emb[0])

np.float64(0.08229735462618742)

In [57]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)


['No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 "Yes, even if you don't register, you're still eligible to submit the homework.\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.",
 "Yes, even if you don't register, you're still eligible to submit the homeworks as long as the form is still open and accepting submissions.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything to the last minute.",
 'The display name listed on the leaderboard is an auto-generated randomized name. You can edit it to be a nickname, or your real name, if you prefer. Your entry on the Leaderboard is the one highlighted in teal(?) / light green (?).\nThe Certificate name should be your actual name that you want to appear on your certificate after completing the course.\nThe "D

In [58]:
# Non-Negative Matrix Factorization

# SVD creates values with negative numbers. It's difficult to interpet them.

# NMF (Non-Negative Matrix Factorization) is a similar concept, except for non-negative input matrices it produces non-negative results.

# We can interpret each of the columns (features) of the embeddings as different topic/concepts and to what extent this document is about this concept.

# Let's use it for the documents:
from sklearn.decomposition import NMF

nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

["No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'No, as long as you do the peer-reviewed capstone projects in time then you can get the certificate. You do not need to do the homeworks if you join late for example.',
 'Yes, if you finish at least 2 out of 3 projects and review 3 peers’ Projects by the deadline, you will get a certificate. This is what it looks like: k.lin',
 "No, you can only get a certificate if you finish the course with a “live” cohort.\nWe don't award certificates for the self-paced mode. The reason is you need to peer-review 3 capstone(s) after submitting your project.\nYou can only peer-review projects at the time the course is running; after the form is closed and the peer-review list is compiled.",
 'After the submission deadl

In [59]:
# BERT
# The problem with the previous two approaches is that they don't take into account the word order. They just treat all the words separately (that's why it's called "Bag-of-Words")

# BERT and other transformer models don't have this problem.

# Let's create embeddings with BERT. We will use the Hugging Face library for that
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

# We need:

# tokenizer - for turning text into vectors
# model - for compressing the text into embeddings
# First, we tokenize the text
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes",
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Then we compute the embeddings:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

# Now we need to compress the embeddings:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

# And convert them to a numpy array
X_emb = sentence_embeddings.numpy()

def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i : i + n]
        result.append(batch)
    return result



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [60]:
from tqdm.auto import tqdm

texts = df["text"].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state

        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)


  0%|          | 0/153 [00:00<?, ?it/s]

In [61]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(
            batch, padding=True, truncation=True, return_tensors="pt"
        )

        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings


In [62]:
X_text = compute_embeddings(df["text"].tolist())


  0%|          | 0/153 [00:00<?, ?it/s]