In [1]:
import pandas as pd
import numpy as np
import requests

In [2]:
# Finding the frequently asked questions and answers on github
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Request these document to fetch them into the code
docs_response = requests.get(docs_url)
# Create the json of the request response
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Examine the final json
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# Create a pandas dataframe based on this JSON
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
# See the first 5 rows of the dataframe
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [4]:
# Limit the questions and answers to a specific course
#df[df.course == 'data-engineering-zoomcamp'].head()

In [5]:
# Vectorize the documents that we want to search

In [6]:
# Use Bag of words to vectorize the text
from sklearn.feature_extraction.text import CountVectorizer

# Initializw the vectorizer model
cv = CountVectorizer(stop_words='english', min_df = 5)
# Fit the vectorizer with the data to convert them to vectors
X = cv.fit_transform(df['text'])
# Take the words appearing in the text as column names
names = cv.get_feature_names_out()

# Create the dataframe from this text
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Vectorizing the text using TD-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df = 5)
X = cv.fit_transform(df['text'])

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


# Try to find an answer, searching only in the answer body

For that we will try to find query to document similarities by:
- Vectorizing the query using the vectorizer / dictionary of words that we fit with our documents
- Find the dot product of the vector/query and all the documents to find a similarity score of each document with the query -> cosine similarity
- Then we short the documents based on their scores and retrieve the more relevant ones to our query

In [8]:
# User query
query = "Do I need to know python to sign up for the January course?"

# Vectorize the query using the fitted vectorizer
q = cv.transform([query]) # need to be a list
# See the array of the query's vector reprensentation
print(q.toarray())
# See the words that has weight in the query's vector
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

doc_dict = dict(zip(names, X.toarray()[1]))
#doc_dict

[[0. 0. 0. ... 0. 0. 0.]]


In [9]:
# Calculate the similarity between the query and all documents using the dot product
similarity_scores = X.dot(q.T).toarray() # need to transpose the query before using the dot product
# Then we can sort these similarities and find the indexes of the most relevant documents
indxs = np.argsort(similarity_scores.flatten())[-5:]
print(indxs)

[764  27 806 577 445]


In [10]:
# See the most relevant answer
df['text'].iloc[764]

'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu'

### We can calculate the scores using directly the cosine_similarity from Sklearn

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate the scores using cosine similarity
similarity_scores = cosine_similarity(X,q).flatten()
# Sort them by similarity score and find the indexes 
indxs = np.argsort(similarity_scores)[-5:]
print(indxs)

[764  27 806 577 445]


### Find the similarities scores for all the fields in the questions

In [12]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

# For each field in the dataframe convert it into vectors
for field in fields:
    # Fit the vectorizer 
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    # Vectorize the text
    X = cv.fit_transform(df[field])

    # Created a dictionary of all the fitted vectorizers
    transformers[field] = cv
    # Create a dictionary with all the matrices of vectors
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<948x2118 sparse matrix of type '<class 'numpy.float64'>'
	with 26463 stored elements in Compressed Sparse Row format>

In [13]:
# Vectorize a query and search for all the fields

# Initialize the array of scores with zeros
score = np.zeros(len(df))

query = 'I just discovered the course, is it too late to join?'

# Set up the boosting weights of specific fields
boost = {'question': 3.0}

# Search for all the fields and aggregate the scores
for f in fields:
    # Create the query vector using the previous transformers
    q = transformers[f].transform([query])
    # Fetch the matrices
    X = matrices[f]
    # Calculate the cosine similarity for the field
    f_score = cosine_similarity(X,q).flatten()
    # Calculate the boost
    b = boost.get(f, 1.0) # in case that we didn't specify this field get weight as 1
    # Append this into the score
    score = score + b * f_score

# Find the indexes of the most relevant documents
indxs = np.argsort(score)[-10:]
# display the relevant answers
df.iloc[indxs]

Unnamed: 0,course,section,question,text
452,machine-learning-zoomcamp,General course-related questions,I just joined. What should I do next? How can ...,Welcome to the course! Go to the course page (...
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...


In [14]:
# You can create a filter to narrow down the retrieving documents
filter = {
    'course': 'data-engineering-zoomcamp'
}
# Create the mask 
for field, value in filter.items():
    mask = (df[field] == value).astype(int).values
    # Adjust the scores based on the mask
    filtered_scores = score* mask

In [15]:
# Find the indexes of the most relevant documents
indxs = np.argsort(-filtered_scores)[:10]  # I am sorting the negative values to short as DESC
# display the relevant answers
df.iloc[indxs]

Unnamed: 0,course,section,question,text
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."
9,data-engineering-zoomcamp,General course-related questions,Course - Which playlist on YouTube should I re...,All the main videos are stored in the Main “DA...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
8,data-engineering-zoomcamp,General course-related questions,Course - Can I get support if I take the cours...,"Yes, the slack channel remains open and you ca..."
10,data-engineering-zoomcamp,General course-related questions,Course - ​​How many hours per week am I expect...,It depends on your background and previous exp...


In [16]:
# Create a function for the search
class TextSearch:
    
    # Initialize the parameters to store the vector matrices and vectorizers
    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}
    
    # Vectorize all the fields of the dataframe
    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    # Perform the search with boosting and filtering to return top 10 results
    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [17]:
# Using the text function
# Initialize a search class instance
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
# Fit the document to the search 
index.fit(documents)

# Perform the search
index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

# Vector Search: Try to find the right answer by creating embeddings of the vector text without the word order

The idea is to reduce the dimension of the Matrix of vectors from the number of words in the dictionary from the vectorizer to the specific number of components specified by us, keeping all the relevant information. To do that we will use the following:
- Singular Value Decomposition (SVD): This will lossy compress the dimensions of the vector-matrix returning positive and negative values for each component
- Non - Negative Matrix Factorization (NMF): This will compress the dimensions of the vector-matrix returning positive and zero values for each component

Disclaimer: This will not take into account the word order

In [20]:
# Using the Singular Value Decomposition
from sklearn.decomposition import TruncatedSVD

# Retrieve the Matrix of Vectors 
X = matrices['text']
# Retrieve the Vectorizer
cv = transformers['text']

# Initialize the decomposition model and specify the number of components
svd = TruncatedSVD(n_components = 16)
# Create the embedding from the matrix of vectors
X_emb = svd.fit_transform(X)

# Take the same query again
query = 'I just singned up. Is it too late to join the course?'

# Create a vector of the query using the above vectorizer
Q = cv.transform([query])
# Create the embedding of the query
Q_emb = svd.transform(Q)

# Find the cosine similarity scores for all the documents and the query
scores = cosine_similarity(X_emb, Q_emb).flatten()

# Find the most relevant documents
idx = np.argsort(-scores)[:5]
df.loc[idx]

Unnamed: 0,course,section,question,text
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...


In [21]:
# Using the Non-Negative Matrix Factorization
from sklearn.decomposition import NMF

# Retrieve the Matrix of Vectors 
X = matrices['text']
# Retrieve the Vectorizer
cv = transformers['text']

# Initialize the decomposition model and specify the number of components
nmf = NMF(n_components = 16)
# Create the embedding from the matrix of vectors
X_emb = nmf.fit_transform(X)

# Take the same query again
query = 'I just singned up. Is it too late to join the course?'

# Create a vector of the query using the above vectorizer
Q = cv.transform([query])
# Create the embedding of the query
Q_emb = nmf.transform(Q)

# Find the cosine similarity scores for all the documents and the query
scores = cosine_similarity(X_emb, Q_emb).flatten()

# Find the most relevant documents
idx = np.argsort(-scores)[:5]
df.loc[idx]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
