In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import pandas as pd

df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [None]:
Vector Spaces
- turn the docs into vectors
- term-docments matrix:
    - rows: documents
    - columns: words/tokens
- bag of words:
    - word order is lost
    - sparse matrix

In [4]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

CountVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


TfidfVectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_cv = TfidfVectorizer(stop_words = 'english')
X = tf_cv.fit_transform(docs_example)

names = tf_cv.get_feature_names_out()

df_docs_tf = pd.DataFrame(X.toarray(), columns = names).T
df_docs_tf

Unnamed: 0,0,1,2,3,4
15th,0.463693,0.0,0.0,0.0,0.0
2024,0.463693,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.463693
course,0.374105,0.0,0.0,0.0,0.374105
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.57735,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.463693
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.463693,0.0,0.0,0.0,0.0
listed,0.0,0.57735,0.0,0.0,0.0


In [7]:
df_docs_tf.shape

(19, 5)

Query-Document Similarity

In [8]:
query = "Do I need to know python to sign up for the January course?"

q = tf_cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [9]:
q.shape

(1, 19)

In [10]:
#  importance of the words when using TfidfVectorizer on test data
query_dict = dict(zip(names, q.toarray()[0]))

#  importance of the words when using TfidfVectorizer on traning data
doc_dict = dict(zip(names, X.toarray()[0]))


# compare importance between query and doc
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T
df_qd

Unnamed: 0,query,doc
15th,0.0,0.463693
2024,0.0,0.463693
cloud,0.0,0.0
course,0.627914,0.374105
date,0.0,0.0
github,0.0,0.0
google,0.0,0.0
homeworks,0.0,0.0
jan,0.0,0.463693
listed,0.0,0.0


In [11]:
(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.23490553492076713)

In [12]:
X.dot(q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

scores = cosine_similarity(X, q).flatten()
print("scores", scores)

# sorting the documents by the relevance to the query question
np.argsort(scores)

scores [0.23490553 0.         0.         0.         0.59579005]


array([1, 2, 3, 0, 4])

Vectorizing all the documents

In [14]:
fields = ['section', 'question', 'text']
transformers = {} #vectorizers
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

In [15]:
query = "Do I need to know SQL to sign up for the January course?"

q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [16]:
# top 5 answers
score_index = np.argsort(score)
score_index[-5:]

array([209, 577, 861, 445, 328])

In [17]:
df.iloc[445].text

'Check this article. If you know everything in this article, you know enough. If you don’t, read the article and join the coursIntroduction to Pythone too :)\nIntroduction to Python – Machine Learning Bookcamp\nYou can follow this English course from the OpenClassrooms e-learning platform, which is free and covers the python basics for data analysis: Learn Python Basics for Data Analysis - OpenClassrooms . It is important to know some basics such as: how to run a Jupyter notebook, how to import libraries (and what libraries are), how to declare a variable (and what variables are) and some important operations regarding data analysis.\n(Mélanie Fouesnard)'

checking similarity

In [18]:
n = len(df)
score = np.zeros(n)

query = "I just singned up. Is it too late to join the course?"

for f in fields: 
    q = transformers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()
    score = score + f_score

In [19]:
score_index = np.argsort(score)[-5:]
score_index

array([  4,   1,   7, 448,   0])

In [20]:
df.iloc[score_index]

Unnamed: 0,course,section,question,text
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
448,machine-learning-zoomcamp,General course-related questions,I’m new to Slack and can’t find the course cha...,Here’s how you join a in Slack: https://slack....
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


searching and filtering

In [21]:
filters = {
    'course':'data-engineering-zoomcamp'
}

In [22]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int)
    score = score * mask

In [23]:
score_idx = np.argsort(score)[-5:]
print(score_idx)
df.iloc[score_idx]

943     4
944    34
945     1
946     7
947     0
Name: course, dtype: int64


Unnamed: 0,course,section,question,text
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
34,data-engineering-zoomcamp,General course-related questions,How can we contribute to the course?,Star the repo! Share it with friends if you fi...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...


Search with all the fields & boosting + filtering

In [24]:
# boosting one of the fields - question - to give it more importance than to others
query = "I just singned up. Is it too late to join the course?"

n = len(df)
score = np.zeros(n)
boost = {'question': 3}

for f in fields: 
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
  
    score = score + s * b

In [31]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int)
    score = score * mask

idx = np.argsort(-score)[:5]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcem

## creating class from all the section above

In [33]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [39]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin