In [50]:
import requests
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Download the FAQ corpus: a JSON list of courses, each carrying its own
# list of documents under the 'documents' key.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps Run All from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()
documents = []

# Flatten to one record per document, tagging each with its course name.
# Note: this writes the 'course' key into the dicts held by documents_raw.
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
# One row per FAQ document; the explicit column list fixes the column order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)

# Preview the first rows.
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
# Small hand-written corpus of five short sentences, used by the following
# cells to demonstrate count and TF-IDF vectorization on readable data.
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [6]:
# Bag-of-words demo: raw term counts on the toy corpus, with English
# stop words removed.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Terms as rows, documents as columns.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [7]:
# TF-IDF demo on the same toy corpus: same vocabulary handling as the count
# cell, but the cells hold TF-IDF weights instead of raw counts.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Terms as rows, documents as columns; rounded only for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(decimals=2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.46
course,0.37,0.0,0.0,0.0,0.37
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.46
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


In [18]:
query = "Do I need to know Python to participate in the January course?"

# Project the query into the vocabulary of the vectorizer currently bound
# to `cv`, as a dense array with a single row.
q = cv.transform([query]).toarray()

# Map each vocabulary term to its weight in the query vector.
query_dict = {term: weight for term, weight in zip(names, q[0])}



In [54]:
# Fit one TF-IDF vectorizer per field of the FAQ frame and keep both the
# fitted vectorizer and its document-term matrix, keyed by field name.
fields = ['course', 'section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:

    # min_df=3: ignore terms that appear in fewer than three documents.
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])
    transformers[field] = cv
    matrices[field] = X

# NOTE: `cv`, `X`, `names`, `query` and `q` rebind names that the toy-corpus
# cells above also use; after this cell they refer to the full FAQ data.
names = transformers['text'].get_feature_names_out()
df_text_field = pd.DataFrame(matrices['text'].toarray(), columns=names).T

query = "I just signed up. Is it too late to join the course?"
q = transformers['text'].transform([query])
# Cosine similarity of every document's text against the query, as a 1-D array.
score = cosine_similarity(matrices['text'], q).flatten()
print('Highest score:', score.max())
print('Lowest score:', score.min())

# for the data-engineering course only
mask = (df['course'] == 'data-engineering-zoomcamp').values
score = score * mask
print('Highest score for data-engineering course:', score.max())

# Top 10 most similar documents (by text field), ordered from lowest to
# highest score (best match last).
# Rank only rows that belong to the selected course: after the multiplication
# above, rows from other courses have score 0, which ties with in-course rows
# that genuinely score 0 and could otherwise leak into the top 10.
course_rows = np.flatnonzero(mask)
top10 = course_rows[np.argsort(score[course_rows])[-10:]]

Highest score: 0.3336047027395824
Lowest score: 0.0
Highest score for data-engineering course: 0.3336047027395824


In [63]:
# Answer text of the selected rows, in the order given by `top10`.
df.iloc[top10]['text']

11     No, you can only get a certificate if you fini...
113    In the join queries, if we mention the column ...
7      Yes, we will keep all the materials after the ...
3      You don't need it. You're accepted. You can al...
287    This error could result if you are using some ...
38     You will have two attempts for a project. If t...
27     You can do most of the course without a cloud....
22     It's up to you which platform and environment ...
15     No, late submissions are not allowed. But if t...
0      The purpose of this document is to capture fre...
Name: text, dtype: object