In [1]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
import pandas as pd

df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [None]:
Vector Spaces
- turn the docs into vectors
- term-docments matrix:
    - rows: documents
    - columns: words/tokens
- bag of words:
    - word order is lost
    - sparse matrix

In [4]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

CountVectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_cv = TfidfVectorizer(stop_words = 'english')
X = tf_cv.fit_transform(docs_example)

names = tf_cv.get_feature_names_out()

df_docs_tf = pd.DataFrame(X.toarray(), columns = names).T
df_docs_tf

In [36]:
df_docs_tf.shape

(19, 5)

Query-Document Similarity

In [39]:
query = "Do I need to know python to sign up for the January course?"

q = tf_cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [38]:
q.shape

(1, 19)

In [23]:
#  importance of the words when using TfidfVectorizer on test data
query_dict = dict(zip(names, q.toarray()[0]))

#  importance of the words when using TfidfVectorizer on traning data
doc_dict = dict(zip(names, X.toarray()[0]))


# compare importance between query and doc
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T
df_qd

Unnamed: 0,query,doc
15th,0.0,0.463693
2024,0.0,0.463693
cloud,0.0,0.0
course,0.627914,0.374105
date,0.0,0.0
github,0.0,0.0
google,0.0,0.0
homeworks,0.0,0.0
jan,0.0,0.463693
listed,0.0,0.0


In [22]:
(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.23490553492076713)

In [25]:
X.dot(q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 

scores = cosine_similarity(X, q).flatten()
print("scores", scores)

# sorting the documents by the relevance to the query question
np.argsort(scores)

scores [0.23490553 0.         0.         0.         0.59579005]


array([1, 2, 3, 0, 4])

Vectorizing all the documents

In [44]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [61]:
transformers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

In [54]:
query = "I just singned up. Is it too late to join the course?"

q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [45]:
query = "Do I need to know python to sign up for the January course?"

q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [55]:
np.argsort(score)

array([555, 537, 536, 535, 534, 533, 532, 531, 530, 529, 559, 558, 556,
       538, 554, 553, 552, 551, 550, 549, 548, 547, 546, 545, 544, 575,
       838, 514, 513, 512, 543, 542, 541, 540, 843, 842, 841, 840, 839,
       574, 837, 836, 835, 834, 833, 832, 863, 862, 861, 860, 539, 592,
       607, 606, 605, 604, 603, 602, 601, 600, 597, 596, 595, 594, 576,
       622, 621, 620, 618, 617, 616, 615, 614, 613, 612, 611, 610, 590,
       573, 572, 571, 570, 569, 567, 566, 564, 563, 562, 560, 591, 515,
       589, 586, 585, 584, 583, 582, 581, 580, 579, 578, 577, 477, 437,
       434, 433, 432, 463, 462, 461, 460, 459, 453, 479, 478, 438, 476,
       859, 856, 855, 854, 853, 852, 851, 850, 849, 848, 879, 421, 402,
       401, 400,  44, 429, 428, 427, 426, 425, 424, 423, 422, 878, 420,
       419, 418, 417, 416, 447, 446, 444, 443, 442, 441, 497, 480, 510,
       509, 508, 507, 506, 505, 504, 501, 500, 499, 498, 481, 496, 527,
       526, 524, 523, 521, 520, 519, 518, 517, 516, 494, 877, 87

In [59]:
df.iloc[15].text

'No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y\nOlder news:[source1] [source2]'