In [24]:
import requests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Download the FAQ dump: a JSON list of course entries, each holding a
# 'course' name and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever if the host stops responding.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list of records, tagging every
# record with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Written onto the record in place, so documents_raw sees it too.
        doc['course'] = course_name
        documents.append(doc)

In [13]:
# One row per flattened record, with the columns pinned to a fixed order.
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)

# Preview the first rows only.
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [14]:
# Distinct section titles of one course, selected with a single .loc lookup.
df.loc[df['course'] == 'data-engineering-zoomcamp', 'section'].unique()

array(['General course-related questions',
       'Module 1: Docker and Terraform',
       'Module 2: Workflow Orchestration', 'Module 3: Data Warehousing',
       "error: Error while reading table: trips_data_all.external_fhv_tripdata, error message: Parquet column 'DOlocationID' has type INT64 which does not match the target cpp_type DOUBLE.",
       'Module 4: analytics engineering with dbt', 'Module 5: pyspark',
       'Module 6: streaming with kafka', 'Project',
       'Course Management Form for Homeworks', 'Workshop 1 - dlthub',
       'Workshop 2 - RisingWave', 'Triggers in Mage via CLI'],
      dtype=object)

In [None]:
# Five short sentences used as a toy corpus for the vectorizer demos.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

# Learn the vocabulary from the toy corpus; fit() hands back the estimator,
# so the fitted vectorizer can be bound directly and echoed as cell output.
cv = CountVectorizer().fit(docs_example)
cv

In [None]:
# Terms of the fitted vocabulary, in feature-column order.
names = cv.get_feature_names_out()
names

array(['and', 'by', 'catalog', 'cloud', 'course', 'details', 'end', 'for',
       'google', 'homework', 'in', 'january', 'listed', 'month', 'no',
       'now', 'of', 'prerequisites', 'python', 'register', 'setup',
       'submit'], dtype=object)

In [None]:
# Term -> column-index mapping learned by the fitted vectorizer.
# The label is capitalised to agree with the output recorded for this cell.
print("Vocabulary:", cv.vocabulary_)

Vocabulary: {'january': 11, 'course': 4, 'details': 5, 'register': 19, 'now': 15, 'prerequisites': 17, 'listed': 12, 'in': 10, 'catalog': 2, 'submit': 21, 'homework': 9, 'by': 1, 'end': 6, 'of': 16, 'month': 13, 'for': 7, 'no': 14, 'setup': 20, 'python': 18, 'and': 0, 'google': 8, 'cloud': 3}


In [None]:
# Encode the example sentences with the already-fitted vectorizer:
# one row per sentence, one column per vocabulary term.
X = cv.transform(docs_example)

# Densify for printing; the label is capitalised to agree with the output
# recorded for this cell.
print("Feature Matrix:\n", X.toarray())

Feature Matrix:
 [[0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0]
 [0 0 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0]
 [0 1 0 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 0 0]
 [1 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0]]


In [22]:
# Label the columns with their terms, then flip the table so each row is a
# term and each column is one example sentence.
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4
and,0,0,0,0,1
by,0,0,1,0,0
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
for,0,0,0,1,0
google,0,0,0,0,1
homework,0,0,1,0,0


In [23]:
# Refit on the same sentences, this time with scikit-learn's built-in English
# stop-word list enabled.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Terms as rows, example sentences as columns.
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
google,0,0,0,0,1
homework,0,0,1,0,0
january,1,1,1,1,1
listed,0,1,0,0,0
month,0,0,1,0,0


In [26]:
# Same corpus and stop-word setting, but vectorized with TfidfVectorizer
# in place of CountVectorizer.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Terms as rows, example sentences as columns; rounded only for display.
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs.round(decimals=2)

Unnamed: 0,0,1,2,3,4
catalog,0.0,0.57,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.47
course,0.33,0.27,0.23,0.36,0.23
details,0.69,0.0,0.0,0.0,0.0
end,0.0,0.0,0.47,0.0,0.0
google,0.0,0.0,0.0,0.0,0.47
homework,0.0,0.0,0.47,0.0,0.0
january,0.33,0.27,0.23,0.36,0.23
listed,0.0,0.57,0.0,0.0,0.0
month,0.0,0.0,0.47,0.0,0.0


In [27]:
# Free-text question to compare against the example sentences.
query = "Do I need to know python to sign up for the January course?"

# transform() reuses the vocabulary the vectorizer was fitted with, so the
# query lands in the same feature space as X. It is wrapped in a list because
# the vectorizer takes a collection of documents.
q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.829279  , 0.        , 0.        , 0.        ]])

In [29]:
# Pair each vocabulary term with its weight in the (single-row) query vector.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict

{'catalog': 0.0,
 'cloud': 0.0,
 'course': 0.39515588491314224,
 'details': 0.0,
 'end': 0.0,
 'google': 0.0,
 'homework': 0.0,
 'january': 0.39515588491314224,
 'listed': 0.0,
 'month': 0.0,
 'prerequisites': 0.0,
 'python': 0.8292789960182417,
 'register': 0.0,
 'setup': 0.0,
 'submit': 0.0}

In [32]:
# Same term -> weight view for row 1 of X, i.e. the second example sentence.
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}
doc_dict

{'catalog': 0.5675015398728066,
 'cloud': 0.0,
 'course': 0.2704175244456293,
 'details': 0.0,
 'end': 0.0,
 'google': 0.0,
 'homework': 0.0,
 'january': 0.2704175244456293,
 'listed': 0.5675015398728066,
 'month': 0.0,
 'prerequisites': 0.45785666908911726,
 'python': 0.0,
 'register': 0.0,
 'setup': 0.0,
 'submit': 0.0}