In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus: five short, course-related announcements, one string per document.
# The list order is the row order of every document-term matrix fitted on it.
DOC_EXAMPLES = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud",
]

In [3]:
# Indexing via word representation: Bag of Words (BoW)
#
# The BoW objects get their own names instead of the generic `vectorizer`/`D`.
# Those generic names are defined by the TF-IDF cell and read by the cosine
# similarity cell; binding them here as well would make the retrieval result
# depend on which of the two indexing cells happened to run last.
bow_vectorizer = CountVectorizer(stop_words="english")  # English stop words are dropped
bow_counts = bow_vectorizer.fit_transform(DOC_EXAMPLES)  # sparse document-term count matrix

# Dense view for display only: one row per document, one column per vocabulary term.
pd.DataFrame(bow_counts.toarray(), columns=bow_vectorizer.get_feature_names_out())

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0
1,1,0,1,0,0,0,0,1,1,0,1,0,0,0,0
2,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1
3,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0
4,0,1,1,0,0,1,0,1,0,0,0,1,0,1,0


In [4]:
# Indexing via word representation: TF-IDF
# Learn the vocabulary and IDF weights from the corpus and weight each document
# in a single pass; English stop words are excluded from the vocabulary.
vectorizer = TfidfVectorizer(stop_words="english")
D = vectorizer.fit_transform(raw_documents=DOC_EXAMPLES)

# Display only: dense copy of the sparse matrix, labelled by term and rounded
# to two decimals. `D` itself keeps the full-precision weights.
pd.DataFrame(
    data=D.toarray(),
    columns=vectorizer.get_feature_names_out(),
).round(decimals=2)

Unnamed: 0,catalog,cloud,course,details,end,google,homework,january,listed,month,prerequisites,python,register,setup,submit
0,0.0,0.0,0.33,0.69,0.0,0.0,0.0,0.33,0.0,0.0,0.0,0.0,0.56,0.0,0.0
1,0.57,0.0,0.27,0.0,0.0,0.0,0.0,0.27,0.57,0.0,0.46,0.0,0.0,0.0,0.0
2,0.0,0.0,0.23,0.0,0.47,0.0,0.47,0.23,0.0,0.47,0.0,0.0,0.0,0.0,0.47
3,0.0,0.0,0.36,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.61,0.0,0.61,0.0,0.0
4,0.0,0.47,0.23,0.0,0.0,0.47,0.0,0.23,0.0,0.0,0.0,0.47,0.0,0.47,0.0


In [5]:
# IR: Cosine Similarity
# Score every indexed document against a free-text query.
query = "Do I need to know python to sign up for the January course?"

# Map the query into the already-fitted vocabulary; `transform` does not refit,
# so query words outside that vocabulary contribute nothing to the vector.
q = vectorizer.transform([query])

# `cosine_similarity(D, q)` has one row per document and a single column for the
# lone query; flatten it to a 1-D array of per-document scores.
cosine_similarity(D, q).ravel()

array([0.25955955, 0.21371415, 0.17843726, 0.28419115, 0.57137158])