In [5]:
import pandas as pd
import requests

print('Getting document...')
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

print('Document sample: ')
print(documents[0])

df = pd.DataFrame(documents, columns=['course','section','question','text'])
print(df.head())

print('Filtering to data-engineering-zoomcamp')
df_de = df[df.course == 'data-engineering-zoomcamp']
print(len(df), '->',len(df_de))

Getting document...
Document sample: 
{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp'}
                      course                           section  \
0  data-engineering-zoomcamp  General course-related questions   
1  data-engineering-zoomcamp  General course-related questions   
2  data-engineering-zoomcamp  General course-related questions   
3  data-engineering-zoomcamp  General course-related questions   
4  data-engineeri

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df_de.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs

Unnamed: 0,01,04,05,10,100,11,12,13,16,17,...,www,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
432,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
433,0,0,0,2,0,0,1,0,0,0,...,0,1,1,0,0,0,3,0,0,1


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df_de.text)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names)
df_docs

Unnamed: 0,01,04,05,10,100,11,12,13,16,17,...,www,yaml,year,yellow,yellow_tripdata_2021,yes,yml,youtube,zip,zoomcamp
0,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.460365
2,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.344378,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
430,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
431,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
432,0.0,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
433,0.0,0.0,0.0,0.088511,0.0,0.00000,0.050737,0.0,0.0,0.0,...,0.000000,0.048926,0.050737,0.0,0.0,0.000000,0.132766,0.0,0.0,0.038826


In [15]:
query = 'Do i need to know python to sign up for the January course?'

q = cv.transform([query])

query_dict = dict(zip(names,q.toarray()[0]))
print(query_dict)

doc_dict = dict(zip(names,q.toarray()[1]))
print(doc_dict)


{'01': 0.0, '04': 0.0, '05': 0.0, '10': 0.0, '100': 0.0, '11': 0.0, '12': 0.0, '13': 0.0, '16': 0.0, '17': 0.0, '2019': 0.0, '2024': 0.0, '22': 0.0, '24': 0.0, '403': 0.0, '5431': 0.0, '5432': 0.0, '7077': 0.0, '80': 0.0, '8080': 0.0, 'able': 0.0, 'access': 0.0, 'account': 0.0, 'activate': 0.0, 'add': 0.0, 'added': 0.0, 'adding': 0.0, 'additional': 0.0, 'address': 0.0, 'admin': 0.0, 'airflow': 0.0, 'alexey': 0.0, 'allows': 0.0, 'alternative': 0.0, 'alternatively': 0.0, 'anaconda': 0.0, 'anaconda3': 0.0, 'analytics': 0.0, 'anand': 0.0, 'ans': 0.0, 'answer': 0.0, 'apache': 0.0, 'api': 0.0, 'app': 0.0, 'appear': 0.0, 'appears': 0.0, 'append': 0.0, 'application': 0.0, 'apply': 0.0, 'appname': 0.0, 'apt': 0.0, 'archives': 0.0, 'argument': 0.0, 'ask': 0.0, 'assigned': 0.0, 'attempting': 0.0, 'auth': 0.0, 'authentication': 0.0, 'automatically': 0.0, 'available': 0.0, 'avoid': 0.0, 'azure': 0.0, 'bad': 0.0, 'base': 0.0, 'based': 0.0, 'bash': 0.0, 'bashrc': 0.0, 'best': 0.0, 'better': 0.0, 'big

IndexError: index 1 is out of bounds for axis 0 with size 1

In [25]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

fields = ['section','question','text']

matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df_de[f])
    matrices[f] = X
    vectorizers[f] = cv
    
n = len(df_de)
score = np.zeros(n)

boosts = {
    'question': 3,
    'text': 0.5
}

query = 'I just discovered the course, is it too late to join?'

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X,q).flatten()

    boost = boosts.get(f, 1.0)

    score = score + f_score

In [26]:
idx = np.argsort(score)[-5:]

In [28]:
df_de.iloc[idx]

Unnamed: 0,course,section,question,text
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."


In [29]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')