In [1]:
# Get the data
import requests
import pandas as pd

# Base location of the course's evaluation assets on GitHub.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents; the search results built from them are matched on their 'id'.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# Bound the wait and fail loudly on an HTTP error instead of trying to parse
# an error page as JSON.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth rows; the evaluation below reads their 'question', 'course'
# and 'document' fields.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [2]:
# Define the evaluation metrics
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a hit.

    `relevance_total` holds one entry per query; each entry is a sequence of
    relevance flags, one per returned result. A query counts as a hit when
    `True` is a member of its flags.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    `relevance_total` holds one entry per query; each entry is a sequence of
    relevance flags ordered by rank. A query contributes 1 / rank of its
    FIRST relevant result (ranks start at 1), or 0 when nothing is relevant.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not truthiness) mirrors the `True in line` membership
            # test used by hit_rate.
            if is_relevant == True:
                total_score += 1 / rank
                # Only the first hit counts; without this, a list with several
                # hits would add several terms and could score above 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score `search_function` against every ground-truth record.

    Each record must provide the expected document id under 'document'.
    `search_function(record)` must return an iterable of dicts with an 'id'
    key. Returns a dict with the 'hit_rate' and 'mrr' of the collected
    per-query relevance flags.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        per_query_flags.append([hit['id'] == expected_id for hit in hits])

    scores = {}
    scores['hit_rate'] = hit_rate(per_query_flags)
    scores['mrr'] = mrr(per_query_flags)
    return scores

  from .autonotebook import tqdm as notebook_tqdm


## Q1

In [5]:
import minsearch

# Text-search index over the FAQ documents. 'id' is registered as a keyword
# field alongside 'course'.
index = minsearch.Index(
    keyword_fields=["course", "id"],
    text_fields=["question", "text", "section"],
)

index.fit(documents)

<minsearch.minsearch.Index at 0x76ef899518e0>

In [6]:
def min_search(query, course):
    """Return the top 5 hits of the minsearch `index` for `query` within `course`.

    The 'question' field is boosted by 1.5 and the 'section' field by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [7]:
# Q1: score the minsearch text search on every ground-truth record.
evaluate(ground_truth, lambda q: min_search(q['question'], q['course']))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:15<00:00, 297.91it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Embeddings

In [9]:
import minsearch

In [10]:
from minsearch import VectorSearch

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [9]:
# Represent each document by its question text only.
texts = [doc['question'] for doc in documents]

# TF-IDF (min_df=3) followed by a 128-component truncated SVD with a fixed seed.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

## Q2

In [10]:
# Vector index over the embeddings in X, paired with their source documents;
# 'course' is registered as a keyword field so searches can filter on it.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b4804e3ea20>

In [11]:
# defining vector search in minsearch
def vector_min_search(query, course):
    """Return the top 5 hits of `vindex` for the vector `query` within `course`."""
    course_filter = {'course': course}
    return vindex.search(
        query_vector=query,
        filter_dict=course_filter,
        num_results=5,
    )

In [12]:
# Q2: embed each ground-truth question with the fitted pipeline (transform
# takes a list and returns one row per input, hence the [0]) and search by
# that vector.
evaluate(
    ground_truth,
    lambda q: vector_min_search(pipeline.transform([q['question']])[0], q['course'])
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 644.35it/s]


{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

## Q3

In [14]:
# Represent each document by its question followed by its answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same pipeline shape as before, refitted on the longer texts.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

In [14]:
# Rebuild the vector index on the question+text embeddings; this rebinds the
# `vindex` that vector_min_search reads.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7b47f4f8b500>

In [15]:
# defining vector search in minsearch
def vector_min_search(query, course):
    """Search the current `vindex` with the vector `query`, limited to `course`.

    Returns the 5 best matches as produced by `vindex.search`.
    """
    search_kwargs = {
        'query_vector': query,
        'filter_dict': {'course': course},
        'num_results': 5,
    }
    return vindex.search(**search_kwargs)

In [16]:
# Q3: same evaluation as Q2, now against the index built from question + text.
evaluate(
    ground_truth,
    lambda q: vector_min_search(pipeline.transform([q['question']])[0], q['course'])
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:07<00:00, 603.72it/s]


{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

## Q4. Qdrant

##### First, pull the Qdrant Docker image, then start the container:
```
docker pull qdrant/qdrant
```

```
docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [3]:
from qdrant_client import QdrantClient, models

In [4]:
# Connect to the local Qdrant instance; port 6333 is the first port published
# by the `docker run` command above.
qd_client = QdrantClient("http://localhost:6333")

In [5]:
from fastembed import TextEmbedding

In [6]:
# Embedding model name, passed to models.Document for both the indexed
# documents and the queries.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [7]:
# Step 4: (re)create the Qdrant collection.
collection_name = "zoomcamp-03_evaluation"
# presumably the output size of the model in model_handle — confirm against the model card
EMBEDDING_DIMENSIONALITY = 512

# Delete any collection of this name first so that re-running this cell
# starts from a clean collection.
qd_client.delete_collection(collection_name=collection_name)

# Create the collection with specified vector parameters
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [8]:
# Index the 'course' payload field, which the search below filters on.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
# One point per document. The vector is given as a models.Document (text plus
# model name) built from question + answer text; the whole document dict is
# stored as the payload, and the point id is the document's position.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]

In [10]:
# Send all points to the collection in a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.22s/it]


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

def qd_search(query_text, course=None, limit=5):
    """Search the Qdrant collection and return the matched document ids.

    Args:
        query_text: Text to search for; passed as a models.Document together
            with `model_handle`.
        course: Optional course name. When given, only points whose 'course'
            payload field equals it are searched.
        limit: Maximum number of matches to return.

    Returns:
        A list of dicts of the form {"id": <document id>}, best match first.
    """
    # Build the filter only when a course was supplied, so the default
    # course=None searches the whole collection.
    course_filter = None
    if course is not None:
        course_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        )

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query_text,
            model=model_handle
        ),
        query_filter=course_filter,
        limit=limit,  # top closest matches
        with_payload=True  # needed to read the document id below
    )

    # The hits are in `results.points`. The point id is only the enumerate
    # position used at upsert time; the document's own id is in the payload,
    # and that is what evaluate() compares against.
    return [{"id": point.payload["id"]} for point in results.points]

In [15]:
def qd_search(query, course=None, limit=5):
    """Search the Qdrant collection and return the payloads of the matches.

    Args:
        query: Text to search for; passed as a models.Document together with
            `model_handle`.
        course: Optional course name. When given, only points whose 'course'
            payload field equals it are searched; when None, no filter is
            applied.
        limit: Maximum number of matches to return.

    Returns:
        A list with the payload dict of each matched point, best match first.
    """
    # The signature advertises course=None, so do not build a MatchValue
    # around None; skip the filter instead.
    course_filter = None
    if course is not None:
        course_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        )

    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle
        ),
        query_filter=course_filter,
        limit=limit,  # top closest matches
        with_payload=True  # payloads are what this function returns
    )

    return [d.payload for d in results.points]

In [12]:
# Try the Qdrant search on a single question, restricted to one course.
resu= qd_search('How to enroll to the course?', 'data-engineering-zoomcamp')

In [13]:
# qd_search already returns the list of payload dicts, so show it directly;
# the list has no `.points` attribute to unpack.
resu

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
  'section': 'General course-related questions',
  'question': 'Course - I have registered for the Data Engineering Bootc

In [16]:
# Q4: score the Qdrant search on every ground-truth record; qd_search applies
# the course filter itself.
evaluate(
    ground_truth,
    lambda q: qd_search(q['question'], q['course'])
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:36<00:00, 47.99it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

## Q5: Cosine Similarity

In [2]:
import requests
import pandas as pd

# Same repository prefix as the search-evaluation data. The CSV pairs each
# question with the LLM answer ('answer_llm') and the original answer
# ('answer_orig').
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [4]:
# Preview the first five rows.
df_results.head(5)

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


In [4]:
# Same data as a list of per-row dicts.
results_gpt4o = df_results.to_dict(orient='records')

In [39]:
# Inspect the first record to see the available fields.
record = results_gpt4o[0]
record

{'answer_llm': 'You can sign up for the course by visiting the course page at [http://mlzoomcamp.com/](http://mlzoomcamp.com/).',
 'answer_orig': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'document': '0227b872',
 'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp'}

In [11]:
# Fresh, unfitted TF-IDF + SVD pipeline for the answer-similarity task; this
# rebinds `pipeline`, replacing the one fitted on the FAQ documents.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [21]:
# Fit on one string per row: LLM answer, original answer and question joined
# by single spaces.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [61]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of `u` and `v`.

    Computed as the dot product of the two vectors divided by the product of
    their Euclidean norms. Both arguments must provide a `.dot` method.
    """
    dot_uv = u.dot(v)
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    return dot_uv / norm_product

In [62]:
def compute_similarity(record):
    """Return the cosine similarity between a record's two answers.

    `record` must expose `answer_llm` and `answer_orig` attributes (for
    example a row of `df_results`). Each text is embedded on its own with the
    fitted module-level `pipeline`, and the two vectors are compared with
    `cosine`.
    """
    # transform takes a list and returns one row per input, hence the [0].
    v_llm = pipeline.transform([record.answer_llm])[0]
    v_orig = pipeline.transform([record.answer_orig])[0]

    return cosine(v_llm, v_orig)

In [63]:
from tqdm.auto import tqdm

# Cosine similarity for every row of df_results, in row order.
similarity = [
    compute_similarity(df_results.iloc[row_pos])
    for row_pos in tqdm(range(len(df_results)))
]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [00:03<00:00, 540.67it/s]


In [65]:
# Attach the scores as a new column; `similarity` is in row order.
df_results['cosine'] = similarity

In [66]:
# Check that the new 'cosine' column is present.
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course,cosine
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp,0.463526
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp,0.781565
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp,0.889158
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp,0.614962
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp,0.624086


In [67]:
# Q5: summary statistics of the cosine scores. The assignment repeats the one
# made in an earlier cell with the same list, so it does not change the data.
df_results['cosine'] = similarity
df_results['cosine'].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine, dtype: float64

## Q6: Rouge

In [3]:
from rouge import Rouge
# One scorer instance, reused for every comparison below.
rouge_scorer = Rouge()

In [4]:
# Pick the row at position 10 as a worked example and show its fields.
r = df_results.iloc[10]
r.to_dict()

{'answer_llm': "Yes, all sessions are recorded, so if you miss one, you won't miss anything. You can catch up on the content later. Additionally, you can submit your questions in advance for office hours, and those sessions are also recorded.",
 'answer_orig': 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
 'document': '5170565b',
 'question': 'Are sessions recorded if I miss one?',
 'course': 'machine-learning-zoomcamp'}

In [5]:
# Score the LLM answer against the original answer; get_scores returns a
# list, and the first entry holds the scores for this pair.
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [6]:
# computing the rouge for the entire dataframe
from tqdm.auto import tqdm

# One ROUGE score dict per row of df_results, in row order.
rouge_idx = []

for row_pos in tqdm(range(len(df_results))):
    row = df_results.iloc[row_pos]
    pair_scores = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)
    rouge_idx.append(pair_scores[0])

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1830/1830 [00:05<00:00, 308.74it/s]


In [8]:
# Sanity check on the container type before indexing into it.
type(rouge_idx)

list

In [12]:
# Pull the 'f' value of 'rouge-1' out of every per-row score dict.
rouge1_f1 = [item['rouge-1']['f'] for item in rouge_idx]

In [14]:
# Q6: arithmetic mean of the per-row values.
average_rouge1_f1 = sum(rouge1_f1)/len(rouge1_f1)

In [15]:
# Display the Q6 answer.
average_rouge1_f1

0.3516946452113944