In [1]:
# Required Libraries
!pip install -U minsearch qdrant_client



In [2]:
from minsearch import Index
import requests
import pandas as pd
from tqdm.auto import tqdm

# Base URL that all evaluation files used in this notebook are resolved against.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to search over (parsed from JSON).
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page,
# and bound the wait so a stalled connection cannot hang the notebook.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=60)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth records as a list of dicts; later cells read the
# 'question', 'course' and 'document' keys of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Parameters
    ----------
    relevance_total : sequence of sequences of bool
        One inner sequence per query; each flag says whether the result at
        that position is the expected document.

    Returns
    -------
    float
        Number of queries with at least one ``True`` flag divided by the
        number of queries.

    Raises
    ------
    ZeroDivisionError
        If ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank is 1-based); a query with no relevant result
    contributes 0. Stopping at the first hit keeps the score in [0, 1] even
    if a result list contains several relevant entries.

    Parameters
    ----------
    relevance_total : sequence of sequences of bool
        One inner sequence per query; each flag says whether the result at
        that position is the expected document.

    Returns
    -------
    float
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises
    ------
    ZeroDivisionError
        If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # `== True` matches the `True in line` test used by hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                break  # later hits must not add to this query's score

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Parameters
    ----------
    ground_truth : iterable of dict
        Records with at least a ``'document'`` key holding the id of the
        expected document. The whole record is passed to ``search_function``.
    search_function : callable
        Takes one ground-truth record and returns an iterable of result
        dicts, each with an ``'id'`` key.

    Returns
    -------
    dict
        ``{'hit_rate': ..., 'mrr': ...}`` computed over all records.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [4]:
## Question 1
import minsearch

# Text index over the documents: free-text matching on question/text/section,
# exact-match filtering on course/id. `Index` is the class imported from
# minsearch at the top of the notebook.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"],
)
index.fit(documents)

def minsearch_search(query, course):
    """Search the global minsearch ``index`` for ``query`` within one course.

    Parameters
    ----------
    query : str
        Free-text query.
    course : str
        Value the ``'course'`` keyword field must equal.

    Returns
    -------
    The value returned by ``index.search`` for at most 5 results, with the
    ``'question'`` field boosted by 1.5 and ``'section'`` by 0.1.
    """
    return index.search(
        query=query,
        filter_dict=dict(course=course),
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

# Score the minsearch text search with the shared `evaluate` helper instead of
# repeating its loop here. `evaluate` hands each ground-truth record to the
# search function, so the lambda unpacks the two fields minsearch_search needs.
minsearch_metrics = evaluate(
    ground_truth,
    lambda q: minsearch_search(query=q['question'], course=q['course']),
)

# Question 1 asks for the hit rate.
minsearch_metrics['hit_rate']

  0%|          | 0/4627 [00:00<?, ?it/s]

0.848714069591528

In [5]:
## Question 2
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline


## Embed the "question" field of every document
texts = [doc['question'] for doc in documents]

# TF-IDF features reduced by truncated SVD to 128 components per text;
# random_state is fixed so the SVD step is repeatable.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)
X = pipeline.fit_transform(texts)

## Index the embeddings with minsearch, with "course" as a filterable field
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)



def minsearch_vector_search(vector, course):
    """Query the global ``vindex`` with an embedding, restricted to one course.

    Parameters
    ----------
    vector
        Query embedding, passed unchanged to ``vindex.search``.
    course : str
        Value the ``'course'`` field must equal.

    Returns
    -------
    The value returned by ``vindex.search`` for at most 5 results.
    """
    course_filter = {'course': course}
    hits = vindex.search(
        vector,
        filter_dict=course_filter,
        num_results=5,
    )
    return hits

def question_text_vector(q):
    """Embed a query record's question and run the course-filtered vector search.

    Parameters
    ----------
    q : dict
        Record with ``'question'`` and ``'course'`` keys; other keys are ignored.

    Returns
    -------
    The results of ``minsearch_vector_search`` for the embedded question.
    """
    question, course = q['question'], q['course']

    # `pipeline` is looked up as a global at call time, so whichever
    # pipeline is currently bound to that name produces the query embedding.
    return minsearch_vector_search(pipeline.transform([question]), course)


# Score the question-only vector search with the shared `evaluate` helper
# instead of repeating its loop. question_text_vector accepts the ground-truth
# record directly because it only reads the 'question' and 'course' keys.
question_vector_metrics = evaluate(ground_truth, question_text_vector)

# Question 2 asks for the MRR.
question_vector_metrics['mrr']

  0%|          | 0/4627 [00:00<?, ?it/s]

0.35720048987825087

In [6]:
## Question 3
# Embed question and answer text together this time.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Refit the existing pipeline and rebind `vindex`. question_text_vector and
# minsearch_vector_search look both names up as globals at call time, so from
# here on they search the question+text embeddings.
# NOTE: re-running the Question 2 evaluation after this cell would therefore
# use these embeddings, not the question-only ones.
X = pipeline.fit_transform(texts)
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

# Score with the shared `evaluate` helper instead of repeating its loop.
question_text_metrics = evaluate(ground_truth, question_text_vector)

# Question 3 asks for the hit rate.
question_text_metrics['hit_rate']

  0%|          | 0/4627 [00:00<?, ?it/s]

0.8210503566025502

In [7]:
## Question 4
!pip install qdrant-client sentence-transformers
from qdrant_client import QdrantClient, models
client = QdrantClient("http://localhost:6333")


collection_name = "ml-zoomcamp-faq"
model_handle = "jinaai/jina-embeddings-v2-small-en"

# Create the collection
client.recreate_collection(
    collection_name="ml-zoomcamp-faq",
    vectors_config=models.VectorParams(
        size=512,
        distance=models.Distance.COSINE
    )
)

# Create Payload
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

# One point per document: the position in `documents` is the point id, the
# question and answer text joined by a space is what gets embedded with
# `model_handle`, and the full document is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle,
        ),
        payload=doc,
    )
    for position, doc in enumerate(documents)
]

client.upsert(collection_name=collection_name, points=points)

def vector_search(question, course):
    """Query the Qdrant collection for ``question`` within one course.

    Parameters
    ----------
    question : str
        Query text, embedded with ``model_handle``.
    course : str
        Value the ``"course"`` payload field must match.

    Returns
    -------
    list
        Payloads of at most 5 matching points, in the order Qdrant returns them.
    """
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )

    response = client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=course_filter,
        limit=5,
        with_payload=True,
    )

    return [point.payload for point in response.points]


# Per-query relevance flags for the Qdrant search. They are kept in
# `relevance_total` because the next cell computes the MRR from it.
relevance_total = []

for q in tqdm(ground_truth):
    results = vector_search(question=q['question'], course=q['course'])
    relevance_total.append([d['id'] == q['document'] for d in results])



  client.recreate_collection(


  0%|          | 0/4627 [00:00<?, ?it/s]

In [8]:
# MRR of the Qdrant-backed vector_search, computed from the relevance flags
# collected in `relevance_total` by the previous cell.
mrr(relevance_total)

0.8517722066133576

In [9]:
## Question 5
import numpy as np
def cosine(u, v):
    """Return the cosine similarity of two 1-D numpy vectors.

    Computed as ``u . v / (|u| * |v|)``, with each norm taken as the square
    root of the vector's dot product with itself. No special handling is done
    for zero-length vectors.
    """
    dot_uv = u.dot(v)
    norm_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    return dot_uv / norm_product

# Results file with the original answer, the LLM answer and the question per row.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

# Fresh TF-IDF + truncated SVD pipeline; this rebinds the global `pipeline`.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)

# Fit on the LLM answer, original answer and question of each row, joined by spaces.
fit_corpus = (
    df_results['answer_llm'] + ' ' + df_results['answer_orig'] + ' ' + df_results['question']
)
# NOTE(review): `X` receives whatever `fit` returns (the fitted pipeline in
# scikit-learn), not an embedding matrix; it is not read again in this cell.
X = pipeline.fit(fit_corpus)

X_llm = pipeline.transform(df_results['answer_llm'])
X_orig = pipeline.transform(df_results['answer_orig'])

# Row-wise cosine similarity between each LLM answer and its original answer.
cosine_similarities = [
    cosine(vec_llm, vec_orig) for vec_llm, vec_orig in zip(X_llm, X_orig)
]

average_cosine = np.mean(cosine_similarities)
print("Average cosine similarity:", average_cosine)

Average cosine similarity: 0.8415841233490402


In [10]:
## Question 6
!pip install rouge
from rouge import Rouge
rouge_scorer = Rouge()

rouge_1_f1_scores=[]
for _, row in tqdm(df_results.iterrows(), total=len(df_results)):
    try:
        score = rouge_scorer.get_scores(row.answer_llm, row.answer_orig)[0]
        rouge_1_f1_scores.append(score['rouge-1']['f'])
    except:
        rouge_1_f1_scores.append(0.0)  # fallback in case of error

avg_rouge_1_f1 = float(np.mean(rouge_1_f1_scores))
avg_rouge_1_f1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




  0%|          | 0/1830 [00:00<?, ?it/s]

0.3516946452113943