In [109]:
import json
import requests
import pandas as pd
from typing import Any

# Base location of the evaluation data files in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents, downloaded as JSON.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON decoding failure.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth table, converted to a list with one dict per CSV row.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [110]:
# Pretty-print the first ground-truth record followed by the first document,
# one JSON rendering per line group.
print(
    *(json.dumps(sample, indent=2) for sample in (ground_truth[0], documents[0])),
    sep="\n",
)

{
  "question": "When does the course begin?",
  "course": "data-engineering-zoomcamp",
  "document": "c02e79ef"
}
{
  "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
  "section": "General course-related questions",
  "question": "Course - When will the course start?",
  "course": "data-engineering-zoomcamp",
  "id": "c02e79ef"
}


In [111]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One sequence of relevance flags per query; a flag is
            ``True`` when the result at that position is the expected document.

    Returns:
        The share of queries whose flags contain ``True``. An empty
        ``relevance_total`` yields ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        # Nothing was evaluated, so there is nothing to average over.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total_queries

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes ``0``.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered
            by rank; a truthy flag marks a relevant result.

    Returns:
        The average reciprocal rank. An empty ``relevance_total`` yields
        ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        # Nothing was evaluated, so there is nothing to average over.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; summing later hits as well would
                # let a single query contribute more than 1.
                break

    return total_score / total_queries

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Args:
        ground_truth: Iterable of records; each must have a ``'document'``
            key holding the id of the expected document.
        search_function: Callable that receives one record and returns an
            iterable of results, each with an ``'id'`` key.

    Returns:
        Dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        # One flag per returned result, in the order the search returned them.
        per_query_flags.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

### Question 1

In [113]:
from minsearch import Index

# Build the minsearch index: question/section/text are declared as text
# fields, course/id as keyword fields (search_ms filters on `course`).
ms_index = Index(
    text_fields=["question", "section", "text"],
    keyword_fields=["course", "id"]
)

# Last expression of the cell, so its return value is shown as the output.
ms_index.fit(documents)

<minsearch.minsearch.Index at 0x30dfe62d0>

In [185]:
# Per-field boost values handed to ms_index.search by search_ms.
boost_dict = {'question': 1.5, 'section': 0.1}


def search_ms(query: dict[str, Any]) -> list[dict]:
    """Search the minsearch index for one ground-truth record.

    Args:
        query: Record providing the ``question`` text to search for and the
            ``course`` the results are filtered on.

    Returns:
        The value returned by ``ms_index.search`` when asked for
        ``num_results=5`` with the module-level ``boost_dict``.
    """
    question_text = query["question"]
    course_filter = {'course': query['course']}
    return ms_index.search(
        query=question_text,
        boost_dict=boost_dict,
        filter_dict=course_filter,
        num_results=5,
    )

In [115]:
# Score the minsearch text search against the ground-truth records.
# NOTE(review): the cell defining search_ms shows a later execution count than
# this cell, so the recorded output may come from an earlier definition of
# search_ms / boost_dict — re-run top to bottom to confirm.
evaluate(ground_truth=ground_truth, search_function=search_ms)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

### Question 2

In [116]:
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline


In [117]:
# Only the question of each document is embedded in this experiment.
texts = [doc['question'] for doc in documents]

# TF-IDF features reduced to 128 components; random_state is fixed on the SVD
# step so the pipeline is configured identically on every run.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
# One embedding row per entry of `documents`, in the same order.
X = pipeline.fit_transform(texts)

In [118]:
# Vector index over the question embeddings; `course` is the keyword field
# that search_vs filters on.
vindex = VectorSearch(keyword_fields={'course'})
# X holds one row per entry of `documents`, built in the same order.
# Last expression of the cell, so its return value is shown as the output.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x30e18ca10>

In [119]:
def search_vs(query: dict[str, Any]) -> list[dict]:
    """Search the question-only vector index for one ground-truth record.

    Args:
        query: Record providing the ``question`` text to embed and the
            ``course`` the results are filtered on.

    Returns:
        The value returned by ``vindex.search`` when asked for
        ``num_results=5``.
    """
    # transform() takes a batch, so the single question is wrapped in a list
    # and the first row of the result is used as the query vector.
    embedded = pipeline.transform([query["question"]])
    question_vector = embedded[0]

    course_filter = {'course': query['course']}
    return vindex.search(
        query_vector=question_vector,
        filter_dict=course_filter,
        num_results=5,
    )

In [120]:
# Score the vector search built from document questions only.
evaluate(ground_truth, search_vs)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48195374972984656, 'mrr': 0.3573085512571141}

### Question 3

In [121]:
# This experiment embeds the question together with the answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + SVD configuration as the question-only pipeline, kept under
# separate names so both variants stay available.
pipeline2 = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
# One embedding row per entry of `documents`, in the same order.
X2 = pipeline2.fit_transform(texts)

# Vector index over the combined embeddings; `course` is the keyword field
# that search_vs2 filters on.
vindex2 = VectorSearch(keyword_fields={'course'})
vindex2.fit(X2, documents)

def search_vs2(query: dict[str, Any]) -> list[dict]:
    """Search the question+text vector index for one ground-truth record.

    Args:
        query: Record providing the ``question`` text to embed and the
            ``course`` the results are filtered on.

    Returns:
        The value returned by ``vindex2.search`` when asked for
        ``num_results=5``.
    """
    # transform() takes a batch, so the single question is wrapped in a list
    # and the first row of the result is used as the query vector.
    embedded = pipeline2.transform([query["question"]])
    question_vector = embedded[0]

    course_filter = {'course': query['course']}
    return vindex2.search(
        query_vector=question_vector,
        filter_dict=course_filter,
        num_results=5,
    )

In [122]:
# Score the vector search built from question + answer text.
evaluate(ground_truth, search_vs2)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

### Question 4

In [123]:
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

# Embedding model used for both the indexed documents and the queries.
model_handle_name = "jinaai/jina-embeddings-v2-small-en"
limit = 5


# Print the fastembed metadata entry of the chosen model (the first match).
# Both sides are lower-cased so the lookup stays case-insensitive even if the
# handle above is written with different casing.
for model_info in TextEmbedding.list_supported_models():
    if model_info["model"].lower() == model_handle_name.lower():
        print(json.dumps(model_info, indent=2))
        break

{
  "model": "jinaai/jina-embeddings-v2-small-en",
  "sources": {
    "hf": "xenova/jina-embeddings-v2-small-en",
    "url": null,
    "_deprecated_tar_struct": false
  },
  "model_file": "onnx/model.onnx",
  "description": "Text embeddings, Unimodal (text), English, 8192 input tokens truncation, Prefixes for queries/documents: not necessary, 2023 year.",
  "license": "apache-2.0",
  "size_in_GB": 0.12,
  "additional_files": [],
  "dim": 512,
  "tasks": {}
}


```bash
docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [184]:
qd_client = QdrantClient(host="localhost", port=6333)
# Matches the "dim" value printed for the model in the listing above.
EMBEDDING_DIMENSIONALITY = 512

# Define the collection name
collection_name = "evaluation-rag"

# Create the collection only when it is missing, so re-running this cell
# against a Qdrant instance that already holds it does not fail.
if not qd_client.collection_exists(collection_name):
    qd_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )

# One point per document. The position in `documents` is used as the point
# id, and the loop variable is not called `id` so the builtin stays usable
# in later cells.
points = [
    models.PointStruct(
        id=point_id,
        # Question and answer text are embedded together.
        vector=models.Document(
            text=doc['question'] + ' ' + doc['text'],
            model=model_handle_name,
        ),
        # The whole document is stored as payload; search_qdrant filters on
        # its `course` field and returns the payloads.
        payload=doc,
    )
    for point_id, doc in enumerate(documents)
]

# Last expression of the cell, so the update result is shown as the output.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
def search_qdrant(query: dict[str, Any], limit=5):
    """Query the Qdrant collection for one ground-truth record.

    Args:
        query: Record providing the ``question`` text to embed and the
            ``course`` value that stored payloads must match.
        limit: Number of points requested from Qdrant.

    Returns:
        List with the payload of every point in the response, in response
        order.
    """
    # The question is wrapped in a Document with the same model handle that
    # was used when the points were created.
    question_doc = models.Document(
        text=query["question"],
        model=model_handle_name,
    )
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=query["course"]),
    )

    response = qd_client.query_points(
        collection_name=collection_name,
        query=question_doc,
        query_filter=models.Filter(must=[course_condition]),
        limit=limit,
        with_payload=True,  # payloads are what this function returns
    )

    payloads = []
    for point in response.points:
        payloads.append(point.payload)
    return payloads


In [139]:
# Score the Qdrant-backed search against the ground-truth records.
# NOTE(review): the collection cell shows a later execution count than this
# cell and the search_qdrant cell shows none, so the recorded output may not
# match the code as saved — re-run top to bottom to confirm.
evaluate(ground_truth, search_qdrant)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}

### Question 5

In [None]:
import numpy as np

def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        u: First vector; must support ``.dot`` (e.g. a NumPy array).
        v: Second vector of the same length.

    Returns:
        ``u·v / (|u| * |v|)``. When either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case instead of
        NaN (and the accompanying RuntimeWarning) from a 0/0 division.
    """
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    denominator = u_norm * v_norm
    if denominator == 0:
        return 0.0
    return u.dot(v) / denominator

In [159]:
# Load the CSV of results; later cells read its `answer_llm`, `answer_orig`
# and `question` columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [160]:
# Fit a TF-IDF + SVD pipeline on the combined LLM answer, original answer and
# question text of every result row.
# NOTE(review): this rebinds the global `pipeline` that search_vs reads at
# call time, so calling search_vs after this cell uses this pipeline instead
# of the one fitted on the document questions.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Last expression of the cell, so the fitted pipeline is shown as the output.
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)


0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [173]:

def cosine_similarity(pipeline, original_answer: str, llm_answer: str) -> float:
    """Embed two answers with ``pipeline`` and compare them with ``cosine``.

    Args:
        pipeline: Fitted object whose ``transform`` accepts a list of strings
            and returns one row per string.
        original_answer: The reference answer text.
        llm_answer: The generated answer text.

    Returns:
        The result of ``cosine`` on the two embedded answers.
    """
    # Each answer is transformed on its own (original first, then LLM), and
    # row 0 of each single-item batch is kept.
    vec_orig, vec_llm = (
        pipeline.transform([answer])[0]
        for answer in (original_answer, llm_answer)
    )
    return cosine(vec_orig, vec_llm)

In [175]:
# Similarity between the original and the LLM answer for every result row,
# in row order.
cosine_list = [
    cosine_similarity(pipeline, row["answer_orig"], row["answer_llm"])
    for row in df_results.to_dict(orient="records")
]

# Stored as a new column so it can be summarised with the other row data.
df_results["cosine_similarity"] = cosine_list

In [176]:
# Summary statistics of the per-row cosine similarity.
df_results["cosine_similarity"].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine_similarity, dtype: float64

### Question 6

In [182]:
from rouge import Rouge
# Scorer instance reused by the cells below.
rouge_scorer = Rouge()

# Spot-check the row at position 10: the LLM answer is passed first and the
# original answer second; the first element of the returned list is printed.
r = df_results.iloc[10]
score = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
print(score)


{'rouge-1': {'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}, 'rouge-2': {'r': 0.21621621621621623, 'p': 0.21621621621621623, 'f': 0.21621621121621637}, 'rouge-l': {'r': 0.3939393939393939, 'p': 0.3939393939393939, 'f': 0.393939388939394}}


In [180]:
# ROUGE-1 "f" value for every result row, in row order; the LLM answer is
# passed first and the original answer second, as in the spot check above.
rouge1_f1_scores = [
    rouge_scorer.get_scores(row["answer_llm"], row["answer_orig"])[0]["rouge-1"]["f"]
    for row in df_results.to_dict(orient="records")
]

# Stored as a new column so it can be summarised with the other row data.
df_results["rouge1_f1_scores"] = rouge1_f1_scores

In [181]:
# Summary statistics of the per-row ROUGE-1 "f" value.
df_results["rouge1_f1_scores"].describe()

count    1830.000000
mean        0.351695
std         0.158905
min         0.000000
25%         0.238887
50%         0.356300
75%         0.460133
max         0.950000
Name: rouge1_f1_scores, dtype: float64