In [1]:
!pip install -U pandas requests tqdm minsearch qdrant_client scikit-learn numpy sentence-transformers rouge



### Retrieval evaluation

In [2]:
import requests
import pandas as pd
from tqdm.auto import tqdm

# Base URL of the course's evaluation materials; the datasets below are fetched relative to it.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# FAQ documents (JSON) that the search indexes in later cells are built from.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents_response = requests.get(docs_url)
# Fail fast with an HTTP error instead of an opaque JSON decoding error when the download fails.
documents_response.raise_for_status()
documents = documents_response.json()

# Ground-truth queries, one dict per CSV row; later cells read the
# 'question', 'course' and 'document' keys of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One sequence of relevance flags per query, in rank
            order (truthy means the result at that rank is relevant).

    Returns:
        The share of queries whose flags contain a truthy value, or 0.0 when
        ``relevance_total`` is empty (rather than raising ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0
    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank (MRR) over all queries.

    Each query contributes ``1 / rank`` of its first relevant result (ranks
    start at 1), or 0 when none of its results is relevant.

    Args:
        relevance_total: One sequence of relevance flags per query, in rank
            order (truthy means the result at that rank is relevant).

    Returns:
        The average reciprocal rank, or 0.0 when ``relevance_total`` is empty
        (rather than raising ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several relevant results could score above 1.
                break
    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth queries.

    Args:
        ground_truth: Iterable of query records; each record's 'document'
            entry is the id of the document the search is expected to return.
        search_function: Callable that takes one query record and returns an
            iterable of result dicts, each with an 'id' entry.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    def relevance_flags(query):
        # One flag per returned result: does it match the expected document?
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    relevance_total = [relevance_flags(query) for query in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [3]:
import minsearch

# Text index over the FAQ documents: "question", "text" and "section" are the
# text fields, while "course" and "id" are registered as keyword fields
# ("course" is what the search function below filters on).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Build the index from the downloaded documents.
index.fit(documents)

def minsearch_search(q):
    """Run a minsearch text query for one ground-truth record.

    The query text is ``q['question']``, results are filtered to
    ``q['course']``, and at most five results are requested. The 'question'
    field is boosted by 1.5 and the 'section' field by 0.1.
    """
    return index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [4]:
# Hit rate and MRR of the minsearch text search over every ground-truth query.
minsearch_results = evaluate(ground_truth, minsearch_search)
print(minsearch_results)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from minsearch import VectorSearch

# Embed each document using only its question text.
question_texts = [d['question'] for d in documents]

# TF-IDF features reduced to 128 components with truncated SVD.
# NOTE: the name `pipeline` is rebound by later cells, so anything that reads
# it picks up whichever pipeline was assigned most recently.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit on the document questions and keep their embeddings for indexing.
X_q = pipeline.fit_transform(question_texts)

# Vector index over the question embeddings, with "course" as a keyword field.
vindex = VectorSearch(keyword_fields={'course'})

vindex.fit(X_q, documents)

<minsearch.vector.VectorSearch at 0x7b7ff9172000>

In [6]:
def vector_question_search(q):
    """Search the question-only vector index for one ground-truth record.

    The query ``q['question']`` is embedded with the module-level ``pipeline``
    and looked up in ``vindex``, filtered to ``q['course']``, requesting at
    most five results.

    Note: ``pipeline`` is resolved at call time and the name is rebound by
    later cells, so call this while ``pipeline`` is still the one whose
    embeddings were indexed in ``vindex``.
    """
    embedded = pipeline.transform([q['question']])
    return vindex.search(
        query_vector=embedded[0],
        filter_dict={'course': q['course']},
        num_results=5,
    )

In [7]:
# Hit rate and MRR of the vector search over question-only embeddings.
vector_question_results = evaluate(ground_truth, vector_question_search)
print(vector_question_results)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}


In [8]:
# Embed each document using its question and answer text joined by a space.
question_answer_texts = [d['question'] + ' ' + d['text'] for d in documents]

# NOTE: this rebinds `pipeline`; the question-only pipeline created earlier is
# no longer reachable under that name after this cell runs.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit on the combined texts and keep their embeddings for indexing.
X_qa = pipeline.fit_transform(question_answer_texts)

# Vector index over the question+answer embeddings, with "course" as a keyword field.
vindex_qa = VectorSearch(keyword_fields={'course'})

vindex_qa.fit(X_qa, documents)

<minsearch.vector.VectorSearch at 0x7b7ff9125c70>

In [9]:
def vector_question_answer_search(q):
    """Search the question+answer vector index for one ground-truth record.

    The query ``q['question']`` is embedded with the module-level ``pipeline``
    and looked up in ``vindex_qa``, filtered to ``q['course']``, requesting at
    most five results.

    Note: ``pipeline`` is resolved at call time and the name is rebound by
    later cells, so call this while ``pipeline`` is still the one whose
    embeddings were indexed in ``vindex_qa``.
    """
    embedded = pipeline.transform([q['question']])
    return vindex_qa.search(
        query_vector=embedded[0],
        filter_dict={'course': q['course']},
        num_results=5,
    )

In [10]:
# Hit rate and MRR of the vector search over question+answer embeddings.
vector_question_answer_results = evaluate(ground_truth, vector_question_answer_search)
print(vector_question_answer_results)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}


### Qdrant (started with Docker)

docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage:z \
    qdrant/qdrant

In [13]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient, models

# Sentence-embedding model used for both the documents and the queries.
model_name = 'jinaai/jina-embeddings-v2-small-en'
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally; keep it only for repositories you trust.
embedding_model = SentenceTransformer(model_name, trust_remote_code=True)

# Connect to the local Qdrant server (port 6333 is published by the docker command above).
client = QdrantClient(host="localhost", port=6333)

collection_name = "faq_collection"

# Create the collection only on the first run so the cell can be re-executed.
if not client.collection_exists(collection_name=collection_name):
    # If it doesn't exist, create it
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=512,  # presumably the embedding size of the model above; TODO confirm
            distance=models.Distance.COSINE
        )
    )
    print(f"Collection '{collection_name}' created.")
else:
    print(f"Collection '{collection_name}' already exists.")

Collection 'faq_collection' already exists.


In [15]:
from qdrant_client import models

# Embed each document's question and answer text, joined by a space.
print("Encoding documents... (This may take a while)")
combined_texts = [d['question'] + ' ' + d['text'] for d in documents]
vectors = embedding_model.encode(combined_texts, show_progress_bar=True)
print("Encoding complete.")

# Upload one point per document: the point id is the document's position in
# `documents`, and the document dict itself is stored as the payload.
print("Uploading vectors to Qdrant...")
client.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=idx, 
            vector=vector, 
            payload=doc
        ) for idx, (doc, vector) in enumerate(zip(documents, vectors)) 
    ],
    wait=True
)

print("Finished uploading to Qdrant.")

Encoding documents... (This may take a while)


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Encoding complete.
Uploading vectors to Qdrant...
Finished uploading to Qdrant.


In [25]:
def qdrant_search(q):
    """Query the Qdrant collection for one ground-truth record.

    The query ``q['question']`` is embedded with ``embedding_model`` and sent
    to ``client.query_points`` with a filter requiring the "course" payload
    field to match ``q['course']``, asking for at most five points.

    Returns:
        List with the payload of each returned point.
    """
    encoded_question = embedding_model.encode(q['question'])

    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=q['course']),
            )
        ]
    )

    response = client.query_points(
        collection_name=collection_name,
        query=encoded_question,
        query_filter=course_filter,
        limit=5,
    )
    return [point.payload for point in response.points]

In [26]:
# Hit rate and MRR of the Qdrant search over sentence-transformer embeddings.
qdrant_results = evaluate(ground_truth, qdrant_search)
print(qdrant_results)

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}


### Quality evaluation

In [28]:
import numpy as np

# Load the RAG results CSV; later cells read its 'answer_llm', 'answer_orig'
# and 'question' columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# TF-IDF + truncated SVD embedding pipeline (128 components).
# NOTE: this rebinds `pipeline`, replacing the pipeline used by the retrieval
# cells above.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Build one corpus from the LLM answers, the original answers and the questions.
all_texts = df_results['answer_llm'].tolist() + \
            df_results['answer_orig'].tolist() + \
            df_results['question'].tolist()

# Fit the pipeline on all available text
pipeline.fit(all_texts)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


In [30]:
def normalize(v):
    """Scale ``v`` to unit Euclidean length.

    A vector whose norm is zero is returned unchanged, which avoids a
    division by zero.
    """
    length = np.linalg.norm(v)
    return v if length == 0 else v / length

def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Both vectors are passed through ``normalize`` and the dot product of the
    results is returned.
    """
    return np.dot(normalize(u), normalize(v))

In [31]:
# Cosine similarity between the LLM answer and the original answer, one value per row.
similarities = []

for _, row in tqdm(df_results.iterrows(), total=df_results.shape[0]):
    # Get the texts
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']
    
    # Embed both answers with the fitted pipeline (one text per transform call)
    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]
    
    # Calculate and store the similarity
    sim = cosine(v_llm, v_orig)
    similarities.append(sim)

# Average over all rows
average_cosine = np.mean(similarities)

print(f"Average Cosine Similarity: {average_cosine}")

  0%|          | 0/1830 [00:00<?, ?it/s]

Average Cosine Similarity: 0.7463632445867671


In [32]:
from rouge import Rouge

# Scorer reused for every answer pair in the ROUGE cell below.
rouge_scorer = Rouge()

In [34]:
# ROUGE-1 F1 of each LLM answer scored against the original answer.
scores_f1 = []

for _, row in tqdm(df_results.iterrows(), total=df_results.shape[0]):
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']
    
    # Skip rows where either answer is not a string (presumably missing values).
    if not isinstance(answer_llm, str) or not isinstance(answer_orig, str):
        continue

    try:
        # Only the first entry of the returned scores is used.
        scores = rouge_scorer.get_scores(answer_llm, answer_orig)[0]
        scores_f1.append(scores['rouge-1']['f'])
    except ValueError:
        # Best effort: rows the scorer rejects are left out of the average.
        continue

# Mean over the rows that were actually scored.
average_rouge1_f1 = np.mean(scores_f1)

print(f"Average Rouge-1 F1-Score: {average_rouge1_f1}")

  0%|          | 0/1830 [00:00<?, ?it/s]

Average Rouge-1 F1-Score: 0.3516946452113943


### rag_evaluation

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Base URL of the course's evaluation materials.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
# RAG results CSV; later cells read its 'answer_llm', 'answer_orig' and
# 'question' columns.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

# TF-IDF + truncated SVD embedding pipeline (128 components).
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# One corpus from the LLM answers, the original answers and the questions;
# missing values are dropped before fitting.
all_texts = pd.concat([
    df_results['answer_llm'],
    df_results['answer_orig'],
    df_results['question']
]).dropna().tolist()

print("Fitting the pipeline...")
pipeline.fit(all_texts)
print("Pipeline fitted.")

Fitting the pipeline...
Pipeline fitted.


In [4]:
def normalize(v):
    """Scale ``v`` to unit Euclidean length.

    A vector whose norm is zero is returned unchanged, which avoids a
    division by zero.
    """
    length = np.linalg.norm(v)
    return v if length == 0 else v / length

def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Both vectors are passed through ``normalize`` and the dot product of the
    results is returned.
    """
    # The homework simplifies this, but normalizing both vectors is safer.
    return np.dot(normalize(u), normalize(v))

In [6]:
# Cosine similarity between the LLM answer and the original answer, one value
# per row that has both answers.
similarities = []

for _, row in tqdm(df_results.iterrows(), total=df_results.shape[0]):
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']
    
    # Skip rows where either answer is missing.
    if pd.isna(answer_llm) or pd.isna(answer_orig):
        continue
    
    # Embed both answers with the fitted pipeline (one text per transform call).
    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]
    
    sim = cosine(v_llm, v_orig)
    similarities.append(sim)

# Average over the rows that were not skipped.
average_cosine = np.mean(similarities)

print(f"Average Cosine Similarity: {average_cosine}")

  0%|          | 0/1830 [00:00<?, ?it/s]

Average Cosine Similarity: 0.7463632445867671


In [7]:
from rouge import Rouge

# Scorer reused for every answer pair in the ROUGE cell below.
rouge_scorer = Rouge()

In [8]:
# ROUGE-1 F1 of each LLM answer scored against the original answer.
scores_f1 = []

for _, row in tqdm(df_results.iterrows(), total=df_results.shape[0]):
    answer_llm = row['answer_llm']
    answer_orig = row['answer_orig']
    
    # Skip rows where either answer is not a string (presumably missing values).
    if not isinstance(answer_llm, str) or not isinstance(answer_orig, str):
        continue

    try:
        # Only the first entry of the returned scores is used.
        scores = rouge_scorer.get_scores(answer_llm, answer_orig)[0]
        scores_f1.append(scores['rouge-1']['f'])
    except ValueError:
        # Best effort: rows the scorer rejects are left out of the average.
        continue

# Mean over the rows that were actually scored.
average_rouge1_f1 = np.mean(scores_f1)

print(f"Average Rouge-1 F1-Score: {average_rouge1_f1}")

  0%|          | 0/1830 [00:00<?, ?it/s]

Average Rouge-1 F1-Score: 0.3516946452113943
