In [1]:
# One-time environment setup: uncomment and run the lines below if the packages are missing.
# Note: the import cell also uses `fastembed` and `elasticsearch`, which are not installed by these lines.
#pip install -U minsearch qdrant_client
#!pip install sentence_transformers
#!pip install rouge


In [15]:
from tqdm.auto import tqdm
import requests
import pandas as pd
from elasticsearch import Elasticsearch
import minsearch
from tqdm.auto import tqdm
from minsearch import VectorSearch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from fastembed import TextEmbedding
import numpy as np
from qdrant_client import QdrantClient, models
from qdrant_client.http import models
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
from rouge import Rouge
import json

In [2]:
# Base location of the evaluation assets used throughout this notebook.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents are fetched as JSON. Fail fast on an HTTP error instead of trying to
# parse an error page as JSON; the timeout keeps a stalled connection from
# hanging the kernel indefinitely.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth is a CSV converted to a list of dicts; later cells read the
# 'question', 'course' and 'document' keys of each record.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: sequence of per-query relevance lists; an entry equal
            to ``True`` marks a relevant result.

    Returns:
        Number of lists containing ``True`` divided by the number of lists.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each per-query relevance list, the score is ``1 / rank`` of the FIRST
    entry equal to ``True`` (ranks start at 1), or 0 when there is none. The
    result is the average of these scores over all queries.

    Args:
        relevance_total: sequence of per-query relevance lists, ordered by rank.

    Returns:
        Mean reciprocal rank as a float in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant == True:  # noqa: E712 - keep equality semantics of the original check
                total_score += 1 / position
                # Only the first relevant hit contributes; without this break a
                # list with several True entries would add several reciprocal
                # ranks and the score could exceed 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` over the ground truth and score the results.

    Args:
        ground_truth: iterable of records; each record's 'document' key holds
            the id of the expected document.
        search_function: callable that receives the whole record and returns an
            iterable of results, each indexable by 'id'.

    Returns:
        Dict with the keys 'hit_rate' and 'mrr'.
    """
    def relevance_for(record):
        # Flag each returned result by whether it is the expected document
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

#### **Q1. Minsearch text**

**Index the documents and fit the index**

In [6]:
# Text index over the documents: 'course' is the keyword the search cell filters on,
# and the three text fields are the ones queried.
index = minsearch.Index(
    keyword_fields=['course', 'id'],
    text_fields=['question', 'text', 'section'],
)


In [7]:
# Fit the index on the loaded documents; the call's return value is echoed as the cell output.
index.fit(documents)

<minsearch.minsearch.Index at 0x72105528a7b0>

**define the search function**

In [8]:
def minsearch_search(query, course):
    """Search the module-level minsearch ``index`` within a single course.

    Args:
        query: query text passed to ``index.search``.
        course: value used in the 'course' filter.

    Returns:
        Whatever ``index.search`` returns for at most 5 requested results,
        with the 'question' field boosted by 1.5 and 'section' by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [None]:
# Peek at the first few records instead of echoing the entire document list,
# which would flood the notebook output.
documents[:3]

In [10]:
# Per-question relevance flags for the minsearch text search: for every
# ground-truth record, mark each returned result by whether its id equals
# the expected document id.
relevance_total = []
for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = minsearch_search(query=record['question'], course=record['course'])
    relevance_total.append([hit['id'] == expected_id for hit in hits])


  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Show only the first few relevance lists; the full list has one entry per ground-truth question.
relevance_total[:5]

In [12]:
# Q1 answer: share of ground-truth questions whose expected document appears
# among the results returned by minsearch_search.
hit = hit_rate(relevance_total)
print("hit rate: ",hit)

hit rate:  0.848714069591528


#### **Q2. Vector search for question**

**Extract the questions and embed them**

In [13]:
# Initialize an empty list to store question texts
texts = []

# Extract the 'question' field from each document and add it to the list
for doc in documents:
    t = doc['question']
    texts.append(t)

# Create a pipeline that first vectorizes the text using TF-IDF,
# then reduces the dimensionality using Truncated SVD (a form of LSA)
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),           # Convert text to TF-IDF vectors, ignoring rare terms (appearing in fewer than 3 documents)
    TruncatedSVD(n_components=128, random_state=1)  # Reduce TF-IDF matrix to 128 dimensions for compact representation
)

# Fit the pipeline to the question texts and transform them into vector representations
X = pipeline.fit_transform(texts)


In [14]:
# Create a vector search index with keyword-based filtering on the 'course' field.
# This allows you to later filter search results by course if needed.
vindex = VectorSearch(keyword_fields={'course'})

# Fit (index) the vector search model with the vector representations (X)
# and the original documents (used for metadata and keyword filtering).
vindex.fit(X, documents)


<minsearch.vector.VectorSearch at 0x7210545dc2f0>

In [15]:
def vector_search(query, course):
    """Vector search restricted to one course.

    Reads the module-level ``pipeline`` and ``vindex`` at call time, so it uses
    whichever objects those names are bound to when it is invoked.

    Args:
        query: query text; transformed with ``pipeline`` as a one-item batch.
        course: value used in the 'course' filter.

    Returns:
        Whatever ``vindex.search`` returns for at most 5 requested results.
    """
    return vindex.search(
        pipeline.transform([query]),
        filter_dict={'course': course},
        num_results=5,
    )

In [16]:
# Per-question relevance flags for the vector search: for every ground-truth
# record, mark each returned result by whether its id equals the expected
# document id.
relevance_vector_total = []
for record in tqdm(ground_truth):
    expected_id = record['document']
    hits = vector_search(query=record['question'], course=record['course'])
    relevance_vector_total.append([hit['id'] == expected_id for hit in hits])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [17]:
# Q2 answer: MRR of the vector search built on question embeddings only
mrr_rate = mrr(relevance_vector_total)
print("mrr rate:", mrr_rate)

mrr rate: 0.3571284489590088


#### **Q3. Vector search for question and answer**

In [18]:
# This section embeds the question together with the answer text
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# TF-IDF (terms must occur in at least 3 documents) followed by Truncated SVD,
# which reduces the TF-IDF matrix to 128 dimensions
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1),
)

# Fit on the combined question + answer texts and keep their vector representations
X = pipeline.fit_transform(texts)

# Rebuild the vector index on the new vectors, again with 'course' as the
# keyword field used for filtering
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

def vector_search(query, course):
    """Vector search restricted to one course (re-declared for this section).

    Behaves the same as the earlier definition of this name: it reads the
    module-level ``pipeline`` and ``vindex`` at call time, which at this point
    are the ones fitted on question + answer text.

    Args:
        query: query text; transformed with ``pipeline`` as a one-item batch.
        course: value used in the 'course' filter.

    Returns:
        Whatever ``vindex.search`` returns for at most 5 requested results.
    """
    embedded_query = pipeline.transform([query])
    return vindex.search(
        embedded_query,
        filter_dict={'course': course},
        num_results=5,
    )

In [19]:
# Initialize an empty list to store relevance information for each query
relevance_vector_total = []

# Iterate through each question in the ground_truth dataset, with a progress bar
for q in tqdm(ground_truth):
    # Get the document ID of the correct answer from the ground truth
    doc_id = q['document']
    
    # Perform a search using the question text and course context
    results = vector_search(query=q['question'], course=q['course'])
    
    # Create a list of boolean values indicating whether each retrieved document
    # matches the correct document ID (i.e., is relevant or not)
    relevance = [d['id'] == doc_id for d in results]
    
    # Add the relevance list to the overall results
    relevance_vector_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [20]:
# Q3 answer: hit rate of the vector search built on question + answer embeddings
# (rebinds `hit`, which earlier held the minsearch hit rate)
hit = hit_rate(relevance_vector_total)
print("hit rate: ",hit)

hit rate:  0.8210503566025502


#### **Q4. Qdrant**

In [21]:
# The cells below send requests to this address, so a Qdrant server must be reachable there.
client = QdrantClient("http://localhost:6333")  # local Qdrant instance

In [None]:
# Embedding model used both for indexing documents and for embedding queries in the Qdrant cells
embedding_model = TextEmbedding('jinaai/jina-embeddings-v2-small-en')
# Name of the Qdrant collection that the upload and search cells operate on
collection_name = "homework"

In [24]:
# Embed every document (question + answer text) in one batched call instead of
# invoking the model once per document; each embedding is converted to a plain
# Python list for the upload.
doc_texts = [doc['question'] + ' ' + doc['text'] for doc in documents]
doc_vectors = [vector.tolist() for vector in embedding_model.embed(doc_texts)]

# No other cell in this notebook creates the collection, so on a fresh Qdrant
# instance the upload would fail. Create it only when missing, so an existing
# collection is left untouched.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=len(doc_vectors[0]),  # dimensionality taken from the model output
            # NOTE(review): cosine distance is an assumption - confirm it matches
            # the configuration used to produce the recorded results.
            distance=models.Distance.COSINE,
        ),
    )

# Upload one point per document: the id is the document's position in
# `documents`, and the original document is attached as the payload.
client.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(id=idx, vector=vector, payload=doc)
        for idx, (doc, vector) in enumerate(zip(documents, doc_vectors))
    ],
)


In [26]:
def qdrant_search(query, course):
    """Query the Qdrant collection for points belonging to one course.

    Uses the module-level ``client``, ``embedding_model`` and ``collection_name``.

    Args:
        query: query text; embedded with ``embedding_model`` and the first
            embedding is used as the query vector.
        course: value the 'course' payload field must match.

    Returns:
        The ``points`` attribute of the ``query_points`` response, limited to 5.
    """
    query_vector = list(embedding_model.embed(query))[0]

    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )

    response = client.query_points(
        collection_name=collection_name,
        query=query_vector,
        query_filter=course_filter,
        limit=5,
    )
    return response.points

In [27]:
# Per-question relevance flags for the Qdrant search: for every ground-truth
# record, mark each returned point by whether the id stored in its payload
# equals the expected document id.
relevance_vector_qdrant = []
for record in tqdm(ground_truth):
    expected_id = record['document']
    points = qdrant_search(query=record['question'], course=record['course'])
    relevance_vector_qdrant.append([point.payload['id'] == expected_id for point in points])

  0%|          | 0/4627 [00:00<?, ?it/s]

In [28]:
# Q4 answer: MRR of the Qdrant-backed search
mrr_rate2 = mrr(relevance_vector_qdrant)
print("mrr rate:", mrr_rate2)

mrr rate: 0.8517722066133576


#### **Q5. Cosine similarity**

In [6]:
# RAG evaluation results; the cells below read the 'answer_llm', 'answer_orig'
# and 'question' columns of this frame.
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [4]:
# Fresh (unfitted) TF-IDF + Truncated SVD pipeline for the answer-similarity question.
# NOTE: this rebinds the module-level `pipeline` name that vector_search reads,
# so vector_search should not be called after this cell without rebuilding it.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [None]:
# Fit on one string per row: LLM answer, original answer and question joined by spaces
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [12]:
def compute_similarity(pipeline, record):
    """Cosine similarity between the LLM answer and the original answer.

    Args:
        pipeline: fitted transformer exposing ``transform(list_of_texts)``;
            the first row of each result is used as the text's vector.
        record: mapping with the keys 'answer_orig' and 'answer_llm'.

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when
        either vector has zero norm (the cosine is undefined there and the
        plain division would silently produce NaN).
    """
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']

    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]

    norm_product = np.linalg.norm(v_llm) * np.linalg.norm(v_orig)
    if norm_product == 0:
        # A text with no usable features maps to the zero vector; report no similarity
        return 0.0

    return v_llm.dot(v_orig) / norm_product


In [13]:
# Cosine similarity between LLM and original answer for every row of df_results
similarity = [
    compute_similarity(pipeline, record)
    for _, record in tqdm(df_results.iterrows(), total=len(df_results))
]

  0%|          | 0/1830 [00:00<?, ?it/s]

In [18]:
# Wrap the per-row similarities in a single-column frame so they can be summarised
df = pd.DataFrame(similarity)

In [20]:
# Q5 answer: summary statistics of the cosine similarities
df.describe()

Unnamed: 0,0
count,1830.0
mean,0.841584
std,0.173737
min,0.079093
25%,0.806927
50%,0.905812
75%,0.950711
max,0.996457


#### **Q6. Rouge**

In [29]:
# ROUGE scores for a single record (row position 10 of df_results).
# get_scores receives the LLM answer first and the original answer second;
# the result is indexed with [0] to get the scores for this one pair.
rouge_scorer = Rouge()
r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [18]:
# Hypotheses are the LLM answers and references are the original answers, in row order.
# Reading the columns directly avoids a Python-level pass over iterrows() and
# also works for an empty frame, where zip(*[]) cannot be unpacked into two names.
hyps = df_results["answer_llm"].tolist()
refs = df_results["answer_orig"].tolist()


In [22]:
# Score all hypothesis/reference pairs at once.
# With avg=True the result is indexed directly as a dict in the next cell,
# whereas the single-pair call earlier needed [0] on its result.
rouge = Rouge()
# Variant without avg, kept for reference:
#scores = rouge.get_scores(hyps, refs)
scores = rouge.get_scores(hyps, refs, avg=True)

In [28]:
# Q6 answer: the 'f' value of 'rouge-1' from the avg=True scores
scores['rouge-1']['f']

0.3516946452113944