In [56]:
pip install -U minsearch qdrant_client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [57]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

### retrieval evaluation

In [58]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
    # for q in ground_truth:
        # print(q)
    #     break
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        # 'relevance_total': relevance_total
    }

In [59]:
## set minsearch
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x72842f352300>

In [60]:
def minsearch_search(q, boost = {'question': 1.5, 'section': 0.1}):

    results = index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict=boost,
        num_results=5
    )

    return results

## Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

```boost = {'question': 1.5, 'section': 0.1}```

What's the hitrate for this approach?

- 0.64
- 0.74
- **0.84**
- 0.94

In [61]:
boost = {'question': 1.5, 'section': 0.1}

minisearch_evaluation_results = evaluate(ground_truth, minsearch_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [62]:
minisearch_evaluation_results

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Embeddings

In [63]:
from minsearch import VectorSearch

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [65]:
# embeddings for the "question" field:

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [66]:
# embeddings for the "question" in ground_truth dataset:

ground_truth_questions = []

ground_truth_questions = [None] * len(ground_truth)

for i, q in enumerate(ground_truth):
     ground_truth_questions[i] = q['question']
    
Y = pipeline.transform(ground_truth_questions)

for i, q in enumerate(ground_truth):
    q['vector_question'] = Y[i]

## Q2. Vector search for question

Now let's index these embeddings with minsearch:

```
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)
```

Evaluate this seach method. What's MRR for it?

- 0.25
- **0.35**
- 0.45
- 0.55

In [67]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7284236c6150>

In [68]:
def minsearch_search_vector(q):
   
    results = vindex.search(
        query_vector=q['vector_question'],
        filter_dict={'course': q['course']},
        num_results=5,
        output_ids=True
    )

    return results

In [69]:
minisearch_vector_evaluation_results = evaluate(ground_truth, minsearch_search_vector)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [70]:
minisearch_vector_evaluation_results

{'hit_rate': 0.48173762697212014, 'mrr': 0.3572833369353793}

## Q3. Vector search for question and answer

Using the same pipeline `(min_df=3 for TF-IDF vectorizer and n_components=128 for SVD)`, evaluate the performance of this approach

What's the hitrate?

- 0.62
- 0.72
- **0.82**
- 0.92

In [71]:
# embedding both question and answer

# embeddings for the "question" field:

texts_1 = []

for doc in documents:
    t = doc['question'] + ' ' + doc['text']
    texts_1.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

X_1 = pipeline.fit_transform(texts_1)

# embeddings for the "question" in ground_truth dataset in the new vector space

ground_truth_questions_1 = []

ground_truth_questions_1 = [None] * len(ground_truth)

for i, q in enumerate(ground_truth):
     ground_truth_questions_1[i] = q['question']
    
Y_1 = pipeline.transform(ground_truth_questions_1)

for i, q in enumerate(ground_truth):
    q['vector_question_1'] = Y_1[i]

In [72]:
vindex_1 = VectorSearch(keyword_fields={'course'})
vindex_1.fit(X_1, documents)

<minsearch.vector.VectorSearch at 0x7284232cfb90>

In [74]:
def minsearch_search_vector_1(q):
   
    results = vindex_1.search(
        query_vector=q['vector_question_1'],
        filter_dict={'course': q['course']},
        num_results=5,
    )
    
    return results

In [75]:
minisearch_vector_evaluation_results_1 = evaluate(ground_truth, minsearch_search_vector_1)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [76]:
minisearch_vector_evaluation_results_1

{'hit_rate': 0.8210503566025502, 'mrr': 0.6717347453353508}

### Q4. Qdrant

Now let's evaluate the following settings in Qdrant:

```
text = doc['question'] + ' ' + doc['text']
model_handle = "jinaai/jina-embeddings-v2-small-en"
limit = 5
```

What's the MRR?

- **0.65**
- 0.75
- 0.85
- 0.95

In [78]:
from qdrant_client import QdrantClient, models

In [79]:
qd_client = QdrantClient("http://localhost:6333")

In [84]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"
collection_name = "homework-3"

In [85]:
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [86]:
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [92]:
def vector_search(q):
    # print('vector_search is used')
    
    # course = 'data-engineering-zoomcamp'
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=q['question'],
            model=model_handle 
        ),
        # query_filter=models.Filter( 
        #     must=[
        #         models.FieldCondition(
        #             key="course",
        #             match=models.MatchValue(value=course)
        #         )
        #     ]
        # ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

In [93]:
qdrant_vector_evaluation = evaluate(ground_truth, vector_search)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [94]:
qdrant_vector_evaluation

{'hit_rate': 0.7605359844391615, 'mrr': 0.6801671349326426}

## Q5. Cosine simiarity

What's the average cosine?

- 0.64
- 0.74
- **0.84**
- 0.94

In [119]:
def cosine(u, v):
    u_norm = np.sqrt(u.dot(u))
    v_norm = np.sqrt(v.dot(v))
    return u.dot(v) / (u_norm * v_norm)

In [120]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)
results_gpt4o = df_results.to_dict(orient='records')

In [109]:
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

In [132]:
# embeddings for each pair of answers (llm generated and original from the ground truth dataset)

def compute_similarity(record):
    answer_orig = record['answer_orig']
    answer_llm = record['answer_llm']
    
    v_llm = pipeline.transform([answer_llm])[0]
    v_orig = pipeline.transform([answer_orig])[0]
    
    return cosine(v_llm, v_orig)

similarity = []

for record in tqdm(results_gpt4o):
    sim = compute_similarity(record)
    similarity.append(sim)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [133]:
df_results['cosine'] = similarity
df_results['cosine'].describe()

count    1830.000000
mean        0.841584
std         0.173737
min         0.079093
25%         0.806927
50%         0.905812
75%         0.950711
max         0.996457
Name: cosine, dtype: float64

## Q6. Rouge

In [134]:
! pip install rouge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [135]:
from rouge import Rouge
rouge_scorer = Rouge()

r = df_results.iloc[10]
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [None]:
rouge_1_f1 = []

for record in tqdm(results_gpt4o):
    rouge_scores = scores = rouge_scorer.get_scores(record['answer_llm'], record['answer_orig'])[0]
    rouge_1_f1.append(rouge_scores['rouge-1']['f'])

  0%|          | 0/1830 [00:00<?, ?it/s]

In [None]:
df_results['rouge_1_f1'] = rouge_1_f1
df_results['rouge_1_f1'].describe()