In [1]:
pip install -U minsearch qdrant_client

Collecting minsearch
  Downloading minsearch-0.0.4-py3-none-any.whl.metadata (8.1 kB)
Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Downloading minsearch-0.0.4-py3-none-any.whl (11 kB)
Downloading qdrant_client-1.15.0-py3-none-any.whl (337 kB)
Installing collected packages: minsearch, qdrant_client
  Attempting uninstall: minsearch
    Found existing installation: minsearch 0.0.2
    Uninstalling minsearch-0.0.2:
      Successfully uninstalled minsearch-0.0.2
  Attempting uninstall: qdrant_client
    Found existing installation: qdrant-client 1.14.3
    Uninstalling qdrant-client-1.14.3:
      Successfully uninstalled qdrant-client-1.14.3
Successfully installed minsearch-0.0.4 qdrant_client-1.15.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install

In [2]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
documents = requests.get(docs_url).json()

ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

### retrieval evaluation

In [73]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    
    return cnt / len(relevance_total)

def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
    # for q in ground_truth:
        # print(q)
    #     break
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        # 'relevance_total': relevance_total
    }

In [63]:
## set minsearch
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7dd03368a390>

In [74]:
def minsearch_search(q, boost = {'question': 1.5, 'section': 0.1}):

    results = index.search(
        query=q['question'],
        filter_dict={'course': q['course']},
        boost_dict=boost,
        num_results=5
    )

    return results

## Q1. Minsearch text

Now let's evaluate our usual minsearch approach, but tweak the parameters. Let's use the following boosting params:

```boost = {'question': 1.5, 'section': 0.1}```

What's the hitrate for this approach?

- 0.64
- 0.74
- 0.84
- 0.94

In [75]:
boost = {'question': 1.5, 'section': 0.1}

minisearch_evaluation_results = evaluate(ground_truth, minsearch_search)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:14<00:00, 315.35it/s]


In [76]:
minisearch_evaluation_results

{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Embeddings

In [77]:
from minsearch import VectorSearch

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [80]:
# embeddings for the "question" field:

texts = []

for doc in documents:
    t = doc['question']
    texts.append(t)

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

In [127]:
# embeddings for the "question" in ground_truth dataset:

ground_truth_questions = [None] * len(ground_truth)

for i, q in enumerate(ground_truth):
     ground_truth_questions[i] = q['question']

pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
Y = pipeline.fit_transform(ground_truth_questions)

for i, q in enumerate(ground_truth):
    q['vector_question'] = Y[i]

## Q2. Vector search for question

Now let's index these embeddings with minsearch:

```
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)
```

Evaluate this seach method. What's MRR for it?

- 0.25
- 0.35
- 0.45
- 0.55

In [128]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7dd030a0d3a0>

In [129]:
def minsearch_search_vector(q):
   
    results = vindex.search(
        query_vector=q['vector_question'],
        filter_dict={'course': q['course']},
        num_results=5
    )

    return results

In [130]:
minisearch_vector_evaluation_results = evaluate(ground_truth, minsearch_search_vector)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:03<00:00, 1261.62it/s]


In [131]:
minisearch_vector_evaluation_results

{'hit_rate': 0.014047979252215258, 'mrr': 0.006829479144153877}

## Q3. Vector search for question and answer

In [133]:
ground_truth[0]

{'question': 'When does the course begin?',
 'course': 'data-engineering-zoomcamp',
 'document': 'c02e79ef',
 'vector_question': array([ 1.48205339e-01,  5.94105488e-02, -5.95477382e-02, -1.32751999e-01,
        -4.15999152e-03,  2.02596703e-02,  2.14582013e-02, -1.02590879e-02,
         6.20179196e-02,  6.69182872e-02, -4.27196998e-02, -1.99597864e-01,
         1.58438991e-01, -1.23168138e-01,  1.68281992e-01, -3.57977343e-02,
         1.67460232e-01,  8.76489584e-02, -1.26572384e-02, -7.19601119e-02,
         2.49481472e-02, -1.62304123e-02, -3.19982042e-02,  1.49469925e-01,
         6.72919505e-02, -6.59383833e-02, -1.21652259e-01,  2.01153946e-02,
        -2.87345040e-02,  2.44779396e-02, -8.03484834e-04, -6.68915774e-03,
         4.24317489e-03,  5.83691687e-02,  1.55151520e-01,  7.60471098e-02,
         1.05027501e-01, -1.07203345e-01,  1.48006404e-02,  4.46888752e-02,
         5.26749491e-02,  3.25053031e-02,  3.50181671e-02, -4.69688168e-02,
        -5.68644234e-02,  1.61933951

In [132]:
test_result = minsearch_search_vector(ground_truth[0])
test_result

[{'text': 'Install SDKMAN:\ncurl -s "https://get.sdkman.io" | bash\nsource "$HOME/.sdkman/bin/sdkman-init.sh"\nUsing SDKMAN, install Java 11 and Spark 3.3.2:\nsdk install java 11.0.22-tem\nsdk install spark 3.3.2\nOpen a new terminal or run the following in the same shell:\nsource "$HOME/.sdkman/bin/sdkman-init.sh"\nVerify the locations and versions of Java and Spark that were installed:\necho $JAVA_HOME\njava -version\necho $SPARK_HOME\nspark-submit --version',
  'section': 'Module 5: pyspark',
  'question': 'Setting up Java and Spark (with PySpark) on Linux (Alternative option using SDKMAN)',
  'course': 'data-engineering-zoomcamp',
  'id': '1ac2c13c'},
 {'text': '✅I got it working using `gcs-connector-hadoop-2.2.5-shaded.jar` and Spark 3.1\nI also added the google_credentials.json and .p12 to auth with gcs. These files are downloadable from GCP Service account.\nTo create the SparkSession:\nspark = SparkSession.builder.master(\'local[*]\') \\\n.appName(\'spark-read-from-bigquery\') 