In [2]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

# URL of the Qdrant server this notebook talks to.
qdrant_url = "http://localhost:6333"
client = QdrantClient(qdrant_url)

# Displaying the existing collections also confirms the server is reachable.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='zoomcamp-sparse-dense'), CollectionDescription(name='zoomcamp-sparse'), CollectionDescription(name='zoomcamp-faq')])

#### BM25

In [3]:
import requests

# FAQ dataset: a JSON list of courses, each with its own "documents" list.
docs_url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of letting it surface as a confusing
# JSON decoding error on the next line.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [4]:
# Peek at the first document of the first course to see the raw record layout.
documents_raw[0]["documents"][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [5]:
# Flatten the per-course structure into one list, tagging every record with
# the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_docs = course["documents"]
    for doc in course_docs:
        # The record is updated in place, so the entries inside documents_raw
        # gain the "course" key as well.
        doc.update(course=course["course"])
    documents.extend(course_docs)

In [6]:
# First flattened record; it now carries the "course" key added while flattening.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [18]:
collection_name = "zoomcamp-sparse"

# Sparse vector slot used for BM25; the IDF modifier is requested on the
# Qdrant side for this vector.
bm25_params = models.SparseVectorParams(modifier=models.Modifier.IDF)

client.create_collection(
    collection_name=collection_name,
    sparse_vectors_config={"bm25": bm25_params},
)

True

In [20]:
import uuid

# One point per FAQ record. Only the answer text is indexed under "bm25"; the
# question is kept out of the index so that searching by a record's question
# remains a meaningful retrieval check. The whole record is stored as payload.
#
# A comprehension is used so the loop variable does not leak into the notebook
# namespace and overwrite a `doc` chosen in another cell.
points = [
    models.PointStruct(
        # Random IDs: every run of this cell produces a fresh set of point IDs.
        id=uuid.uuid4().hex,
        vector={"bm25": models.Document(text=doc["text"], model="Qdrant/bm25")},
        payload=doc,
    )
    for doc in documents
]
    

In [22]:
# Write the prepared points into the collection named by `collection_name`.
client.upsert(collection_name=collection_name, points=points)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [55]:
def search(query, limit=1):
    """Run a BM25 sparse-vector search against the current collection.

    Args:
        query: Free-text query; it is wrapped in a ``Qdrant/bm25`` document.
        limit: Maximum number of points to return.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    bm25_query = models.Document(text=query, model="Qdrant/bm25")

    response = client.query_points(
        # Module-level name, looked up each time the function is called.
        collection_name=collection_name,
        query=bm25_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [45]:
# Take the single best BM25 hit and show its score next to its payload.
hits = search("How to run Kafka")
result = hits[0]
(result.score, result.payload)

(10.442513,
 {'text': 'If you get an error while running the command python3 stream.py worker\nRun pip uninstall kafka-python\nThen run pip install kafka-python==1.4.6\nWhat is the use of  Redpanda ?\nRedpanda: Redpanda is built on top of the Raft consensus algorithm and is designed as a high-performance, low-latency alternative to Kafka. It uses a log-centric architecture similar to Kafka but with different underlying principles.\nRedpanda is a powerful, yet simple, and cost-efficient streaming data platform that is compatible with Kafka® APIs while eliminating Kafka complexity.',
  'section': 'Module 6: streaming with kafka',
  'question': 'Error while running python3 stream.py worker',
  'course': 'data-engineering-zoomcamp'})

In [61]:
import json
import random

# A dedicated, seeded generator makes the sampled document (and therefore the
# retrieval comparisons that use it) reproducible across kernel restarts.
# Change the seed to inspect a different document.
rng = random.Random(42)

course = rng.choice(documents_raw)
doc = rng.choice(course["documents"])
print(json.dumps(doc, indent=2))

{
  "text": "Q2 asks about correlation matrix and converting median_house_value from numeric to binary. Just to make sure here we are only dealing with df_train not df_train_full, right? As the question explicitly mentions the train dataset.\nYes. I think it is only on df_train. The reason behind this is that df_train_full also contains the validation dataset, so at this stage we don't want to make conclusions based on the validation data, since we want to test how we did without using that portion of the data.\nPastor Soto",
  "section": "3. Machine Learning for Classification",
  "question": "What data should we use for correlation matrix",
  "course": "machine-learning-zoomcamp"
}


In [62]:
# Query BM25 with the sampled document's question and compare the top hit's
# text with the text of the sampled document itself.
result = search(doc["question"])[0]
retrieved_text = result.payload["text"]
original_text = doc["text"]
print(f'retrieved:\n {retrieved_text}', end="\n\n")
print(f'original:\n {original_text}')

retrieved:
 The background of any dataframe can be colored (not only the correlation matrix) based on the numerical values the dataframe contains by using the method pandas.io.formats.style.Styler.background_graident.
Here an example on how to color the correlation matrix. A color map of choice can get passed, here ‘viridis’ is used.
# ensure to have only numerical values in the dataframe before calling 'corr'
corr_mat = df_numerical_only.corr()
corr_mat.style.background_gradient(cmap='viridis')
Here is an example of how the coloring will look like using a dataframe containing random values and applying “background_gradient” to it.
np.random.seed = 3
df_random = pd.DataFrame(data=np.random.random(3*3).reshape(3,3))
df_random.style.background_gradient(cmap='viridis')
Added by Sylvia Schmitt

original:
 Q2 asks about correlation matrix and converting median_house_value from numeric to binary. Just to make sure here we are only dealing with df_train not df_train_full, right? As the questi

#### Prefetching

In [1]:
# Collection that stores both a dense and a sparse vector per point.
# NOTE: the helper functions in this notebook read `collection_name` when they
# are called, so rebinding it here also redirects the earlier `search` helper.
collection_name = "zoomcamp-sparse-dense"
# Passed as the `size` of the dense "jina-small" vector when the collection is created.
embedding_dim = 512
# Embedding model used for the dense "jina-small" vectors and dense queries.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [9]:
# Dense slot: cosine-distance vectors of size `embedding_dim`.
dense_config = {
    "jina-small": models.VectorParams(
        size=embedding_dim,
        distance=models.Distance.COSINE,
    ),
}
# Sparse slot: BM25 vectors with the IDF modifier.
sparse_config = {
    "bm25": models.SparseVectorParams(modifier=models.Modifier.IDF),
}

client.create_collection(
    collection_name=collection_name,
    vectors_config=dense_config,
    sparse_vectors_config=sparse_config,
)

True

In [10]:
import uuid

# One point per FAQ record, carrying two named vectors built from the answer
# text: a dense embedding ("jina-small") and a sparse BM25 vector ("bm25").
# The question is deliberately not indexed, so querying by a record's question
# remains a meaningful retrieval check. The whole record is stored as payload.
#
# A comprehension is used so the loop variable does not leak into the notebook
# namespace: a plain `for doc in documents` loop would overwrite the `doc`
# sampled earlier, and the comparison cells below would then run against the
# last document instead of the sampled one.
points = [
    models.PointStruct(
        # Random IDs: every run of this cell produces a fresh set of point IDs.
        id=uuid.uuid4().hex,
        vector={
            "jina-small": models.Document(text=doc["text"], model=model_handle),
            "bm25": models.Document(text=doc["text"], model="Qdrant/bm25"),
        },
        payload=doc,
    )
    for doc in documents
]
    

In [11]:
# Write the prepared dense+sparse points into the collection named by `collection_name`.
client.upsert(collection_name=collection_name, points=points)

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]



UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [65]:
def multi_stage_search(query, limit=1):
    """Two-stage search: dense prefetch followed by a BM25 query.

    A prefetch on the "jina-small" vector requests ``limit * 10`` candidates;
    the main query then runs on the "bm25" vector with that prefetch attached.

    Args:
        query: Free-text query used for both the dense and the BM25 stage.
        limit: Maximum number of points to return from the final stage.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    dense_candidates = models.Prefetch(
        query=models.Document(text=query, model=model_handle),
        using="jina-small",
        limit=(limit * 10),
    )

    response = client.query_points(
        collection_name=collection_name,
        prefetch=[dense_candidates],
        query=models.Document(text=query, model="Qdrant/bm25"),
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [67]:
# Query the two-stage search with the sampled document's question and compare
# the top hit's text with the text of the sampled document itself.
result = multi_stage_search(doc["question"])[0]
retrieved_text = result.payload["text"]
original_text = doc["text"]
print(f'retrieved:\n {retrieved_text}', end="\n\n")
print(f'original:\n {original_text}')

retrieved:
 The background of any dataframe can be colored (not only the correlation matrix) based on the numerical values the dataframe contains by using the method pandas.io.formats.style.Styler.background_graident.
Here an example on how to color the correlation matrix. A color map of choice can get passed, here ‘viridis’ is used.
# ensure to have only numerical values in the dataframe before calling 'corr'
corr_mat = df_numerical_only.corr()
corr_mat.style.background_gradient(cmap='viridis')
Here is an example of how the coloring will look like using a dataframe containing random values and applying “background_gradient” to it.
np.random.seed = 3
df_random = pd.DataFrame(data=np.random.random(3*3).reshape(3,3))
df_random.style.background_gradient(cmap='viridis')
Added by Sylvia Schmitt

original:
 Q2 asks about correlation matrix and converting median_house_value from numeric to binary. Just to make sure here we are only dealing with df_train not df_train_full, right? As the questi

#### Hybrid Search

In [68]:
def rrf_search(query, limit=1):
    """Hybrid search that fuses dense and BM25 results with Reciprocal Rank Fusion.

    Two prefetches are issued, one on the "jina-small" vector and one on the
    "bm25" vector, each requesting ``limit * 5`` candidates; the main query
    combines them with ``Fusion.RRF``.

    Args:
        query: Free-text query used for both prefetch branches.
        limit: Maximum number of fused points to return.

    Returns:
        The ``points`` attribute of the query response, payloads included.
    """
    candidate_limit = (limit * 5)

    # (embedding model, named vector) for each branch; the order matches the
    # order in which the prefetches are sent.
    branches = (
        (model_handle, "jina-small"),
        ("Qdrant/bm25", "bm25"),
    )
    prefetch = [
        models.Prefetch(
            query=models.Document(text=query, model=model_name),
            using=vector_name,
            limit=candidate_limit,
        )
        for model_name, vector_name in branches
    ]

    response = client.query_points(
        collection_name=collection_name,
        prefetch=prefetch,
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )
    return response.points

In [70]:
# Query the RRF hybrid search with the sampled document's question and compare
# the top hit's text with the text of the sampled document itself.
result = rrf_search(doc["question"])[0]
retrieved_text = result.payload["text"]
original_text = doc["text"]
print(f'retrieved:\n {retrieved_text}', end="\n\n")
print(f'original:\n {original_text}')

retrieved:
 Should correlation be calculated after splitting or before splitting. And lastly I know how to find the correlation but how do i find the two most correlated features.
Answer: Correlation matrix of your train dataset. Thus, after splitting. Two most correlated features are the ones having the highest correlation coefficient in terms of absolute values.

original:
 Q2 asks about correlation matrix and converting median_house_value from numeric to binary. Just to make sure here we are only dealing with df_train not df_train_full, right? As the question explicitly mentions the train dataset.
Yes. I think it is only on df_train. The reason behind this is that df_train_full also contains the validation dataset, so at this stage we don't want to make conclusions based on the validation data, since we want to test how we did without using that portion of the data.
Pastor Soto
