# Intro to Qdrant

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Third-party imports
import numpy as np
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: rows, columns and per-cell text width
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress every warning category for the rest of the session
warnings.filterwarnings("ignore")

# Black code formatter (Optional): loads the `lab_black` IPython extension.
# This line can be removed if the extension is not installed.
%load_ext lab_black

# Auto reload imports: load the `autoreload` extension and switch it to mode 2
# so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Import Dependencies

In [2]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [3]:
# Checkpoint name of the sentence-transformers model used for all embeddings
ENCODER_CHECKPOINT: str = "all-MiniLM-L6-v2"

# Instantiate the embedding model once; later cells call `encoder.encode(...)`
encoder = SentenceTransformer(model_name_or_path=ENCODER_CHECKPOINT)

In [4]:
# Toy dataset: one dict per book with its name, description, author and year.
# Each entry is later embedded via its "description" and stored whole as the
# point payload, so the keys below are also the filterable payload fields.
documents: list[dict[str, Any]] = [
    {
        "name": "The Time Machine",
        "description": "A man travels through time and witnesses the evolution of humanity.",
        "author": "H.G. Wells",
        "year": 1895,
    },
    {
        "name": "Ender's Game",
        "description": "A young boy is trained to become a military leader in a war against an alien race.",
        "author": "Orson Scott Card",
        "year": 1985,
    },
    {
        "name": "Brave New World",
        "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",
        "author": "Aldous Huxley",
        "year": 1932,
    },
    {
        "name": "The Hitchhiker's Guide to the Galaxy",
        "description": "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",
        "author": "Douglas Adams",
        "year": 1979,
    },
    {
        "name": "Dune",
        "description": "A desert planet is the site of political intrigue and power struggles.",
        "author": "Frank Herbert",
        "year": 1965,
    },
    {
        "name": "Foundation",
        "description": "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",
        "author": "Isaac Asimov",
        "year": 1951,
    },
    {
        "name": "Snow Crash",
        "description": "A futuristic world where the internet has evolved into a virtual reality metaverse.",
        "author": "Neal Stephenson",
        "year": 1992,
    },
    {
        "name": "Neuromancer",
        "description": "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",
        "author": "William Gibson",
        "year": 1984,
    },
    {
        "name": "The War of the Worlds",
        "description": "A Martian invasion of Earth throws humanity into chaos.",
        "author": "H.G. Wells",
        "year": 1898,
    },
    {
        "name": "The Hunger Games",
        "description": "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",
        "author": "Suzanne Collins",
        "year": 2008,
    },
    {
        "name": "The Andromeda Strain",
        "description": "A deadly virus from outer space threatens to wipe out humanity.",
        "author": "Michael Crichton",
        "year": 1969,
    },
    {
        "name": "The Left Hand of Darkness",
        "description": "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",
        "author": "Ursula K. Le Guin",
        "year": 1969,
    },
    {
        "name": "The Three-Body Problem",
        "description": "Humans encounter an alien civilization that lives in a dying system.",
        "author": "Liu Cixin",
        "year": 2008,
    },
]

In [5]:
# Storage location: a Qdrant server reachable over HTTP.
# (An in-process alternative is QdrantClient(":memory:").)
URL: str = "http://localhost:6333"
qdrant = QdrantClient(URL)
INDEX_NAME: str = "my_books"

# (Re)create the collection for the book embeddings.
# recreate_collection suits experimentation, where the cell is run repeatedly:
# it first tries to remove an existing collection with the same name.
qdrant.recreate_collection(
    collection_name=INDEX_NAME,
    vectors_config=models.VectorParams(
        # Cosine distance between vectors
        distance=models.Distance.COSINE,
        # Vector size is defined by used model
        size=encoder.get_sentence_embedding_dimension(),
    ),
)

True

In [6]:
# Build one record per book: the enumeration index is the point id, the
# embedded description is the vector, and the full dict is the payload.
book_records = [
    models.Record(
        id=idx,
        vector=encoder.encode(doc["description"]).tolist(),
        payload=doc,
    )
    for idx, doc in enumerate(documents)
]

# Upload data to collection
qdrant.upload_records(collection_name=INDEX_NAME, records=book_records)

### Query The Database

In [7]:
# Free-text query and the number of results to return
query: str = "alien invasion"
N: int = 3

# Embed the query with the same encoder that produced the stored vectors
query_vector: list[float] = encoder.encode(query).tolist()

# Nearest-neighbour search over the book collection
hits: list[Any] = qdrant.search(
    collection_name=INDEX_NAME, query_vector=query_vector, limit=N
)

# Show each match's payload together with its similarity score
for hit in hits:
    print(hit.payload, "score:", hit.score)

### Narrow down the query

- How about restricting the search to recent books, i.e. those published in 2000 or later?

In [8]:
# Same semantic query as before, restricted by a payload filter.
query: str = "alien invasion"
query_vector: list[float] = encoder.encode(query).tolist()

# Only keep points whose "year" payload field is >= 2000
query_filter: models.Filter = models.Filter(
    must=[models.FieldCondition(key="year", range=models.Range(gte=2000))]
)


hits: list[Any] = qdrant.search(
    # Use the collection-name constant instead of repeating the literal, so this
    # cell queries the same collection the records were uploaded to.
    collection_name=INDEX_NAME,
    query_vector=query_vector,
    query_filter=query_filter,
    limit=N,
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

# Neural Search Service

In [9]:
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
from tqdm.notebook import tqdm

In [10]:
# Paths: data directory relative to this notebook, and the raw startups file
data_fp: Path = Path("../../data")
fp: Path = data_fp / "startups_demo.json"

# Map the split name to the file that backs it
data_files: dict[str, Any] = {"train": str(fp)}

# Loading with a split -> file mapping returns a DatasetDict keyed by split
# name (here only "train"), not a bare Dataset.
startup_data: DatasetDict = load_dataset(path="json", data_files=data_files)
startup_data

DatasetDict({
    train: Dataset({
        features: ['name', 'images', 'alt', 'description', 'link', 'city'],
        num_rows: 40474
    })
})

In [11]:
# Peek at the first three rows of the "train" split
train_head = startup_data.get("train")[:3]
print(train_head)

In [12]:
# Embedding model for the startup descriptions (same checkpoint name as the
# book example earlier in this notebook; the encoder is instantiated again here)
ENCODER_CHECKPOINT: str = "all-MiniLM-L6-v2"

encoder = SentenceTransformer(model_name_or_path=ENCODER_CHECKPOINT)

In [13]:
# Embed `query` (assigned in an earlier cell) and display the vector as a plain list
query_embedding = encoder.encode(query)
query_embedding.tolist()

[0.008490558713674545,
 0.0070844413712620735,
 0.012432715855538845,
 0.016528213396668434,
 0.018315991386771202,
 -0.05017716437578201,
 -0.01185440830886364,
 -0.036697763949632645,
 0.015199929475784302,
 -0.011771843768656254,
 0.02491699904203415,
 -0.03999173268675804,
 0.027960997074842453,
 0.010655614547431469,
 -0.02657207101583481,
 0.035112686455249786,
 0.014779333956539631,
 -0.07972410321235657,
 -0.021407857537269592,
 -0.004412065725773573,
 -0.08960099518299103,
 0.05520902946591377,
 0.011839141137897968,
 0.03228887543082237,
 -0.01812012866139412,
 0.021024376153945923,
 0.021151550114154816,
 0.05961310490965843,
 -0.05800142511725426,
 -0.07097145169973373,
 -0.0015109328087419271,
 0.08469453454017639,
 -0.005802256055176258,
 0.0498209185898304,
 -0.006269002798944712,
 -0.027293283492326736,
 0.02292904444038868,
 -0.09682813286781311,
 0.06549759209156036,
 -0.009998810477554798,
 -0.022183040156960487,
 -0.09672120958566666,
 0.05387067049741745,
 0.004859

In [14]:
def encode_data(example: dict[str, Any]) -> dict[str, Any]:
    """Embed a single record's description with the encoding model.

    The embedding is stored on the record under the "vector" key; the record
    is modified in place and the same object is returned.

    NOTE: a later cell defines another function with this same name, which
    replaces this one once that cell has run.
    """
    description = example.get("description")
    embedding = encoder.encode(description)
    example["vector"] = embedding.tolist()
    return example

In [15]:
# Encode the description
def encode_data(example: dict[str, Any]) -> dict[str, Any]:
    """Embed a batch of descriptions with the encoding model.

    Batched version, intended for ``map(..., batched=True)``: ``example`` is a
    batch whose "description" entry is a sequence of strings.

    All descriptions are passed to the encoder in a single call, so the model
    can process them as a batch instead of one string at a time.

    Args:
        example: Batch mapping that contains a "description" sequence.

    Returns:
        A dict with the single key "vector", holding one embedding (a list of
        floats) per description, in the same order as the input.

    Raises:
        TypeError: If the batch has no "description" entry.
    """
    # list(...) keeps the original failure mode (TypeError) for a missing key
    # and guarantees the encoder receives a plain list of sentences.
    descriptions = list(example.get("description"))

    # Nothing to encode; skip the model call entirely
    if not descriptions:
        return {"vector": []}

    return {"vector": encoder.encode(descriptions).tolist()}

In [16]:
# Small fixed sample for a quick trial run: shuffle the "train" split with
# seed 123, then keep the first 10 rows.
A: Dataset = (
    startup_data.get("train")
    .shuffle(seed=123)
    .select(range(10))
)

A

Dataset({
    features: ['name', 'images', 'alt', 'description', 'link', 'city'],
    num_rows: 10
})

In [17]:
# Test the function on the small sample.
# Note that `A` is rebound to the mapped result.
A = A.map(function=encode_data, batched=True)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [18]:
# Guard for the expensive embedding step: set to True to (re)compute embeddings
RUN_CELL: bool = False

if RUN_CELL:
    # Add embeddings to the data. `startup_data` is a DatasetDict, so the
    # mapped result is a DatasetDict as well (one entry per split).
    startup_data_emb: DatasetDict = startup_data.map(encode_data, batched=True)

    # A bare expression inside an `if` block is not rendered as cell output,
    # so print the summary explicitly.
    print(startup_data_emb)

### Save a Regular `Dataset` Object

```python
startup_data_emb.to_json(f"{fp}.jsonl")
```

In [19]:
# Save data (only when the embedding step above was enabled)
if RUN_CELL:
    # Common path prefix for the per-split output files
    fp: Path = data_fp / "startup_data_enb"

    # startup_data_emb is a DatasetDict, so write each split to its own
    # JSON-lines file named "<prefix>-<split>.jsonl"
    for _split, _dataset in startup_data_emb.items():
        _out_path: str = f"{fp}-{_split}.jsonl"
        _dataset.to_json(_out_path)

<hr><br>

### Save The Vectors

- DataFrame Approach
  
```python
df: pd.DataFrame = pd.read_json("./startups_demo.json", lines=True)

vectors = encoder.encode(
    [row.alt + ". " + row.description for row in df.itertuples()],
    show_progress_bar=True,
)

print(vectors.shape)

# Save the computed vectors to a new file named startup_vectors.npy
np.save("startup_vectors.npy", vectors, allow_pickle=False)
```

In [21]:
from qdrant_client.models import VectorParams, Distance


# Storage location and the collection used for the startup embeddings
URL: str = "http://localhost:6333"
qdrant = QdrantClient(URL)
INDEX_NAME: str = "startups"
RUN_CELL: bool = True

if RUN_CELL:
    # Vector layout of the collection: cosine distance, with the
    # dimensionality taken from the encoder.
    _vectors_config = VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=Distance.COSINE,
    )

    # Create the collection.
    # NOTE(review): this calls create_collection, not recreate_collection, so
    # an existing "startups" collection is presumably not removed first -
    # confirm how re-running this cell behaves against a populated server.
    qdrant.create_collection(
        collection_name=INDEX_NAME,
        vectors_config=_vectors_config,
    )

### Load Data


In [22]:
from datasets import Dataset, DatasetDict, load_dataset

# JSON-lines file holding the "train" split with its precomputed embeddings
fp: Path = data_fp.joinpath("startup_data_enb-train.jsonl")

# Load it back through the generic JSON loader
emb_data: DatasetDict = load_dataset(path="json", data_files=str(fp))
emb_data

DatasetDict({
    train: Dataset({
        features: ['name', 'images', 'alt', 'description', 'link', 'city', 'vector'],
        num_rows: 40474
    })
})

In [23]:
# Inspect one embedded record: row index 10 of the "train" split
sample_row = emb_data.get("train")[10]
print(sample_row)

In [24]:
# Show rows 3 and 4 as dicts, leaving out the "vector" column
preview_rows = emb_data.get("train").select([3, 4]).remove_columns("vector")
print(preview_rows.to_list())

In [25]:
if RUN_CELL:
    _train_split = emb_data.get("train")

    # Rows reduced to the "vector" column: one {"vector": [...]} dict per row
    vectors_dict: list[dict[str, list[float]]] = _train_split.select_columns(
        "vector"
    ).to_list()

    # Unwrap the dicts into a plain list of embeddings
    vectors: list[list[float]] = [row.get("vector") for row in vectors_dict]

    # Everything except the embedding becomes the per-point payload
    payload: list[dict[str, Any]] = _train_split.remove_columns("vector").to_list()

In [26]:
if RUN_CELL:
    BATCH_SIZE: int = 256  # Number of vectors that will be uploaded in a single request

    # Send the embeddings and their payloads to the collection in batches
    qdrant.upload_collection(
        collection_name=INDEX_NAME,
        ids=None,  # Vector ids will be assigned automatically
        vectors=vectors,
        payload=payload,
        batch_size=BATCH_SIZE,
    )