# Intro to Qdrant

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Display configuration: let pandas render up to 1,000 rows and 1,000 columns,
# and up to 600 characters per cell, before truncating.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Install a blanket "ignore" filter so warnings are not shown in cell outputs.
warnings.filterwarnings(action="ignore")

# Black code formatter (optional): loads the `lab_black` IPython extension.
# NOTE(review): presumably this line errors when `lab_black` is not installed —
# confirm, or drop it if formatting-on-run is not needed.
%load_ext lab_black

# Auto-reload imports: load the `autoreload` extension and switch it to mode 2
# so that edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Import Dependencies

In [2]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

In [3]:
# Checkpoint name handed to SentenceTransformer to build the text encoder.
ENCODER_CHECKPOINT: str = "all-MiniLM-L6-v2"

# Embedding model shared by the indexing and query cells below.
encoder: SentenceTransformer = SentenceTransformer(
    model_name_or_path=ENCODER_CHECKPOINT
)

In [4]:
# Toy dataset: one dict per science-fiction book. The "description" text is what
# gets embedded; the whole dict is stored alongside the vector as its payload.
# Each record has the keys "name", "description", "author" (str) and "year" (int).
documents: list[dict[str, Any]] = [
    {
        "name": "The Time Machine",
        "description": "A man travels through time and witnesses the evolution of humanity.",
        "author": "H.G. Wells",
        "year": 1895,
    },
    {
        "name": "Ender's Game",
        "description": "A young boy is trained to become a military leader in a war against an alien race.",
        "author": "Orson Scott Card",
        "year": 1985,
    },
    {
        "name": "Brave New World",
        "description": "A dystopian society where people are genetically engineered and conditioned to conform to a strict social hierarchy.",
        "author": "Aldous Huxley",
        "year": 1932,
    },
    {
        "name": "The Hitchhiker's Guide to the Galaxy",
        "description": "A comedic science fiction series following the misadventures of an unwitting human and his alien friend.",
        "author": "Douglas Adams",
        "year": 1979,
    },
    {
        "name": "Dune",
        "description": "A desert planet is the site of political intrigue and power struggles.",
        "author": "Frank Herbert",
        "year": 1965,
    },
    {
        "name": "Foundation",
        "description": "A mathematician develops a science to predict the future of humanity and works to save civilization from collapse.",
        "author": "Isaac Asimov",
        "year": 1951,
    },
    {
        "name": "Snow Crash",
        "description": "A futuristic world where the internet has evolved into a virtual reality metaverse.",
        "author": "Neal Stephenson",
        "year": 1992,
    },
    {
        "name": "Neuromancer",
        "description": "A hacker is hired to pull off a near-impossible hack and gets pulled into a web of intrigue.",
        "author": "William Gibson",
        "year": 1984,
    },
    {
        "name": "The War of the Worlds",
        "description": "A Martian invasion of Earth throws humanity into chaos.",
        "author": "H.G. Wells",
        "year": 1898,
    },
    {
        "name": "The Hunger Games",
        "description": "A dystopian society where teenagers are forced to fight to the death in a televised spectacle.",
        "author": "Suzanne Collins",
        "year": 2008,
    },
    {
        "name": "The Andromeda Strain",
        "description": "A deadly virus from outer space threatens to wipe out humanity.",
        "author": "Michael Crichton",
        "year": 1969,
    },
    {
        "name": "The Left Hand of Darkness",
        "description": "A human ambassador is sent to a planet where the inhabitants are genderless and can change gender at will.",
        "author": "Ursula K. Le Guin",
        "year": 1969,
    },
    {
        "name": "The Three-Body Problem",
        "description": "Humans encounter an alien civilization that lives in a dying system.",
        "author": "Liu Cixin",
        "year": 2008,
    },
]

In [5]:
# Storage location and collection name used by every cell below.
# In-memory alternative to the HTTP endpoint:
# qdrant = QdrantClient(":memory:")
URL: str = "http://localhost:6333"
INDEX_NAME: str = "my_books"
qdrant = QdrantClient(location=URL)

# Vector layout for the collection: the size is taken from the encoder so it
# always matches the model in use; similarity is measured with cosine distance.
book_vectors_config = models.VectorParams(
    size=encoder.get_sentence_embedding_dimension(),
    distance=models.Distance.COSINE,
)

# Create the collection.
# recreate_collection is convenient while experimenting and re-running the
# notebook: it first tries to remove an existing collection with the same name.
qdrant.recreate_collection(
    collection_name=INDEX_NAME,
    vectors_config=book_vectors_config,
)

True

In [6]:
# Embed every description in one batched encoder call instead of invoking the
# model once per document; row i of the result belongs to documents[i].
descriptions: list[str] = [doc["description"] for doc in documents]
vectors: list[list[float]] = encoder.encode(descriptions).tolist()

# Upload data to collection: the list position is used as the record id and the
# full document dict is stored as the payload next to its vector.
qdrant.upload_records(
    collection_name=INDEX_NAME,
    records=[
        models.Record(id=idx, vector=vector, payload=doc)
        for idx, (doc, vector) in enumerate(zip(documents, vectors))
    ],
)

### Query The Database

In [7]:
# Semantic search: embed the free-text query with the same encoder used for the
# documents, then ask for the N closest records.
N: int = 3
query: str = "alien invasion"
query_vector: list[float] = encoder.encode(query).tolist()

hits: list[Any] = qdrant.search(
    collection_name=INDEX_NAME, query_vector=query_vector, limit=N
)

# Print each match's stored payload together with its score.
for hit in hits:
    print(hit.payload, "score:", hit.score)

### Narrow down the query

- How about only the books published in 2000 or later? A filter on the `year` payload field restricts the search to them.

In [8]:
# Same semantic query as before, now combined with a payload filter that only
# admits records whose `year` is greater than or equal to 2000.
query: str = "alien invasion"
query_vector: list[float] = encoder.encode(query).tolist()
query_filter: models.Filter = models.Filter(
    must=[models.FieldCondition(key="year", range=models.Range(gte=2000))]
)

# Search the collection named by INDEX_NAME (not a hard-coded string) so this
# cell always targets the collection that was created and populated above.
hits: list[Any] = qdrant.search(
    collection_name=INDEX_NAME,
    query_vector=query_vector,
    query_filter=query_filter,
    limit=N,
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

# Neural Search Service

In [9]:
from pathlib import Path

from datasets import Dataset, DatasetDict, load_dataset
from tqdm.notebook import tqdm

In [10]:
# Location of the startups demo file, relative to this notebook.
data_fp: Path = Path("../../data")
fp: Path = data_fp / "startups_demo.json"
data_files: dict[str, Any] = {"train": str(fp)}

# No `split` argument is passed, so load_dataset returns a DatasetDict keyed by
# the names in `data_files` (here only "train") rather than a single Dataset.
startup_data: DatasetDict = load_dataset(path="json", data_files=data_files)
startup_data

DatasetDict({
    train: Dataset({
        features: ['name', 'images', 'alt', 'description', 'link', 'city'],
        num_rows: 40474
    })
})

In [11]:
# Preview the first three rows of the "train" split. Indexing the split directly
# means a missing split fails with a KeyError naming it, instead of `.get`
# returning None and the slice raising an unrelated TypeError.
print(startup_data["train"][:3])