In [None]:
# Imports
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
import pandas as pd
import numpy as np
import json
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client.models import Filter

In [25]:
# Create a model to generate embeddings for the text in the dataframe
model = SentenceTransformer(
    model_name_or_path="all-MiniLM-L12-v2", device="cpu",
    similarity_fn_name="cosine"
)  # device="cpu" or device="cuda" for GPU

In [None]:
# Load data Startup Demo Dataset
# df = pd.read_json("https://storage.googleapis.com/generall-shared-data/startups_demo.json", lines=True)
# df = df.sample(2000)
# df.to_json("stt.json", orient="records", lines=True)

In [40]:
# Load data Health News
df = pd.read_json("healthnews.json", lines=True)
df.reset_index(drop=True, inplace=True)
df["published_year"] = df["Date"].dt.year
df.drop(columns=["Date"], inplace=True)
df.to_json("healthnews_yr.json", orient="records", lines=True)
df.sample(3)

Unnamed: 0,News,published_year
66,Jonathan Nuñez had a liver transplant in which...,2014
1430,"Cancer Risk Doubles After Organ Transplant, S...",2011
1014,"Well: Life, Interrupted: A Golden Opportunity ...",2013


In [5]:
# Encode the text: embeddings
vectors = model.encode(
    df["News"].tolist(),
    show_progress_bar=True,
)

Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Batches: 100%|██████████| 63/63 [00:13<00:00,  4.79it/s]


In [26]:
# Startups dataset: Encode the text: embeddings
# vectors = model.encode(
#     [row.alt + ". " + row.description for row in df.itertuples()],
#     show_progress_bar=True,
# )

Batches: 100%|██████████| 63/63 [00:25<00:00,  2.49it/s]


All of the descriptions are now converted into vectors. There are 2000 vectors of 384 dimensions. The output layer of the model has this dimension

In [7]:
# View first row of the dataframe
print('Dataframe:', df["News"][1])
print('---')
print('Vector:', vectors[0][0:30])
print('---')
print(vectors.shape)

Dataframe: Well: Savory and Sweet Whole Wheat Focaccia http://nyti.ms/1oXFmKS
---
Vector: [ 0.081046   -0.01129879  0.05162172  0.03999408  0.02012923  0.00079972
 -0.02407607  0.05108334 -0.04262673 -0.02786539 -0.02297052 -0.02726727
 -0.00324672  0.05730832 -0.05912046 -0.02010428  0.0059504  -0.04927275
 -0.05108761  0.0612146   0.04456822  0.00351208  0.08726088  0.00944512
  0.01153558 -0.1233921   0.00166522 -0.04746857 -0.0656888   0.00487615]
---
(2000, 384)


In [8]:
# Save the vectors
np.save("healthnews_vectors.npy", vectors, allow_pickle=False)

There are 3 modes of use:
* Memory mode
* Local host
* Cloud host

In [66]:
# LOCAL HOST MODE: Open Qdrant client
# qdrant_client = QdrantClient("http://localhost:6333")

# MEMORY MAPPED MODE: Open Qdrant client
qdrant_client = QdrantClient(":memory:")

# from qdrant_client import QdrantClient

# qdrant_client = QdrantClient(
#     url="https://xxxxxx-xxxxx-xxxxx-xxxx-xxxxxxxxx.us-east.aws.cloud.qdrant.io:6333",
#     api_key="<your-api-key>",
# )

In [91]:
# Creating a collection
if not qdrant_client.collection_exists("health_news"):
    qdrant_client.create_collection(
        collection_name="health_news",
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

In [92]:
qdrant_client.collection_exists("health_news")

True

In [81]:
# There are two ways to know the embedding dimension of the model
model.get_sentence_embedding_dimension()

# or

vectors.shape[1]

384

In [93]:
fd = open("healthnews_yr.json")

# payload is now an iterator over startup data
payload = map(json.loads, fd)

# Load all vectors into memory, numpy array works as iterable for itself.
# Other option would be to use Mmap, if you don't want to load all data into RAM
vectors = np.load("healthnews_vectors.npy")

In [94]:
# Upload data to Qdrant
qdrant_client.upload_collection(
    collection_name="health_news",
    vectors=vectors,
    payload=payload,
    ids=None,  # Vector ids will be assigned automatically
    batch_size=384,  # How many vectors will be uploaded in a single request?
)

In [95]:
qdrant_client.get_collection("health_news")

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=None, indexed_vectors_count=0, points_count=2000, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=384, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None, datatype=None, multivector_config=None), shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0

In order to process incoming requests, neural search will need 2 things: 1) a model to convert the query into a vector and 2) the Qdrant client to perform search queries.

In [None]:
class NeuralSearcher:
    def __init__(self, collection_name):
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L12-v2", device="cpu", similarity_fn_name="cosine")
        # initialize Qdrant client
        self.qdrant_client = qdrant_client


    def search(self, text: str):
        # Convert text query into vector
        vector = self.model.encode(text).tolist()
    
        # Use `vector` for search for closest vectors in the collection
        search_result = self.qdrant_client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=None,  # If you don't want any filters for now
            limit=3,  # 3 the most closest results is enough
        ).points
        # `search_result` contains found vector ids with similarity scores along with the stored payload
        # Hit payload and scores
        payloads = [(hit.payload['News'], hit.score) for hit in search_result]
        return payloads

With Qdrant it is also feasible to add some conditions to the search. For example, if you wanted to search for startups in a certain city, the search query could look like this:

In [None]:
# Searching with Filter

class NeuralSearcher:
    def __init__(self, collection_name, qdrant_client):
        self.collection_name = collection_name
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L12-v2", device="cpu")
        # initialize Qdrant client
        self.qdrant_client = qdrant_client


    def search(self, text: str, published_year: int):
        # Convert text query into vector
        vector = self.model.encode(text).tolist()

        published_year = published_year
        # Define a filter for cities
        year_filter = Filter(**{
            "must": [{
                "key": "published_year", # Store the information in a field of the same name 
                "match": { # This condition checks if payload field has the requested value
                    "value": published_year }
            }]
        })
        
        search_result = self.qdrant_client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=year_filter,
            limit=3
        ).points
        
        # `search_result` contains found vector ids with similarity scores along with the stored payload
        # In this function you are interested in payload only
        payloads = [f"> Score: {round(hit.score,4)}, |> Text: {hit.payload['News']}, |> Year: { hit.payload['published_year']})" for hit in search_result]

        return payloads


### Search

In [111]:
%%time
# Instantiate NeuralSearcher
searcher = NeuralSearcher(collection_name="health_news",
                           qdrant_client=qdrant_client)

# Query
q = "Quick Workout"
searcher.search(text=q,
                published_year=2014)

CPU times: total: 516 ms
Wall time: 850 ms


['> Score: 0.6032, |> Text: Well: The Advanced 7-Minute Workout http://nyti.ms/10rsibq, |> Year: 2014)',
 '> Score: 0.5792, |> Text: Download the 7-Minute Workout app from atnytimeswell for iPhone and Android http://nyti.ms/ZQrfkz http://pbs.twimg.com/media/B0vj0v0CQAAl0dp.jpg, |> Year: 2014)',
 '> Score: 0.48, |> Text: Instead of a large all-out workout, have you tried exercising in snack-size portions?  http://nyti.ms/1jtdqKX, |> Year: 2014)']