In [1]:
# initialize client
from qdrant_client import QdrantClient, models
import os
from dotenv import load_dotenv


load_dotenv()

True

In [2]:
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)


In [3]:
# Create Collection
collection_name = "test_collection"

# delete collection
client.delete_collection(collection_name=collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(size=4, distance=models.Distance.COSINE),
)

# Create payload index right after creating the collection and before uploading any data to enable filtering.
# If you add it later, HNSW won't rebuild automatically—bump ef_construct (e.g., 100→101) to trigger a safe rebuild.

client.create_payload_index(
    collection_name=collection_name,
    field_name="category",
    field_schema=models.PayloadSchemaType.KEYWORD,
)



UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [4]:
# insert points
# we use dummy embeddings here
points = [
    models.PointStruct(
        id=1,
        vector=[0.9, 0.1, 0.1, 0.8], # High affordability, high innovation
        payload={"name": "Budget Smartphone", "category": "Electronics", "price": 299},
    ),
    models.PointStruct(
        id=2,
        vector=[0.2, 0.9, 0.8, 0.5],
        payload={"name": "Bestselling Novel", "category": "Books", "price": 19},
    ),
    models.PointStruct(
        id=3,
        vector=[0.8, 0.3, 0.2, 0.9],
        payload={"name": "Smart Home Hub", "category": "electronics", "price":89},
    ),
    models.PointStruct(
        id=4,
        vector=[0.3, 0.7, 0.9, 0.4],
        payload={"name": "Cookbook", "category": "Books", "price": 29},
    ),
    models.PointStruct(
        id=5,
        vector=[0.6, 0.4, 0.3, 0.7],
        payload={"name": "Wireless Earbuds", "category": "Electronics", "price": 149},
    ),
    models.PointStruct(
        id=6,
        vector=[0.4, 0.8, 0.7, 0.6],
        payload={"name": "Science Fiction Novel", "category": "Books", "price": 24},
    ),
    models.PointStruct(
        id=7,
        vector=[0.7, 0.2, 0.4, 0.8],
        payload={"name": "4K Action Camera", "category": "Electronics", "price": 399},
    ),
    models.PointStruct(
        id=8,
        vector=[0.1, 0.9, 0.6, 0.5],
        payload={"name": "Historical Fiction", "category": "Books", "price": 22},
    ),  
    models.PointStruct(
        id=9,
        vector=[0.5, 0.5, 0.5, 0.5],
        payload={"name": "E-Reader", "category": "Electronics", "price": 129},
    ),
    models.PointStruct(
        id=10,
        vector=[0.2, 0.8, 0.7, 0.6],
        payload={"name": "Mystery Novel", "category": "Books", "price": 18},
    ),
        ]
client.upsert(collection_name=collection_name, points=points)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [6]:
# Define a query vector for affordable and innovative products
query_vector = [0.85, 0.2, 0.1, 0.9]

# Basic similarity search
basic_results = client.query_points(
    collection_name=collection_name,
    query=query_vector
)

# Filtered similarity search: Electronics
filtered_results = client.query_points(
    collection_name=collection_name,
    query=query_vector,
    query_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="category",
                match=models.MatchValue(value="Electronics")
            )
        ]
    )
)
print("Basic Similarity Search Results:", basic_results)
print("Filtered Similarity Search Results (Electronics):", filtered_results) 


                
                

Basic Similarity Search Results: points=[ScoredPoint(id=1, version=3, score=0.9933038, payload={'name': 'Budget Smartphone', 'category': 'Electronics', 'price': 299}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=3, version=3, score=0.9928858, payload={'name': 'Smart Home Hub', 'category': 'electronics', 'price': 89}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=7, version=3, score=0.96156037, payload={'name': '4K Action Camera', 'category': 'Electronics', 'price': 399}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=5, version=3, score=0.9474185, payload={'name': 'Wireless Earbuds', 'category': 'Electronics', 'price': 149}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=9, version=3, score=0.81480193, payload={'name': 'E-Reader', 'category': 'Electronics', 'price': 129}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=6, version=3, score=0.6869248, payload={'name': 'Science Fiction Novel', 'category': 'B

In [None]:
text = "My name is jekwu, the lord of all that walks and crawls"
text.split()

['My',
 'name',
 'is',
 'jekwu,',
 'the',
 'lord',
 'of',
 'all',
 'that',
 'walks',
 'and',
 'crawls']

In [11]:
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /home/rand/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [18]:
def sentence_chunk(text, max_words=150):
    sentences = sent_tokenize(text)
    print(sentences)
    chunks, buffer, length = [], [], 0

    for sent in sentences:
        count = len(sent.split())
        if length + count > max_words:
            chunks.append(" ".join(buffer))
            buffer, length = [], 0

        buffer.append(sent)
        length += count
    if buffer:
        chunks.append(" ".join(buffer))
    return chunks

In [19]:
text = """

In this tutorial, you will build a mechanism that recommends movies based on defined preferences. Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings. They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques. In this specific case, we will use sparse vectors to create an efficient and accurate recommendation system.

Privacy and Sovereignty: Since preference data is proprietary, it should be stored in a secure and controlled environment. Our vector database can easily be hosted on OVHcloud, our trusted Qdrant Hybrid Cloud partner. This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud’s interface. Both products have been tested for compatibility and scalability, and we recommend their managed Kubernetes service.
Methodology: We’re adopting a collaborative filtering approach to construct a recommendation system from the dataset provided. Collaborative filtering works on the premise that if two users share similar tastes, they’re likely to enjoy similar movies. Leveraging this concept, we’ll identify users whose ratings align closely with ours, and explore the movies they liked but we haven’t seen yet. To do this, we’ll represent each user’s ratings as a vector in a high-dimensional, sparse space. Using Qdrant, we’ll index these vectors and search for users whose ratings vectors closely match ours. Ultimately, we will see which movies were enjoyed by users similar to us.
Service Managed Kubernetes, powered by OVH Public Cloud Instances, a leading European cloud provider. With OVHcloud Load Balancers and disks built in. OVHcloud Managed Kubernetes provides high availability, compliance, and CNCF conformance, allowing you to focus on your containerized software layers with total reversibility.
Sparse vectors can use advantage of negative values, so we can normalize ratings to have a mean of 0 and a standard deviation of 1. This normalization ensures that ratings are consistent and centered around zero, enabling accurate similarity calculations. In this scenario we can take into account movies that we don’t like.
"""

print(sentence_chunk(text))

['\n\nIn this tutorial, you will build a mechanism that recommends movies based on defined preferences.', 'Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings.', 'They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques.', 'In this specific case, we will use sparse vectors to create an efficient and accurate recommendation system.', 'Privacy and Sovereignty: Since preference data is proprietary, it should be stored in a secure and controlled environment.', 'Our vector database can easily be hosted on OVHcloud, our trusted Qdrant Hybrid Cloud partner.', 'This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud’s interface.', 'Both products have been tested for compatibility and scalability, and we recommend their managed Kubernetes service.', 'Methodology: We’re adopting a collaborative filtering a

In [24]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)

chunks = splitter.split_text(text)
print(chunks)

['In this tutorial, you will build a mechanism that recommends movies based on defined preferences. Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings. They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques. In this specific case, we will use sparse vectors to create an efficient and accurate recommendation system.', 'Privacy and Sovereignty: Since preference data is proprietary, it should be stored in a secure and controlled environment. Our vector database can easily be hosted on OVHcloud, our trusted Qdrant Hybrid Cloud partner. This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud’s interface. Both products have been tested for compatibility and scalability, and we recommend their managed Kubernetes service.', 'Methodology: We’re adopting a collaborative filtering approach to construct a

In [27]:
from sentence_transformers import SentenceTransformer
import numpy as np

def semantic_chunking(text, similarity_threshold=0.5):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    sentences = text.split('.')
    embeddings = model.encode(sentences)

    chunks = []
    current_chunk = []

    for i in range(1, len(sentences)):
        # Calculate cosine similarity between consecutive sentences
        similarity = np.dot(embeddings[i-1], embeddings[i]) / (np.linalg.norm(embeddings[i-1]) * np.linalg.norm(embeddings[i]))
        if similarity < similarity_threshold:
            chunks.append('. '.join(current_chunk))
            current_chunk = [sentences[i]]
        else:
            current_chunk.append(sentences[i])

    chunks.append('. '.join(current_chunk))
    return chunks

In [28]:
semantic_chunking(text)

['',
 ' Vector databases like Qdrant are good for storing high-dimensional data, such as user and item embeddings',
 ' They can enable personalized recommendations by quickly retrieving similar entries based on advanced indexing techniques.  In this specific case, we will use sparse vectors to create an efficient and accurate recommendation system',
 '\n\nPrivacy and Sovereignty: Since preference data is proprietary, it should be stored in a secure and controlled environment',
 ' Our vector database can easily be hosted on OVHcloud, our trusted Qdrant Hybrid Cloud partner.  This means that Qdrant can be run from your OVHcloud region, but the database itself can still be managed from within Qdrant Cloud’s interface',
 ' Both products have been tested for compatibility and scalability, and we recommend their managed Kubernetes service',
 '\nMethodology: We’re adopting a collaborative filtering approach to construct a recommendation system from the dataset provided.  Collaborative filteri