# Lesson 2: Filtering With Metadata

In [None]:
import custom_utils

## Data Loading

In [None]:
from datasets import load_dataset
import pandas as pd

# Open the "train" split of the MongoDB/airbnb_embeddings dataset in streaming
# mode and keep only the first 100 records for this lesson.
datasets = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")
dataset = datasets.take(100)
# Materialise those records into a DataFrame and preview the first five rows.
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

In [None]:
# Show the DataFrame's column labels to see which fields each record carries.
print("Columns:", dataset_df.columns)

# Document Modelling

In [None]:
# Convert the DataFrame into the records that are inserted into MongoDB below.
# NOTE(review): the record shape and any validation are defined in
# custom_utils.process_records, which is not visible here — confirm there.
listings = custom_utils.process_records(dataset_df)

# Database Creation and Connection

In [None]:
# Obtain the database handle and the collection used by the rest of the notebook.
# NOTE(review): connection settings live in custom_utils.connect_to_database,
# which is not visible here.
db, collection = custom_utils.connect_to_database()

In [None]:
# An empty filter matches every document, so this empties the collection
# (presumably so that re-running the notebook does not insert duplicates).
collection.delete_many({})

# Data Ingestion

In [None]:
# Bulk-insert the processed listings into the collection, then confirm on stdout.
collection.insert_many(listings)
print("Data ingestion into MongoDB completed")

# Vector Search Index Definition

In [None]:
# Set up the vector search index on the collection through the project helper.
# NOTE(review): presumably this creates the "vector_index_text" index that
# vector_search uses by default — confirm in custom_utils.
custom_utils.setup_vector_search_index(collection=collection)


# Compose Vector Search Query

In [None]:
def vector_search(user_query, db, collection, additional_stages=None, vector_index="vector_index_text"):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    The query text is embedded with custom_utils.get_embedding and matched
    against the "text_embeddings" field through a $vectorSearch stage
    (150 candidates considered, at most 20 hits kept). The same pipeline is
    then sent through the "explain" command so the server-side elapsed time
    can be printed.

    Args:
    user_query (str): The user's query string.
    db (MongoClient.database): The database object.
    collection (MongoCollection): The MongoDB collection to search.
    additional_stages (list | None): Additional aggregation stages to append
        after $vectorSearch. None (the default) means no extra stages.
    vector_index (str): Name of the search index to query.

    Returns:
    list: A list of matching documents.
    str: An error message when no embedding could be produced for the query.
    """

    query_embeddings = custom_utils.get_embedding(user_query)

    if query_embeddings is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embeddings,
            "path": "text_embeddings",
            "numCandidates": 150,
            "limit": 20,
        }
    }
    # Build a fresh list on every call; a mutable default argument would be
    # shared between calls, and unpacking accepts any iterable of stages.
    pipeline = [vector_search_stage, *(additional_stages or [])]

    results = collection.aggregate(pipeline)

    # Re-submit the identical pipeline via "explain" purely to read timing
    # statistics; the documents themselves come from `results` above.
    explain_query_execution = db.command(
        'explain', {
            'aggregate': collection.name,
            'pipeline': pipeline,
            'cursor': {}
        },
        verbosity='executionStats')

    vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
    millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']

    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")

    return list(results)

# Handling User Query

In [None]:
from pydantic import BaseModel
from typing import Optional


class SearchResultItem(BaseModel):
    """Subset of a listing document that is shown to the user and the LLM.

    Only `name` and `address` are required; the remaining fields default to
    None so that documents lacking them (or storing null) still validate.
    """

    # Listing title.
    name: str
    # Guest capacity; optional.
    accommodates: Optional[int] = None
    # Bedroom count; optional.
    bedrooms: Optional[int] = None
    # Structured address model declared in custom_utils.
    address: custom_utils.Address
    # Free-text description of the space. Declared Optional so that the None
    # default matches the annotation and an explicit null value is accepted.
    space: Optional[str] = None

In [None]:
from IPython.display import display, HTML


def handle_user_query(query, db, collection, stages=None, vector_index="vector_index_text"):
    """
    Answer a user query with listings retrieved through vector search.

    Runs vector_search, validates every hit as a SearchResultItem, puts the
    hits in a DataFrame and asks the chat model to answer the query with that
    table as context. The question and the answer are printed and the table
    is rendered as HTML.

    Args:
    query (str): The user's question.
    db (MongoClient.database): The database object.
    collection (MongoCollection): The MongoDB collection to search.
    stages (list | None): Extra aggregation stages forwarded to vector_search.
        None (the default) means no extra stages.
    vector_index (str): Name of the search index to query.

    Returns:
    str: The model's answer when search results were found.
    tuple[str, str]: A (message, source note) pair when the search produced
        no results or could not be run.
    """
    # Normalise here so no list object is shared between calls and
    # vector_search always receives a real list.
    stages = [] if stages is None else stages
    get_knowledge = vector_search(query, db, collection, stages, vector_index)

    if not get_knowledge:
        return "No results found.", "No source information available."

    # vector_search reports an embedding failure as a plain string. Without
    # this guard the string would be iterated character by character and
    # SearchResultItem(**char) would raise a TypeError.
    if isinstance(get_knowledge, str):
        return get_knowledge, "No source information available."

    search_results_models = [
        SearchResultItem(**result)
        for result in get_knowledge
    ]

    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])

    completion = custom_utils.openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a airbnb listing recommendation system."},
            {
                "role": "user",
                "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"
            }
        ]
    )

    system_response = completion.choices[0].message.content

    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")

    display(HTML(search_results_df.to_html()))

    return system_response

# Adding A Post Filter to Vector Search (Match Operator)

In [None]:
import re

# Dotted path of the nested field the post-filter matches on.
search_path = "address.country"

# $match runs after $vectorSearch, so it only narrows the hits that the
# vector search has already returned.
# First condition: the country must match this regular expression.
match_stage = {"$match": {search_path: re.compile(r"United States")}}
# Second condition: guest capacity strictly between 1 and 5.
match_stage["$match"]["accommodates"] = {"$gt": 1, "$lt": 5}

additional_stages = [match_stage]

In [None]:
# Natural-language request answered with the post-filter ($match) stage applied
# to the vector search hits. The stray closing double quote that used to end
# the last sentence was removed so it is no longer sent as part of the prompt.
query = """
I want to stay in a place that's warm and friendly, 
and not too far from restaurants, can you recommend a place? 
Include a reason as to why you've chosen your selection
"""
handle_user_query(query, db, collection, additional_stages)

# Adding A PreFilter to Vector Search

In [None]:
from pymongo.operations import SearchIndexModel
import time 

vector_index_with_filter = "vector_index_with_filter"

# Besides the 1536-dimensional cosine embedding field, the index also covers
# the numeric "accommodates" and "bedrooms" fields so they can be referenced
# in a $vectorSearch pre-filter.
new_vector_search_index_model = SearchIndexModel(
    definition={
        "mappings": {
            "dynamic": True,
            "fields": {
                "text_embeddings": {
                    "dimensions": 1536,
                    "similarity": "cosine",
                    "type": "knnVector",
                },
                "accommodates": {
                    "type": "number"
                },
                "bedrooms": {
                    "type": "number"
                },
            },
        }
    },
    name=vector_index_with_filter,
)

# Create the new index
try:
    result = collection.create_search_index(model=new_vector_search_index_model)
    print("Creating index...")
    # The index is built asynchronously after the create call returns. Poll
    # its status until it reports "queryable" instead of sleeping a fixed
    # time and assuming success; give up after two minutes.
    wait_deadline = time.monotonic() + 120
    index_ready = False
    while not index_ready and time.monotonic() < wait_deadline:
        index_ready = any(
            index.get("queryable") for index in collection.list_search_indexes(result)
        )
        if not index_ready:
            time.sleep(5)
    if index_ready:
        print("New index created successfully:", result)
    else:
        print(f"Index {result} was requested but is not queryable yet; wait before querying it.")
except Exception as e:
    # Best-effort cell: report the failure (e.g. the index already exists)
    # without stopping the notebook.
    print(f"Error creating new vector search index: {str(e)}")

In [None]:
def vector_search(user_query, db, collection, additional_stages=None, vector_index="vector_index_text"):
    """
    Perform a pre-filtered vector search in the MongoDB collection.

    Same flow as a plain vector search, except that the $vectorSearch stage
    carries a filter so only listings with accommodates >= 2 and
    bedrooms <= 7 are considered. The pipeline is also sent through the
    "explain" command so the server-side elapsed time can be printed.

    Args:
    user_query (str): The user's query string.
    db (MongoClient.database): The database object.
    collection (MongoCollection): The MongoDB collection to search.
    additional_stages (list | None): Additional aggregation stages to append
        after $vectorSearch. None (the default) means no extra stages.
    vector_index (str): Name of the search index to query; it has to index
        the filtered fields.

    Returns:
    list: A list of matching documents.
    str: An error message when no embedding could be produced for the query.
    """
    query_embedding = custom_utils.get_embedding(user_query)
    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": "text_embeddings",
            "numCandidates": 150,
            "limit": 20,
            # Pre-filter: evaluated as part of the vector search itself,
            # not on its output like a later $match stage.
            "filter": {
                "$and": [
                    {"accommodates": {"$gte": 2}},
                    {"bedrooms": {"$lte": 7}}
                ]
            },
        }
    }
    # Build a fresh list on every call; a mutable default argument would be
    # shared between calls, and unpacking accepts any iterable of stages.
    pipeline = [vector_search_stage, *(additional_stages or [])]
    results = collection.aggregate(pipeline)

    # Re-submit the identical pipeline via "explain" purely to read timing
    # statistics; the documents themselves come from `results` above.
    explain_query_execution = db.command(
        'explain', {
            'aggregate': collection.name,
            'pipeline': pipeline,
            'cursor': {}
        },
        verbosity='executionStats')

    vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
    millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']

    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")
    return list(results)

In [None]:
# Same request as before, this time answered through the index that supports
# pre-filtering. The "resturants" misspelling and the stray closing double
# quote were removed from the prompt text.
query = """
I want to stay in a place that's warm and friendly, 
and not too far from restaurants, can you recommend a place? 
Include a reason as to why you've chosen your selection
"""
handle_user_query(
    query, 
    db, 
    collection, 
    vector_index=vector_index_with_filter
)