# Lesson 1: Vanilla vector search

In [None]:
import os
from dotenv import load_dotenv

# Load key=value pairs from a local .env file into the process environment so
# the os.getenv() lookups in the next cell can find the API key and MongoDB URI.
load_dotenv()

In [None]:
# Credentials come from the environment; each name is None when its variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
MONGO_URI = os.environ.get("MONGO_URI")

# 1.1 Data loading

In [None]:
from datasets import load_dataset
import pandas as pd

# Stream the dataset so nothing is downloaded up front, then cap it at the
# first 100 rows. take() returns a new dataset instead of modifying the
# existing one, so its result must be reassigned; otherwise pd.DataFrame()
# below would iterate the entire training split.
dataset = load_dataset("MongoDB/airbnb_embeddings", streaming=True, split="train")
dataset = dataset.take(100)

# Materialize the capped stream into a DataFrame and preview the first rows
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

In [None]:
# Show the DataFrame's column labels; these are the keys each record will carry
# once the frame is converted to dictionaries.
print("Columns:", dataset_df.columns)

# 1.2 Document modeling

In [None]:
from typing import List, Optional
from pydantic import BaseModel, ValidationError
from datetime import datetime

In [None]:
class Host(BaseModel):
    """Profile of a listing's host; nested under ``Listing.host``."""

    host_id: str
    host_url: str
    host_name: str
    host_location: str
    host_about: str
    # Optional: falls back to None when the value is missing
    host_response_time: Optional[str] = None
    host_thumbnail_url: str
    host_picture_url: str
    # Optional: falls back to None when the value is missing
    host_response_rate: Optional[int] = None
    host_is_superhost: bool
    host_has_profile_pic: bool
    host_identity_verified: bool

In [None]:
class Location(BaseModel):
    """Geographic position of a listing; nested under ``Address.location``."""

    # NOTE(review): the type/coordinates pair looks like a GeoJSON geometry — confirm
    type: str
    coordinates: List[float]
    is_location_exact: bool


class Address(BaseModel):
    """Postal and geographic address of a listing.

    Used as ``Listing.address`` and reused by ``SearchResultItem.address``.
    """

    street: str
    government_area: str
    market: str
    country: str
    country_code: str
    # Nested model carrying the coordinates
    location: Location

In [None]:
class Review(BaseModel):
    """A single guest review; ``Listing.reviews`` holds a list of these."""

    # NOTE(review): pydantic does not treat underscore-prefixed names as model
    # fields, so `_id` is likely neither validated nor included in .dict()
    # output — confirm against the installed pydantic version.
    _id: str
    date: Optional[datetime] = None
    listing_id: str
    reviewer_id: str
    # Reviewer name and comment text may be absent; both default to None
    reviewer_name: Optional[str] = None
    comments: Optional[str] = None

In [None]:
class Listing(BaseModel):
    """Schema for one Airbnb listing record, including its embedding vector.

    Each dataset record is validated by constructing ``Listing(**record)``;
    fields without a default are required, so a record missing one of them
    raises ``ValidationError``.
    """

    # NOTE(review): pydantic does not treat underscore-prefixed names as model
    # fields, so `_id` is likely neither validated nor included in .dict()
    # output — confirm against the installed pydantic version.
    _id: int
    listing_url: str
    name: str
    summary: str
    space: str
    description: str
    # Free-text sections that may be absent default to None
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None
    transit: Optional[str] = None
    access: str
    interaction: Optional[str] = None
    house_rules: str
    property_type: str
    room_type: str
    bed_type: str
    minimum_nights: int
    maximum_nights: int
    cancellation_policy: str
    # Timestamps are optional and default to None
    last_scraped: Optional[datetime] = None
    calendar_last_scraped: Optional[datetime] = None
    first_review: Optional[datetime] = None
    last_review: Optional[datetime] = None
    accommodates: int
    # The 0 default applies only when the key is missing from the record
    bedrooms: Optional[float] = 0
    beds: Optional[float] = 0
    number_of_reviews: int
    bathrooms: Optional[float] = 0
    amenities: List[str]
    price: int
    security_deposit: Optional[float] = None
    cleaning_fee: Optional[float] = None
    extra_people: int
    guests_included: int
    # Untyped nested documents are kept as plain dicts
    images: dict
    host: Host
    address: Address
    availability: dict
    review_scores: dict
    reviews: List[Review]
    # Embedding vector; this is the field the vector search index and queries target
    text_embeddings: List[float]


In [None]:
# Turn the DataFrame into a list with one {column: value} dict per row
records = dataset_df.to_dict("records")

In [None]:
# Replace pandas null markers (NaN/NaT/None) with plain None, both for scalar
# fields and for the individual elements of list-valued fields.
for record in records:
    for field_name, field_value in record.items():
        if not isinstance(field_value, list):
            # Non-list field: null it out only when pandas considers it missing
            if pd.isnull(field_value):
                record[field_name] = None
            continue

        # List field: rebuild it with each missing element swapped for None
        record[field_name] = [
            None if pd.isnull(element) else element for element in field_value
        ]

In [None]:
# Validate every record against the Listing schema and keep the plain-dict form.
# A schema violation is reported by printing the validation error.
try:
    listings = [Listing(**row).dict() for row in records]
except ValidationError as validation_error:
    print(validation_error)
else:
    # Validation succeeded: show which keys the stored documents will carry
    print(listings[0].keys())

# 1.3 Database Creation and Connection

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel

# Names of the database and collection that the later cells look up on the client
database_name = "airbnb_dataset"
collection_name = "listings_reviews"

In [None]:
def get_mongo_client(mongo_uri):
    """Create a MongoDB client for ``mongo_uri`` and confirm the server responds.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        MongoClient: A client whose connection has been verified with a ping.

    Raises:
        pymongo.errors.PyMongoError: If the server cannot be reached or
            rejects the ping.
    """
    # NOTE(review): tlsAllowInvalidCertificates=True turns off TLS certificate
    # validation. It is kept so existing environments keep working, but it
    # should be removed wherever the server presents a valid certificate.
    client = MongoClient(
        mongo_uri,
        appname="devrel.deeplearningai.lesson1.python",
        tlsAllowInvalidCertificates=True,
    )

    # MongoClient connects lazily, so constructing it proves nothing. The ping
    # forces a round trip so the success message below is only printed when
    # the server actually answered.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client

In [None]:
# Fail fast when the URI is missing. MongoClient(None) falls back to its
# default host (localhost), and the cells below delete and rewrite a
# collection on whatever server the client points at.
if not MONGO_URI:
    raise ValueError("MONGO_URI not set in environment variables")
mongo_client = get_mongo_client(MONGO_URI)

In [None]:
# Resolve the database and collection handles that every later cell works with
db = mongo_client.get_database(database_name)
collection = db[collection_name]

In [None]:
# Destructive: the empty filter matches every document, so this clears the
# collection before the ingestion step below repopulates it.
collection.delete_many({})

# 1.4 Data Ingestion

In [None]:
# Insert all validated listing dictionaries in a single bulk call.
# `listings` only exists if the validation cell completed without an error.
collection.insert_many(listings)
print("Data ingestion into MongoDB completed")

# 1.5 Vector Search Index definition

In [None]:
# Document field that holds each listing's embedding vector
text_embedding_field_name = "text_embeddings"
# Name under which the search index is created and later queried
vector_search_index_name_text = "vector_index_text"

In [None]:
# Mapping for the embedding field: 1536-dimensional vectors compared by cosine similarity
embedding_field_mapping = {
    "dimensions": 1536,
    "similarity": "cosine",
    "type": "knnVector",
}

# Dynamic mapping stays enabled alongside the explicit embedding-field mapping
index_definition = {
    "mappings": {
        "dynamic": True,
        "fields": {text_embedding_field_name: embedding_field_mapping},
    }
}

vector_search_index_model = SearchIndexModel(
    definition=index_definition,
    name=vector_search_index_name_text,
)

In [None]:
# Check if the search index already exists.
# Search indexes are not reported by list_indexes(), which only covers regular
# database indexes, so the lookup has to go through list_search_indexes().
index_exists = False
for index in collection.list_search_indexes():
    print(index)
    if index["name"] == vector_search_index_name_text:
        index_exists = True
        break

In [None]:
import time

# Create the search index unless the previous cell already found it
if index_exists:
    print(f"Index '{vector_search_index_name_text}' already exists.")
else:
    try:
        created_index_name = collection.create_search_index(model=vector_search_index_model)
        print("Creating index...")
        # Fixed pause to give the index build a head start before reporting
        time.sleep(20)
        print("Index created successfully:", created_index_name)
        print("Wait a few minutes before conducting search with index to ensure index intialization")
    except Exception as error:
        # Any failure is reported rather than raised so the notebook can continue
        print(f"Error creating vector search index: {error!s}")

# NOTE: If the cell above prints "Error creating vector search index: Duplicate Index", an index with that name already exists. You can continue to the next cell if you intend to keep using the previously created index.

In [None]:
import openai

# Hand the key read from the environment to the openai module
openai.api_key = OPENAI_API_KEY

def get_embedding(text):
    """Return the OpenAI embedding for ``text``, or ``None`` if it cannot be produced.

    ``None`` is returned when ``text`` is empty or not a string, and also when
    the API call raises; in the latter case the error is printed.
    """
    # Only non-empty strings are sent to the API
    if not (text and isinstance(text, str)):
        return None

    try:
        response = openai.embeddings.create(
            input=text,
            model="text-embedding-3-small",
            dimensions=1536,
        )
        # A single input yields a single result; take its vector
        return response.data[0].embedding
    except Exception as error:
        print(f"Error in get_embedding: {error}")
        return None

# 1.6 Compose Vector Search Query

In [None]:
def _report_vector_search_time(db, collection, pipeline):
    """Print the server-side time reported by ``explain`` for ``pipeline``.

    This is a best-effort diagnostic: if the explain command fails or its
    output does not have the expected shape, a message is printed and the
    caller carries on.
    """
    try:
        explain_query_execution = db.command(
            'explain', {
                'aggregate': collection.name,
                'pipeline': pipeline,
                'cursor': {}
            },
            verbosity='executionStats')

        vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
        millis_elapsed = vector_search_explain['explain']['resourceUsage']['systemTimeMs']
    except Exception as error:
        # Broad on purpose: timing is informational only and must never cost
        # the caller the search results it already has.
        print(f"Could not read vector search execution stats: {error}")
        return

    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")


def vector_search(user_query, db, collection, vector_index=vector_search_index_name_text,
                  *, num_candidates=150, limit=20):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    db (MongoClient.database): The database object; used to run the explain command.
    collection (MongoCollection): The MongoDB collection to search.
    vector_index (str): Name of the search index to query.
    num_candidates (int): Value passed as "numCandidates" to $vectorSearch. Keyword-only.
    limit (int): Value passed as "limit" to $vectorSearch, i.e. the maximum
        number of documents returned. Keyword-only.

    Returns:
    list: A list of matching documents. If no embedding could be generated for
    the query, the string "Invalid query or embedding generation failed." is
    returned instead, so callers must check for a string before iterating.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": text_embedding_field_name,
            "numCandidates": num_candidates,
            "limit": limit
        }
    }

    # The pipeline consists of the vector search stage only
    pipeline = [vector_search_stage]

    # Execute the search and materialize the cursor right away so the results
    # are in hand before the optional timing diagnostics run
    results = list(collection.aggregate(pipeline))

    _report_vector_search_time(db, collection, pipeline)

    return results


## 1.7 Handling User Query

In [None]:
class SearchResultItem(BaseModel):
    """Subset of a listing document that is passed to the LLM as context.

    Instances are built with ``SearchResultItem(**result)`` from full search
    result documents.
    """

    # NOTE(review): result documents carry many more keys than are declared
    # here; this presumably relies on pydantic ignoring undeclared keys by
    # default — confirm no stricter `extra` setting is in effect.
    name: str
    accommodates: Optional[int] = None
    address: Address
    summary: Optional[str] = None
    description: Optional[str] = None
    neighborhood_overview: Optional[str] = None
    notes: Optional[str] = None

In [None]:
from IPython.display import display, HTML

def handle_user_query(query, db, collection):
    """Answer ``query`` with a chat model, using vector-search results as context.

    Runs a vector search for the query, reduces the hits to the fields declared
    on ``SearchResultItem``, sends them to the chat model together with the
    query, then prints the exchange and displays the hits as an HTML table.

    Args:
        query (str): The user's question.
        db: Database object forwarded to ``vector_search``.
        collection: Collection forwarded to ``vector_search``.

    Returns:
        str: The model's reply when the search produced results.
        tuple: ``(message, "No source information available.")`` when the
        search produced no results or reported a failure.
    """
    get_knowledge = vector_search(query, db, collection)

    # vector_search signals an embedding failure with a plain error string.
    # A non-empty string is truthy, so without this check it would be iterated
    # character by character below and fail with a TypeError.
    if isinstance(get_knowledge, str):
        return get_knowledge, "No source information available."

    if not get_knowledge:
        return "No results found.", "No source information available."

    # Keep only the fields declared on SearchResultItem for each hit
    search_results_models = [
        SearchResultItem(**result)
        for result in get_knowledge
    ]

    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])

    # NOTE(review): the DataFrame is interpolated through its default string
    # form, which pandas may abbreviate for wide or long frames — confirm the
    # model receives enough of the listing text.
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a airbnb listing recommendation system."},
            {
                "role": "user",
                "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"
            }
        ]
    )

    system_response = completion.choices[0].message.content

    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")

    # Render the context rows as an HTML table in the notebook output
    display(HTML(search_results_df.to_html()))

    return system_response

In [None]:
# Example question; the whole text is embedded for the vector search and also
# sent to the chat model together with the retrieved listings.
query = """
I want to stay in a place that's warm and friendly, 
and not too far from restaurants, can you recommend a place? 
Include a reason as to why you've chosen your selection.
"""
handle_user_query(query, db, collection)