# Lesson4: Boosting

In [None]:
import custom_utils

In [None]:
from datasets import load_dataset
import pandas as pd

# Open the training split in streaming mode so that records are read lazily
# instead of materialising the whole split up front.
datasets = load_dataset(
    "MongoDB/airbnb_embeddings",
    split="train",
    streaming=True,
)

# Keep only the first 100 streamed records and load them into a DataFrame.
dataset = datasets.take(100)
dataset_df = pd.DataFrame(data=dataset)

# Last expression of the cell: previews the first five rows.
dataset_df.head(n=5)

In [None]:
# Show the column labels of the sampled DataFrame.
print("Columns:", dataset_df.columns)

# Document Modelling

In [None]:
# Run the sampled rows through the project helper; its result is what gets
# inserted into MongoDB in the data-ingestion step.
# NOTE(review): presumably this validates/reshapes each row into a listing
# document -- confirm in custom_utils.process_records.
listings = custom_utils.process_records(dataset_df)

# Database creation and connection

In [None]:
# The helper returns a pair that is unpacked into a database handle and the
# object used for ingestion and search below.
# NOTE(review): despite the plural name, `collections` is used as a single
# collection (it receives insert_many and is passed as `collection=`).
db, collections = custom_utils.connect_to_database()

# Data ingestion

In [None]:
# Bulk-insert the processed listings.
# NOTE(review): nothing visible in this notebook clears the collection first,
# so re-running this cell presumably inserts the same listings again -- verify
# whether connect_to_database resets the collection.
collections.insert_many(listings)
print("Data ingestion into MongoDB completed")

In [None]:
# Vector search index definition

In [None]:
# Set up the vector search index on the collection via the project helper.
# NOTE(review): the final query cell searches with
# vector_index="vector_index_with_filter"; presumably that is the index name
# this helper registers -- confirm in custom_utils.
custom_utils.setup_vector_search_index_with_filter(collection=collections)

# Handle user query

In [None]:
from pydantic import BaseModel
from typing import Optional


class SearchResultItem(BaseModel):
    """One search hit, reduced to the fields shown to the user and the LLM.

    ``handle_user_query`` builds an instance from every document returned by
    the vector search, so ``name`` and ``address`` must be present in each
    document; every other field falls back to ``None`` when absent.
    """

    # Required listing name.
    name: str
    accommodates: Optional[int] = None
    # Required; validated against the project's Address model.
    address: custom_utils.Address
    # Added by the review-average pipeline stage when boosting is applied.
    averageReviewScore: Optional[float] = None
    number_of_reviews: Optional[float] = None
    # Added by the weighting pipeline stage when boosting is applied.
    combinedScore: Optional[float] = None


In [None]:
from IPython.display import display, HTML


def handle_user_query(query, db, collection, stages=None, vector_index="vector_index_text"):
    """Answer a user query with an LLM grounded in vector search results.

    The query is passed to ``custom_utils.vector_search_with_filter``; every
    returned document is validated against ``SearchResultItem``, the validated
    rows are put into a DataFrame, and that DataFrame is embedded in the
    prompt sent to the ``gpt-3.5-turbo`` chat model. The question, the answer
    and an HTML rendering of the results table are printed/displayed.

    Args:
        query: The user's natural-language question.
        db: Database handle forwarded to the search helper.
        collection: Collection forwarded to the search helper.
        stages: Optional list of extra aggregation stages forwarded to the
            search helper. ``None`` (the default) means no extra stages.
        vector_index: Name of the vector search index to query.

    Returns:
        The model's answer as a string. When the search yields nothing, the
        tuple ``("No results found.", "No source information available.")``
        is returned instead; this two-shape return is kept for backward
        compatibility.
    """
    # Use a fresh list per call: a literal ``[]`` default is created once and
    # shared, so any mutation by the search helper would leak into later calls.
    stages = [] if stages is None else stages

    get_knowledge = custom_utils.vector_search_with_filter(query, db, collection, stages, vector_index)

    if not get_knowledge:
        return "No results found.", "No source information available."

    print("List of all fields of the first document, before model conformance")
    print(get_knowledge[0].keys())

    # Conform each raw document to the result model.
    search_results_models = [
        SearchResultItem(**result)
        for result in get_knowledge
    ]

    # pydantic v2 deprecates ``.dict()`` in favour of ``.model_dump()``;
    # support both so the cell runs cleanly on either major version.
    search_results_df = pd.DataFrame(
        [
            item.model_dump() if hasattr(item, "model_dump") else item.dict()
            for item in search_results_models
        ]
    )

    # NOTE(review): the DataFrame is interpolated through its default repr,
    # which pandas may abbreviate according to its display options -- confirm
    # the model receives the full context for wide/long result sets.
    completion = custom_utils.openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": "You are a airbnb listing recommendation system."},
            {
                "role": "user",
                "content": f"Answer this user query: {query} with the following context:\n{search_results_df}"
            }
        ]
    )
    system_response = completion.choices[0].message.content
    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")
    display(HTML(search_results_df.to_html()))
    return system_response

# Boosting search results after vector search

In [None]:
# Review sub-scores (stored under ``review_scores``) that feed the average.
_REVIEW_SCORE_KINDS = (
    "accuracy",
    "cleanliness",
    "checkin",
    "communication",
    "location",
    "value",
)

# $addFields stage that adds two fields to each document:
#   * averageReviewScore - sum of the sub-scores divided by how many there are
#   * reviewCountBoost   - copy of number_of_reviews
review_average_stage = {
    "$addFields": {
        "averageReviewScore": {
            "$divide": [
                {
                    "$add": [
                        f"$review_scores.review_scores_{kind}"
                        for kind in _REVIEW_SCORE_KINDS
                    ]
                },
                len(_REVIEW_SCORE_KINDS),
            ]
        },
        "reviewCountBoost": "$number_of_reviews",
    }
}

In [None]:
# $addFields stage computing combinedScore as a weighted sum:
# 0.9 * averageReviewScore + 0.1 * reviewCountBoost.
weighting_stage = {
    "$addFields": {
        "combinedScore": {
            "$add": [
                {"$multiply": [field, weight]}
                for field, weight in (
                    ("$averageReviewScore", 0.9),
                    ("$reviewCountBoost", 0.1),
                )
            ]
        }
    }
}

In [None]:
# Order documents by combinedScore, highest first (-1 requests descending order).
sorting_stage_sort = {"$sort": {"combinedScore": -1}}

In [None]:
# List order matters: the weighting stage reads the fields added by the
# review-average stage, and the sort reads the combinedScore added by the
# weighting stage.
additional_stages = [
    review_average_stage,
    weighting_stage,
    sorting_stage_sort,
]

In [None]:
# Prompt text is used verbatim as the search query and in the LLM prompt.
query = """
I want to stay in a place that's warm and friendly, 
and not too far from resturants, can you recommend a place? 
Include a reason as to why you've chosen your selection"
"""

# Run the search with the boosting stages against the filter-enabled index.
# As the last expression of the cell, the returned answer is also echoed.
handle_user_query(
    query=query,
    db=db,
    collection=collections,
    stages=additional_stages,
    vector_index="vector_index_with_filter",
)