## MongoDB + LlamaIndex + Fireworks AI

In [2]:
import os

In [16]:
# Name of the environment variable that holds the Fireworks AI API key.
FIREWORKS_API_KEY = "FIREWORKS_API_KEY"

# Read the key once. An unset *or empty* variable is treated as missing so the
# problem is reported here rather than on the first API call.
fw_api_key = os.environ.get(FIREWORKS_API_KEY)
if not fw_api_key:
    raise ValueError(f"Please provide a {FIREWORKS_API_KEY}.")

In [17]:
from datasets import load_dataset
import pandas as pd

# Source dataset on the Hugging Face Hub:
# https://huggingface.co/datasets/AIatMongoDB/whatscooking.restaurants
DATASET_NAME = "AIatMongoDB/whatscooking.restaurants"
dataset = load_dataset(DATASET_NAME)

# Materialise the "train" split as a pandas DataFrame.
train_split = dataset["train"]
dataset_df = pd.DataFrame(train_split)

# Preview the first five rows as the cell output.
dataset_df.head(5)

Unnamed: 0,TakeOut,address,stars,cuisine,restaurant_id,name,menu,PriceRange,OutdoorSeating,DogsAllowed,embedding,attributes,sponsored,review_count,borough,location,HappyHour,_id
0,True,"{'building': '627', 'coord': [-73.975980999999...",2.5,Tex-Mex,40366661,Baby Bo'S Burritos,,1.0,True,,"[-0.14520384, 0.018315623, -0.018330636, -0.10...","{'Alcohol': ''none'', 'Ambience': '{'romantic'...",,10,Manhattan,"{'coordinates': [-73.97598099999999, 40.745132...",,{'$oid': '6095a34a7c34416a90d3206b'}
1,True,"{'building': '17', 'coord': [-74.1350211, 40.6...",3.5,American,40367442,Buddy'S Wonder Bar,"[Grilled cheese sandwich, Baked potato, Lasagn...",2.0,True,True,"[-0.11977468, -0.02157107, 0.0038846824, -0.09...","{'Alcohol': ''beer_and_wine'', 'Ambience': '{'...",,62,Staten Island,"{'coordinates': [-74.1350211, 40.6369042], 'ty...",,{'$oid': '6095a34a7c34416a90d3209e'}
2,True,"{'building': '37', 'coord': [-74.138263, 40.54...",4.0,American,40364610,Great Kills Yacht Club,"[Mozzarella sticks, Mushroom swiss burger, Spi...",1.0,True,,"[-0.1004329, -0.014882699, -0.033005167, -0.09...","{'Alcohol': ''none'', 'Ambience': '{'touristy'...",,72,Staten Island,"{'coordinates': [-74.138263, 40.546681], 'type...",,{'$oid': '6095a34a7c34416a90d31ff6'}
3,True,"{'building': '842', 'coord': [-73.970637000000...",4.0,American,40365288,Keats Restaurant,"[French fries, Chicken pot pie, Mac & cheese, ...",2.0,True,,"[-0.11735515, -0.0397448, -0.0072645755, -0.09...","{'Alcohol': None, 'Ambience': '{'touristy': Fa...",,149,Manhattan,"{'coordinates': [-73.97063700000001, 40.751495...",True,{'$oid': '6095a34a7c34416a90d32017'}
4,True,"{'building': '120', 'coord': [-73.9998042, 40....",5.0,Bakery,40363151,Olive'S,"[doughnuts, chocolate chip cookies, chocolate ...",1.0,True,,"[-0.096541286, -0.009661355, 0.04402167, -0.12...","{'Alcohol': None, 'Ambience': None, 'BYOB': No...",,7,Manhattan,"{'coordinates': [-73.9998042, 40.7251256], 'ty...",,{'$oid': '6095a34a7c34416a90d31fbd'}


In [18]:
from llama_index.core.settings import Settings
from llama_index.llms.fireworks import Fireworks
from llama_index.embeddings.fireworks import FireworksEmbedding

# Chat model served by Fireworks AI (temperature fixed at 0).
llm = Fireworks(
    model="accounts/fireworks/models/mixtral-8x7b-instruct",
    temperature=0,
    api_key=fw_api_key,
)

# Embedding model served by Fireworks AI; texts are sent in batches of 512.
embed_model = FireworksEmbedding(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    embed_batch_size=512,
    api_key=fw_api_key,
)

# Assign both models on the LlamaIndex `Settings` object.
Settings.embed_model = embed_model
Settings.llm = llm

In [19]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON to get one plain dict per row.
documents_json = dataset_df.to_json(orient="records")
documents_list = json.loads(documents_json)

# Fields re-encoded as JSON strings before being used as metadata.
# Every other field (for example `stars`, `OutdoorSeating` or `_id`) is passed
# through to the Document unchanged.
JSON_ENCODED_FIELDS = (
    "name",
    "cuisine",
    "attributes",
    "menu",
    "borough",
    "address",
    "PriceRange",
    "HappyHour",
    "review_count",
    "TakeOut",
)
# Fields not relevant to the question we want to answer; removed from each record.
DROPPED_FIELDS = ("embedding", "location")

llama_documents = []

for record in documents_list:
    for field in JSON_ENCODED_FIELDS:
        record[field] = json.dumps(record[field])
    for field in DROPPED_FIELDS:
        del record[field]

    # The same (already trimmed) record is used both as the JSON text body and
    # as the metadata dict; the templates control how they are rendered together.
    llama_documents.append(
        Document(
            text=json.dumps(record),
            metadata=record,
            metadata_template="{key}=>{value}",
            text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
        )
    )

# Show what the LLM and the embedding model each receive for the first document.
first_document = llama_documents[0]
print(
    "\nThe LLM sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    first_document.get_content(metadata_mode=MetadataMode.EMBED),
)


The LLM sees this: 
 Metadata: TakeOut=>true
address=>{"building": "627", "coord": [-73.975981, 40.745132], "street": "2 Avenue", "zipcode": "10016"}
stars=>2.5
cuisine=>"Tex-Mex"
restaurant_id=>40366661
name=>"Baby Bo'S Burritos"
menu=>null
PriceRange=>1.0
OutdoorSeating=>True
DogsAllowed=>None
attributes=>{"Alcohol": "'none'", "Ambience": "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False}", "BYOB": null, "BestNights": null, "BikeParking": null, "BusinessAcceptsBitcoin": null, "BusinessAcceptsCreditCards": null, "BusinessParking": "None", "Caters": "True", "DriveThru": null, "GoodForDancing": null, "GoodForKids": "True", "GoodForMeal": null, "HasTV": "True", "Music": null, "NoiseLevel": "'average'", "RestaurantsAttire": "'casual'", "RestaurantsDelivery": "True", "RestaurantsGoodForGroups": "True", "RestaurantsReservations": "True", "RestaurantsTableService": "False", "Wheelc

In [20]:
# Display the first converted Document as the cell output to inspect its metadata.
llama_documents[0]

Document(id_='f32dda5c-0a82-41fb-9f21-7fa5f0ce97f0', embedding=None, metadata={'TakeOut': 'true', 'address': '{"building": "627", "coord": [-73.975981, 40.745132], "street": "2 Avenue", "zipcode": "10016"}', 'stars': 2.5, 'cuisine': '"Tex-Mex"', 'restaurant_id': '40366661', 'name': '"Baby Bo\'S Burritos"', 'menu': 'null', 'PriceRange': '1.0', 'OutdoorSeating': True, 'DogsAllowed': None, 'attributes': '{"Alcohol": "\'none\'", "Ambience": "{\'romantic\': False, \'intimate\': False, \'classy\': False, \'hipster\': False, \'divey\': False, \'touristy\': False, \'trendy\': False, \'upscale\': False, \'casual\': False}", "BYOB": null, "BestNights": null, "BikeParking": null, "BusinessAcceptsBitcoin": null, "BusinessAcceptsCreditCards": null, "BusinessParking": "None", "Caters": "True", "DriveThru": null, "GoodForDancing": null, "GoodForKids": "True", "GoodForMeal": null, "HasTV": "True", "Music": null, "NoiseLevel": "\'average\'", "RestaurantsAttire": "\'casual\'", "RestaurantsDelivery": "Tr

In [21]:
from llama_index.core.node_parser import SentenceSplitter

# Embedding all ~25k nodes takes about 10 minutes, so only the first 2.5k are kept.
MAX_NODES = 2500

parser = SentenceSplitter()
nodes = parser.get_nodes_from_documents(llama_documents)
new_nodes = nodes[:MAX_NODES]

# Batching is left to LlamaIndex: the embedding model is invoked directly on the
# node list (its __call__ method) and is relied on to split the work into batches.
node_embeddings = embed_model(new_nodes)

In [22]:
# Copy each computed embedding onto the node at the same position and drop the
# `_id` key from the node metadata when it is present.
for position in range(len(new_nodes)):
    node = new_nodes[position]
    node.embedding = node_embeddings[position].embedding
    node.metadata.pop("_id", None)

In [50]:
import pymongo

def get_mongo_client(mongo_uri):
    """Connect to MongoDB and return a client whose connection has been verified.

    ``pymongo.MongoClient`` connects lazily, so constructing it does not show
    that the server is reachable. A ``ping`` command is issued to force a round
    trip before success is reported.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        The ``pymongo.MongoClient`` on success, or ``None`` when the connection
        fails with ``pymongo.errors.ConnectionFailure`` (the error is printed).
        Other errors raised by the ping (for example an authentication failure)
        propagate to the caller.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Force a server round trip; without it an unreachable host goes unnoticed.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the background resources of the unusable client.
            client.close()
        return None
    print("Connection to MongoDB successful")
    return client


# Names of the environment variables that hold the MongoDB credentials.
MONGODB_USERNAME = "mongodb_username"
MONGODB_PASSWORD = "mongodb_password"


def get_mongodb_uri() -> str:
    """Build the ``mongodb+srv`` URI for the ``mongodbgenaihackathon`` cluster.

    The username and password are read from the environment variables named by
    ``MONGODB_USERNAME`` and ``MONGODB_PASSWORD`` and are percent-escaped, so
    credentials containing reserved characters such as ``@``, ``:``, ``/`` or
    ``%`` still produce a valid URI.

    Returns:
        The connection string, including the escaped credentials.

    Raises:
        EnvironmentError: If either environment variable is not set.
    """
    # Function-scope import keeps this cell self-contained.
    from urllib.parse import quote_plus

    if MONGODB_USERNAME not in os.environ:
        raise EnvironmentError(f"Please set your MongoDB username as the environment variable '{MONGODB_USERNAME}'.")
    if MONGODB_PASSWORD not in os.environ:
        raise EnvironmentError(f"Please set your MongoDB password as the environment variable '{MONGODB_PASSWORD}'.")

    mongodb_username = quote_plus(os.environ[MONGODB_USERNAME])
    mongodb_password = quote_plus(os.environ[MONGODB_PASSWORD])
    uri = (
        f"mongodb+srv://{mongodb_username}:{mongodb_password}"
        "@mongodbgenaihackathon.ureqwqu.mongodb.net/"
        "?retryWrites=true&w=majority&appName=mongodbgenaihackathon"
    )
    return uri

mongo_uri = get_mongodb_uri()

# get_mongo_client returns None when the connection fails; stop here with a
# clear error instead of hitting an opaque TypeError on the subscript below.
mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    raise RuntimeError("Could not connect to MongoDB; see the 'Connection failed' message above.")

# Database and collection that will hold the embedded restaurant nodes.
DB_NAME = "whatscooking"
COLLECTION_NAME = "restaurants"

db = mongo_client[DB_NAME]
collection = db[COLLECTION_NAME]

Connection to MongoDB successful


In [51]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Name of the vector search index the store is pointed at.
# NOTE(review): presumably this index must already exist on the collection —
# nothing in this notebook creates it; confirm.
VECTOR_INDEX_NAME = "vector_index"

vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    index_name=VECTOR_INDEX_NAME,
    collection_name=COLLECTION_NAME,
    db_name=DB_NAME,
)

# Insert the embedded nodes; the returned list of ids is shown as the cell output.
vector_store.add(new_nodes)

['47213446-3a20-4c2a-ad3d-e926fab8c9b8',
 'e67651f3-82bb-4033-84a2-b7dc3358e2a0',
 '3223e570-89e5-44a5-b740-49aa674987bd',
 '609d178c-13d4-475b-9a6e-ea5bdb9da987',
 '9ef4e24a-78f1-49ec-892f-0e79068c0945',
 '3dede671-b9f5-445c-a35c-4bf2facd05d2',
 '353312b8-da09-4abb-a3fa-c2a0675cd16f',
 '31f65f22-4bf8-4289-a4cb-3401696ad235',
 'f36f41e7-45f8-4f07-b919-ba8f69ad0b71',
 '9e0befad-b437-438f-aca0-7bf9016fc071',
 '880e0f41-e8a2-412d-b626-6d9e754a9e50',
 'fd4ac53b-1938-467a-a4b1-394ba9de5d4f',
 '24a1cfe0-1edd-483a-a2ca-59c7987e85a3',
 'd36d290f-22f4-4bb8-96ad-deca2bb0e7e4',
 '9a8788f1-0e7e-464d-9849-51c8eb864111',
 '72ffeb1a-0b15-4184-ae1e-af842cd42098',
 '09689927-962b-480f-92cb-e2b892ff9454',
 '875fccae-014a-41ae-88f0-c7c235cec607',
 'ffb2ec63-0456-456b-a75c-919f215f83b6',
 'e02ada41-eebb-4723-981d-5a67ddd9de4c',
 '5a658832-ea31-4589-b3f7-38420e28f866',
 'bb100de6-8c20-4215-b71d-e2ec66529f7b',
 'd305ed17-2e81-4e7d-a9b0-2db2ab3e9f63',
 '02632885-f251-4479-a906-d44208e7d581',
 '93c4d024-7b5e-

In [44]:
from llama_index.core import VectorStoreIndex, StorageContext

# Wrap the existing vector store in an index; nothing is re-embedded here.
# NOTE(review): this cell must run after `vector_store` is (re)created. Its saved
# execution count is lower than that of the cells that build the client and the
# store, so the saved `index` may wrap a stale store — re-run top to bottom.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [52]:
import pprint
from llama_index.core.response.notebook_utils import display_response

query_engine = index.as_query_engine()

# nomic-embed-text-v1.5 expects a task prefix on its input; the retrieval-query
# prefix is spelled "search_query: " (with an underscore).
# NOTE(review): the documents above were embedded without a "search_document: "
# prefix — confirm whether they should be re-embedded with one.
query = "search_query: Anything that doesn't have alcohol in it"

response = query_engine.query(query)
display_response(response)

# An empty list here means retrieval returned no nodes.
pprint.pprint(response.source_nodes)

**`Final Response:`** Empty Response

[]
