# Vector store creation

In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

import uuid
import pandas as pd
from tqdm import tqdm # package for showing a progress bar
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Load HuggingFace and login

In [2]:
load_dotenv(override=True)
hf_token = os.getenv('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Load the CSV and split the content in chunks

In [3]:
loader = CSVLoader("./food_dataset.csv", encoding="utf-8")
data = loader.load()

In [4]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = text_splitter.split_documents(data)

## Sentence transformer

In [5]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [6]:
NUMBER_OF_DOCUMENTS = len(documents)
docs_to_insert = []

for i in tqdm(range(0, NUMBER_OF_DOCUMENTS)):
    doc = documents[i].page_content.replace('\n', ', ')
    # Prepare documents for insertion
    docs_to_insert.append({
        "text": doc,
        "embedding": model.encode(doc).tolist()
    })

100%|███████████████████████████████████████| 7385/7385 [23:28<00:00,  5.24it/s]


## Upload data to Atlas

In [7]:
from pymongo import MongoClient

In [8]:
MONGO_DB_USER = os.getenv('MONGO_DB_USER')
MONGO_DB_PASSWORD = os.getenv('MONGO_DB_PASSWORD')
MONGO_DB_CLUSTER_NAME = os.getenv('MONGO_DB_CLUSTER_NAME')

DB_NAME = 'nutritional_rag'
COLLECTION_NAME = 'food'

uri = f"mongodb+srv://{MONGO_DB_USER}:{MONGO_DB_PASSWORD}@{MONGO_DB_CLUSTER_NAME}.i1ndjzi.mongodb.net/?retryWrites=true&w=majority&appName={MONGO_DB_CLUSTER_NAME}"

In [9]:
client = MongoClient(uri)
collection = client[DB_NAME][COLLECTION_NAME]

In [10]:
result = collection.insert_many(docs_to_insert)

## Setup index search

In [11]:
from pymongo.operations import SearchIndexModel
import time

# Create your index model, then create the search index
index_name="food_vector_index"
search_index_model = SearchIndexModel(
  definition = {
    "fields": [
      {
        "type": "vector",
        "numDimensions": 384,
        "path": "embedding",
        "similarity": "cosine"
      }
    ]
  },
  name = index_name,
  type = "vectorSearch"
)
collection.create_search_index(model=search_index_model)

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")
predicate=None
if predicate is None:
   predicate = lambda index: index.get("queryable") is True

while True:
   indices = list(collection.list_search_indexes(index_name))
   if len(indices) and predicate(indices[0]):
      break
   time.sleep(5)
    
print(index_name + " is ready for querying.")

Polling to check if the index is ready. This may take up to a minute.
food_vector_index is ready for querying.


## Test with a query result function if the index search is working

In [12]:
# Define a function to run vector search queries
def get_query_results(query):
  """Gets results from a vector search query."""

  query_embedding = model.encode(query).tolist()
  pipeline = [
      {
            "$vectorSearch": {
              "index": "food_vector_index",
              "queryVector": query_embedding,
              "path": "embedding",
              "exact": True,
              "limit": 5
            }
      }, {
            "$project": {
              "_id": 0,
              "text": 1
         }
      }
  ]

  results = collection.aggregate(pipeline)

  array_of_results = []
  for doc in results:
      array_of_results.append(doc)
  return array_of_results

In [13]:
# Test the function with a sample query
import pprint
pprint.pprint(get_query_results("coconut"))

[{'text': 'product name: COCONUT MEAT,RAW, fat: 33.49, carbohydrates: 15.23, '
          'proteins: 3.33, calories: 354, sugars: 6.230000019, fiber: 9.0'},
 {'text': 'product name: COCONUT MEAT,DRIED (DESICCATED),NOT SWTND, fat: '
          '64.53, carbohydrates: 23.65, proteins: 6.88, calories: 660, sugars: '
          '7.349999905, fiber: 16.29999924'},
 {'text': 'product name: COCONUT MILK,RAW (LIQ EXPRESSED FROM GRATED '
          'MEAT&H2O), fat: 23.84, carbohydrates: 5.54, proteins: 2.29, '
          'calories: 230, sugars: 3.339999914, fiber: 2.200000048'},
 {'text': 'product name: COCONUT MEAT,DRIED '
          '(DESICCATED),SWTND,FLAKED,PACKAGED, fat: 27.99, carbohydrates: '
          '51.85, proteins: 3.13, calories: 456, sugars: 36.75, fiber: '
          '9.899999619'},
 {'text': 'product name: COCONUT H2O (LIQ FROM COCONUTS), fat: 0.2, '
          'carbohydrates: 3.71, proteins: 0.72, calories: 19, sugars: '
          '2.609999895, fiber: 1.100000024'}]
