# Vector store creation

In [25]:
import os
from dotenv import load_dotenv
from huggingface_hub import login

import uuid
import pandas as pd
from tqdm import tqdm # package for showing a progress bar
from sentence_transformers import SentenceTransformer
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Load HuggingFace and login

In [26]:
load_dotenv(override=True)
hf_token = os.getenv('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


## Load the CSV and split the content in chunks

In [27]:
loader = CSVLoader("./openfood_sanitized_dataset.csv", encoding="utf-8")
data = loader.load()

In [28]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
documents = text_splitter.split_documents(data)

## Sentence transformer

In [29]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [50]:
NUMBER_OF_DOCUMENTS = len(documents)
docs_to_insert = []

for i in tqdm(range(0, NUMBER_OF_DOCUMENTS)):
    doc = documents[i].page_content.replace('\n', ', ')
    # Prepare documents for insertion
    docs_to_insert.append({
        "text": doc,
        "embedding": model.encode(doc).tolist()
    })

100%|█████████████████████████████████| 142232/142232 [2:34:16<00:00, 15.37it/s]


## Upload data to Atlas

### N.B.: only 105,000 rows have been inserted without exceeding the space, review this

In [51]:
from pymongo import MongoClient

In [52]:
MONGO_DB_USER = os.getenv('MONGO_DB_USER')
MONGO_DB_PASSWORD = os.getenv('MONGO_DB_PASSWORD')
MONGO_DB_CLUSTER_NAME = os.getenv('MONGO_DB_CLUSTER_NAME')

DB_NAME = 'nutritional_rag'
COLLECTION_NAME = 'food'

uri = f"mongodb+srv://{MONGO_DB_USER}:{MONGO_DB_PASSWORD}@{MONGO_DB_CLUSTER_NAME}.i1ndjzi.mongodb.net/?retryWrites=true&w=majority&appName={MONGO_DB_CLUSTER_NAME}"

In [53]:
client = MongoClient(uri)
collection = client[DB_NAME][COLLECTION_NAME]

In [None]:
result = collection.insert_many(docs_to_insert[:105000])

## Setup index search

In [69]:
from pymongo.operations import SearchIndexModel
import time

# Create your index model, then create the search index
index_name="food_vector_index"
search_index_model = SearchIndexModel(
  definition = {
    "fields": [
      {
        "type": "vector",
        "numDimensions": 384,
        "path": "embedding",
        "similarity": "cosine"
      }
    ]
  },
  name = index_name,
  type = "vectorSearch"
)
collection.create_search_index(model=search_index_model)

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")
predicate=None
if predicate is None:
   predicate = lambda index: index.get("queryable") is True

while True:
   indices = list(collection.list_search_indexes(index_name))
   if len(indices) and predicate(indices[0]):
      break
   time.sleep(5)
print(index_name + " is ready for querying.")

Polling to check if the index is ready. This may take up to a minute.
food_vector_index is ready for querying.


## Test with a query result function if the index search is working

In [74]:
# Define a function to run vector search queries
def get_query_results(query):
  """Gets results from a vector search query."""

  query_embedding = model.encode(query).tolist()
  pipeline = [
      {
            "$vectorSearch": {
              "index": "food_vector_index",
              "queryVector": query_embedding,
              "path": "embedding",
              "exact": True,
              "limit": 5
            }
      }, {
            "$project": {
              "_id": 0,
              "text": 1
         }
      }
  ]

  results = collection.aggregate(pipeline)

  array_of_results = []
  for doc in results:
      array_of_results.append(doc)
  return array_of_results

In [75]:
# Test the function with a sample query
import pprint
pprint.pprint(get_query_results("cow milk"))

[{'text': "product name: 100% Real Cow's Milk, fat: 3.33, carbohydrates: 5.42, "
          'proteins: 3.33, calories: 61.9, saturated fat: 2.08, sugars: 5.0, '
          'fiber: 0.0, salt: 0.13208'},
 {'text': "product name: 100% Cow's Milk, fat: 3.33, carbohydrates: 5.42, "
          'proteins: 3.33, calories: 61.9, saturated fat: 2.08, sugars: 5.0, '
          'fiber: 0.0, salt: 0.13208'},
 {'text': "product name: Milk'Is : Biscuits surfins fourrés au lait et garnis "
          "d'une tablette de chocolat au lait suisse, fat: 29.0, "
          'carbohydrates: 65.0, proteins: 5.0, calories: 544.93, saturated '
          'fat: 19.0, sugars: 37.0, fiber: 1.5, salt: 0.499999999999999'},
 {'text': "product name: Milk'is, fat: 9.0, carbohydrates: 24.0, proteins: "
          '3.5, calories: 191.2, saturated fat: 6.0, sugars: 23.0, fiber: 0.5, '
          'salt: 0.2'},
 {'text': 'product name: 1% Lowfat Cow Milk, fat: 1.04, carbohydrates: 5.42, '
          'proteins: 3.33, calories: 45.89, s