# Calories RAG - Open Source Model Version

In [90]:
import os
import glob
from dotenv import load_dotenv

# Load variables from a .env file; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)
# Make sure OPENAI_API_KEY is always present, falling back to a placeholder string.
# NOTE(review): no visible cell calls the OpenAI API — confirm this is still needed.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY', 'your-key-if-not-using-env')

## Set up the MongoDB connection

In [91]:
import json
from urllib.parse import quote_plus

from openai import OpenAI
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

In [92]:
# Atlas credentials and cluster name are read from the environment (.env file).
MONGO_DB_USER = os.getenv('MONGO_DB_USER')
MONGO_DB_PASSWORD = os.getenv('MONGO_DB_PASSWORD')
MONGO_DB_CLUSTER_NAME = os.getenv('MONGO_DB_CLUSTER_NAME')

# Fail early with a readable message instead of building a URI that contains "None".
_missing_mongo_vars = [
    name
    for name, value in (
        ('MONGO_DB_USER', MONGO_DB_USER),
        ('MONGO_DB_PASSWORD', MONGO_DB_PASSWORD),
        ('MONGO_DB_CLUSTER_NAME', MONGO_DB_CLUSTER_NAME),
    )
    if not value
]
if _missing_mongo_vars:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(_missing_mongo_vars)}"
    )

DB_NAME = 'nutritional_rag'
COLLECTION_NAME = 'food'

# The username and password must be percent-escaped: characters such as
# '@', ':', '/' or '%' in a credential would otherwise corrupt the URI.
uri = (
    f"mongodb+srv://{quote_plus(MONGO_DB_USER)}:{quote_plus(MONGO_DB_PASSWORD)}"
    f"@{MONGO_DB_CLUSTER_NAME}.i1ndjzi.mongodb.net/"
    f"?retryWrites=true&w=majority&appName={MONGO_DB_CLUSTER_NAME}"
)

client = MongoClient(uri)
# Collection queried by the vector search function.
collection = client[DB_NAME][COLLECTION_NAME]

## Set up the vector search function

In [93]:
# Sentence-embedding model used to encode each query for vector search.
# NOTE(review): presumably the same model produced the stored `embedding`
# field in the collection — confirm, otherwise similarity scores are meaningless.
transformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [94]:
# Define a function to run vector search queries
def get_query_results(query, limit=3):
    """Run a vector search for `query` and return the matching documents.

    The query is embedded with the module-level `transformer` and sent to the
    `food_vector_index` index through a `$vectorSearch` aggregation stage with
    `exact` enabled.

    Args:
        query: Text to search for.
        limit: Maximum number of documents to return. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A list of dicts; the `$project` stage keeps only the `text` field.

    Raises:
        ValueError: If `limit` is smaller than 1.
    """
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    query_embedding = transformer.encode(query).tolist()
    pipeline = [
        {
            "$vectorSearch": {
                "index": "food_vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "exact": True,
                "limit": limit,
            }
        },
        {
            # Drop `_id` and keep only the text payload.
            "$project": {
                "_id": 0,
                "text": 1,
            }
        },
    ]

    # Materialize the cursor so callers get a plain, reusable list.
    return list(collection.aggregate(pipeline))

## Function to extract JSON

In [95]:
import json
import re

def get_json(text):
    """Extract the first valid JSON object embedded in `text`.

    Each `{` in the text is tried in order as the start of a JSON object. The
    first one that parses is returned, including any nested objects it
    contains. Text after the object is ignored.

    Args:
        text: Arbitrary text (for example a model completion) that may
            contain a JSON object.

    Returns:
        The parsed object as a dict, or None when the text contains no `{`
        or when no candidate parses as JSON. A diagnostic message is printed
        in both failure cases.
    """
    start_of_json = text.find('{')
    if start_of_json == -1:
        print("No JSON object found in the text.")
        return None

    decoder = json.JSONDecoder()
    last_error = None
    while start_of_json != -1:
        try:
            # raw_decode parses one complete JSON value starting at the given
            # index and tolerates trailing text, so nested braces are handled.
            data, _ = decoder.raw_decode(text, start_of_json)
        except json.JSONDecodeError as e:
            last_error = e
            # This brace did not open a valid object; try the next one.
            start_of_json = text.find('{', start_of_json + 1)
        else:
            return data

    print(f"Error decoding JSON: {last_error}")
    return None

## Create the query function for the open-source model

In [96]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model identifier used for both the tokenizer and the model.
model_name = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): only a single prompt is tokenized in the visible cells, so no
# padding is applied; confirm the padding side before batching prompts.
tokenizer.padding_side = "right"

# Load the causal language model weights with the library's default settings.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [99]:
def get_nutritional_data(food):
    """Ask the local model for the nutritional data of `food` using retrieved context.

    The documents returned by `get_query_results` are joined into a context
    string and embedded in a prompt. The model completion is then passed to
    `get_json`.

    Args:
        food: Name of the food ingredient to look up.

    Returns:
        Whatever `get_json` returns for the model completion.
    """
    context = get_query_results(food)
    # Skip documents that came back without a `text` field instead of raising KeyError.
    context_string = " - ".join([doc["text"] for doc in context if "text" in doc])
    prompt = f"""
    Get the nutritional data of the following food ingredient: {food}
    Answer the question based only on the following context: {context_string}
    Reply only with a JSON that contains the following data: protein, carbohydrates, fats, calories, sugars, fibers. 
    """

    # The tokenizer returns input ids together with the attention mask.
    encoded_prompt = tokenizer(prompt, return_tensors="pt", truncation='do_not_truncate')

    outputs = model.generate(
        **encoded_prompt,
        max_new_tokens=100,
        # Passing the pad token explicitly avoids the per-call
        # "Setting `pad_token_id` to `eos_token_id`" warning.
        pad_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens. Decode only the
    # completion so JSON extraction cannot match braces from the prompt/context.
    prompt_length = encoded_prompt["input_ids"].shape[-1]
    completion_ids = outputs[0][prompt_length:]
    decoded_output = tokenizer.decode(completion_ids, skip_special_tokens=True).replace('\n', '')

    return get_json(decoded_output)

In [100]:
get_nutritional_data("coconut")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'protein': 3.33, 'carbohydrates': 15.23, 'fats': 33.49, 'calories': 354, 'sugars': 6.23, 'fibers': 9.0}
