In [1]:
from langchain_chroma import Chroma
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from typing import List
from langchain.docstore.document import Document
import json
import os
from dotenv import load_dotenv

# Load the bird data from JSON and turn it into documents for indexing.
class BirdDataLoader:
    """
    Turn a bird-data JSON file into LangChain ``Document`` objects.

    The file is read as a single JSON object that maps each bird's common name
    to a record holding the keys ``binomialName``, ``identification``,
    ``macaulayID`` and ``url``.
    """

    def __init__(self, file_path: str):
        # Only the path is stored here; the file is opened when load() runs.
        self.file_path = file_path

    def load(self) -> List[Document]:
        """
        Read the JSON file and build one document per bird.

        Returns:
            List[Document]: One document per top-level key, where
                - page_content: a sentence combining the common name, the
                  binomial name and the identification text
                - metadata: birdName, binomialName, macaulayID, url

        Raises:
            KeyError: If a bird record lacks one of the keys read below.
        """
        with open(self.file_path, 'r', encoding='utf-8') as handle:
            birds = json.load(handle)

        # The sentence form of page_content puts both names next to the
        # description, so all three end up in the text that gets embedded.
        return [
            Document(
                page_content=f"{name}, also called {record['binomialName']}, is {record['identification']}",
                metadata={
                    'birdName': name,
                    'binomialName': record['binomialName'],
                    'macaulayID': record['macaulayID'],
                    'url': record['url'],
                },
            )
            for name, record in birds.items()
        ]
    
# Path of the bird-data JSON file, relative to the notebook's working directory.
BIRD_DATA_PATH = './source/ebird_data.json'

# Parse the file once; `docs` is what the embedding cell indexes.
loader = BirdDataLoader(BIRD_DATA_PATH)
docs = loader.load()

In [2]:
# Load the NVIDIA NIM API key from the environment (optionally via a .env file).
load_dotenv()
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")
if not NVIDIA_API_KEY:
    # Fail fast with an actionable message instead of passing None on to the
    # embedding client and failing later during indexing.
    raise RuntimeError(
        "NVIDIA_API_KEY is not set; export it or add it to a .env file before running this cell."
    )

embeddings_model = NVIDIAEmbeddings(
    model="nvidia/llama-3.2-nv-embedqa-1b-v1",
    api_key=NVIDIA_API_KEY,
    truncate="NONE",
)

persist_directory = "./chroma_db"

# Use the bird name as a deterministic id for each document. The names are the
# keys of the source JSON object, so they are unique. Without explicit ids,
# every re-run of this cell adds another copy of each bird to the persisted
# collection and search results fill up with duplicates.
# NOTE(review): relies on langchain_chroma writing with upsert semantics so that
# an existing id is overwritten — confirm against the installed version. Copies
# already stored under random ids by earlier runs are not removed; delete
# ./chroma_db once to start from a clean index.
doc_ids = [doc.metadata['birdName'] for doc in docs]

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings_model,
    ids=doc_ids,
    persist_directory=persist_directory,
    # Cosine distance for the HNSW index; stored with the collection.
    collection_metadata={"hnsw:space": "cosine"},
)

In [7]:
# Re-open the collection persisted on disk, using the same embedding model so
# that the query is embedded the same way as the stored documents.
loaded_vectorstore = Chroma(
    embedding_function=embeddings_model,
    persist_directory=persist_directory,
)

# Sanity-check query: the three best matches together with their relevance
# scores. Left as the last expression so the notebook displays the result.
query_text = "blue bird"
loaded_vectorstore.similarity_search_with_relevance_scores(query_text, k=3)

[(Document(metadata={'binomialName': 'Alcedo atthis', 'birdName': 'Common Kingfisher', 'macaulayID': '26854431', 'url': 'https://ebird.org/species/comkin1/JP-13'}, page_content='Common Kingfisher, also called Alcedo atthis, is Beautiful little blue-and-orange bird with a long, pointed bill. Often rather shy and inconspicuous despite bright plumage. Found along rivers, streams, lakes, and ponds—almost any fresh or brackish habitat with small fish. Often perches quietly in trees over water; most often seen in very fast low flight as a turquoise flash over the water, usually flying away. Easily detected once its high, shrill whistled call is learned, even if the bird itself is hidden. The only small blue kingfisher over much of its range.'),
  0.30331742763519287),
 (Document(metadata={'binomialName': 'Eurystomus orientalis', 'birdName': 'Dollarbird', 'macaulayID': '382216911', 'url': 'https://ebird.org/species/dollar1/JP-13'}, page_content='Dollarbird, also called Eurystomus orientalis, 

[0.6966825723648071,
 0.7173626858874733,
 0.7295412107078101]