In [12]:
import os
import json
import chromadb
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

In [13]:
# Initialize ChromaDB (Persistent)
chroma_client = chromadb.PersistentClient(path="./chroma")
collection = chroma_client.get_or_create_collection(name="tool_json_documents")

# Load local embedding models
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Directory containing JSON files
json_dir = "../tools"

# Function to check if a file already exists in ChromaDB
def file_exists_in_db(filename):
    results = collection.get(ids=[filename])
    return len(results["ids"]) > 0  # If ID exists, return True

# Process and add/update JSON files in the vector database
for filename in tqdm(os.listdir(json_dir)):
    if filename.endswith(".json"):
        file_path = os.path.join(json_dir, filename)

        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)  # Load JSON content

        # Convert JSON into a searchable text format
        text_data = json.dumps(data, indent=2)  # Convert to string
        embedding = embedding_model.encode(text_data, convert_to_numpy=True).tolist()  # Generate embedding

        if file_exists_in_db(filename):
            # Update existing entry
            collection.update(
                ids=[filename],  # Keep the same ID
                embeddings=[embedding],
                metadatas=[{"filename": filename, "path": file_path}],
                documents=[text_data]
            )
            print(f"🔄 Updated existing file in ChromaDB: {filename}")
        else:
            # Add new entry
            collection.add(
                ids=[filename],
                embeddings=[embedding],
                metadatas=[{"filename": filename, "path": file_path}],
                documents=[text_data]
            )
            print(f"✅ Added new file to ChromaDB: {filename}")

print("🎉 All JSON files have been processed!")


100%|██████████| 3/3 [00:00<00:00, 58.17it/s]

✅ Added new file to ChromaDB: set_alarm.json
✅ Added new file to ChromaDB: turn_on_lights.json
✅ Added new file to ChromaDB: fetch_weather.json
🎉 All JSON files have been processed!





In [14]:
# Query Example
query = "Find documents about machine learning"
query_embedding = embedding_model.encode(query, convert_to_numpy=True).tolist()

results = collection.query(query_embeddings=[query_embedding], n_results=5)

# Print the top results
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"🔍 Retrieved Document from {meta['filename']}:\n", doc[:500], "\n")

Number of requested results 5 is greater than number of elements in index 3, updating n_results = 3


🔍 Retrieved Document from turn_on_lights.json:
 {
  "examples": [
    "Can you turn on the ROOM light?",
    "Turn on the light in the ROOM?",
    "turn on the ROOM light?"
  ],
  "tool": {
    "name": "turn_on_light",
    "description": "Send an HTTP request to turn on a smart lightbulb in a specific room.",
    "parameters": {
      "type": "dict",
      "required": [
        "room"
      ],
      "properties": {
        "room": {
          "type": "string",
          "description": "The room where the light is located. Must be one of: livi 

🔍 Retrieved Document from fetch_weather.json:
 {
  "examples": [
    "Can you get the weather for CITY STATE?",
    "Get the weather for CITY STATE?",
    "What is the weather for CITY STATE?"
  ],
  "tool": {
    "name": "fetch_weather",
    "description": "Fetch the current weather for a given location.",
    "parameters": {
      "type": "dict",
      "required": [
        "location"
      ],
      "properties": {
        "location": {
      