# In [None]:
import hashlib
import json

import chromadb
from chromadb.config import Settings
from duckduckgo_search import DDGS

# In [None]:
# Metadata template shared by every indexed result
def build_metadata(name: str, synonyms=None, tags=None, primary_url: str = "", urls=None):
    """
    Assemble the metadata record stored alongside one result.

    Args:
        name: Value stored under "name".
        synonyms: Value stored under "synonyms"; a new empty list is used
            when the argument is missing or falsy.
        tags: Value stored under "tags"; same fallback as ``synonyms``.
        primary_url: Value stored under "primary_url".
        urls: Value stored under "urls"; same fallback as ``synonyms``.

    Returns:
        A dict with the keys "name", "synonyms", "tags", "primary_url"
        and "urls", in that order.
    """
    record = {"name": name}
    # A fresh list per call keeps separate records from sharing one default.
    record["synonyms"] = synonyms if synonyms else []
    record["tags"] = tags if tags else []
    record["primary_url"] = primary_url
    record["urls"] = urls if urls else []
    return record

# Initialize ChromaDB client
def init_chroma(db_dir: str = None):
    """
    Create a ChromaDB client.

    Args:
        db_dir: Directory for the on-disk database. When it is None or
            empty, an in-memory client is created instead.

    Returns:
        ``chromadb.PersistentClient(path=db_dir)`` when a directory is
        given, otherwise ``chromadb.EphemeralClient()``.
    """
    # ``db_dir`` defaults to None, and None is not a usable path for
    # PersistentClient, so "no directory" is mapped to an in-memory client
    # rather than forwarded.
    if not db_dir:
        return chromadb.EphemeralClient()
    return chromadb.PersistentClient(path=db_dir)

def _result_id(collection_name: str, key: str) -> str:
    """Return a document ID derived from the collection name and a content key."""
    digest = hashlib.sha256(key.encode("utf-8")).hexdigest()[:16]
    return f"{collection_name}_{digest}"


def _to_chroma_metadata(metadata: dict) -> dict:
    """Return a copy of *metadata* with container values JSON-encoded as strings."""
    # NOTE(review): Chroma's metadata validation has accepted only scalar
    # values (str/int/float/bool), so list fields are stored as JSON text.
    # Confirm against the installed chromadb version before relying on
    # native list metadata.
    return {
        key: json.dumps(value) if isinstance(value, (list, tuple, dict)) else value
        for key, value in metadata.items()
    }


# Main pipeline
def add_data(
    query: str,
    collection_name: str,
    db_dir: str = None,
    max_results: int = 10
):
    """
    Search DuckDuckGo and index results into ChromaDB.

    Each result's snippet is stored as the document text. Its title, URL and
    the query (as a tag) are stored as metadata; list-valued metadata fields
    are JSON-encoded strings. Document IDs are derived from the result URL
    (or from title and snippet when the URL is empty), so indexing the same
    page again updates its entry instead of colliding with it.

    Args:
        query: Search query string.
        collection_name: Name of the ChromaDB collection.
        db_dir: Directory to persist the ChromaDB database.
        max_results: Maximum number of search results to retrieve.

    Returns:
        None. A summary line is printed.
    """
    # Initialize clients
    client = init_chroma(db_dir)
    collection = client.get_or_create_collection(
        name=collection_name,
        configuration={
            "hnsw": {
                "space": "cosine",  # rank neighbours by cosine distance
                "ef_search": 100,
                "ef_construction": 100,
                "max_neighbors": 16,
                "num_threads": 4
            }
        }
    )

    # Perform search. ``max_results`` must be passed by keyword: the second
    # positional parameter of ``DDGS.text`` is the region, not the limit.
    results = DDGS().text(query, max_results=max_results) or []

    documents = []
    metadatas = []
    ids = []
    seen_ids = set()

    for res in results:
        title = res.get("title", "")
        snippet = res.get("body", "")  # 'body' is the snippet/document text
        url = res.get("href", "")

        # A content-derived ID stays unique across different queries; a
        # positional ID would be reused by every call on this collection.
        doc_id = _result_id(collection_name, url or f"{title}\n{snippet}")
        if doc_id in seen_ids:
            # The same page was returned twice; keep the first occurrence
            # so the batch contains no duplicate IDs.
            continue
        seen_ids.add(doc_id)

        # Build metadata
        metadata = build_metadata(
            name=title,
            synonyms=[],  # add synonyms if available
            tags=[query],  # you can customize tags
            primary_url=url,
            urls=[url]
        )

        # Collect for batch upsert
        ids.append(doc_id)
        documents.append(snippet)
        metadatas.append(_to_chroma_metadata(metadata))

    # Index into Chroma; skipped when the search returned nothing so that
    # no empty batch is submitted.
    if ids:
        collection.upsert(
            ids=ids,
            documents=documents,
            metadatas=metadatas
        )

    # No explicit persist step: the client created by init_chroma has no
    # ``persist()`` method, and calling it raised AttributeError.

    print(f"Indexed {len(documents)} documents into collection '{collection_name}'.")

def search_data(
    query: str,
    collection_name: str,
    db_dir: str = None,
    n_results: int = 5
):
    """
    Look up the entries of a ChromaDB collection closest to a text query.

    Args:
        query: Text to search for; sent as the only query text.
        collection_name: Name of the collection to open. It is fetched
            with ``get_collection``, not created.
        db_dir: Database directory handed to ``init_chroma``.
        n_results: Number of results requested from the collection.

    Returns:
        The value returned by the collection's ``query`` call.
    """
    store = init_chroma(db_dir)
    target = store.get_collection(name=collection_name)

    # Text query only; the collection turns it into an embedding itself.
    return target.query(query_texts=[query], n_results=n_results)

# In [None]:
# Demo: index a handful of web results, then query the same collection.
_demo_store = {"collection_name": "gpt4_search", "db_dir": "./chroma_db"}

add_data(query="openai GPT-4", max_results=5, **_demo_store)

results = search_data(query="gpt-4", n_results=5, **_demo_store)

print(results)