In [3]:
import chromadb
import random

# Location of the on-disk Chroma database inspected by this cell.
persist_directory = "./data/chroma_db"  # <--- SET THIS

client = chromadb.PersistentClient(path=persist_directory)
print(f"Connected directly to ChromaDB at: {persist_directory}")

#client.delete_collection("cards_collection_googlae")  # <--- DELETE THIS COLLECTION
print("\nListing all collections found in the database:")
collections = client.list_collections()
if not collections:
    print("-> No collections found.")
else:
    print(f"-> Found {len(collections)} collection(s):")
    # Iterate through the Collection objects
    for collection in collections:
        # Call the .count() method on each collection object
        item_count = collection.count()
        print(f"  - Name: {collection.name} - Items: {item_count}")

        # Show up to 2 random samples if the collection has items
        if item_count > 0:
            # No ids/limit are passed, so this loads every item's metadata and document.
            all_items = collection.get(include=["metadatas", "documents"])
            available = len(all_items["metadatas"])
            # random.sample raises ValueError when asked for more elements than
            # the population holds, so clamp the sample size for 1-item collections.
            random_samples = random.sample(range(available), min(2, available))

            print("    Random Samples:")
            for sample in random_samples:
                print(f"        Metadata: {all_items['metadatas'][sample]}")
                print(f"      - Document: {all_items['documents'][sample]}")
        else:
            print("    No items found in this collection.")

Connected directly to ChromaDB at: ./data/chroma_db

Listing all collections found in the database:
-> Found 1 collection(s):
  - Name: cards_collection_google - Items: 0
    No items found in this collection.


In [7]:
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings 
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
import os
from langchain_core.output_parsers import StrOutputParser

from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# Fail fast when no Google API key is exported (unset or empty).
if not os.environ.get("GOOGLE_API_KEY", ""):
    raise ValueError("GOOGLE_API_KEY not found in environment variables or gcloud ADC.")

# Card Retriever (Self-Query) schema: (attribute name, description) pairs.
# Every attribute is declared with type "string".
_CARD_ATTRIBUTES = (
    ("name", "The name of the Magic: The Gathering card"),
    ("type", "The type line of the card (e.g., 'Artifact', 'Instant', 'Creature — Elf Druid')"),
    ("set", "The Magic: The Gathering set the card is from"),
    ("power", "The power of the creature card"),
    ("toughness", "The toughness of the creature card"),
    ("mana_cost", "The mana cost symbols of the card"),
)
card_metadata_field_info = [
    AttributeInfo(name=attr_name, description=attr_description, type="string")
    for attr_name, attr_description in _CARD_ATTRIBUTES
]

# Chat model shared by the self-query retriever and the answering chain;
# temperature=0 is requested for the model.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0)

# Open the persisted Chroma collection with the Google embedding model.
card_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = Chroma(
    persist_directory="./data/chroma_db",
    collection_name="cards_collection_google",
    embedding_function=card_embeddings,
)
# Short description of what each stored document contains.
card_document_content_description = "Text content describing a Magic: The Gathering card, including its abilities and stats."

# Query-constructor prompt text.
# The backslashes in the schema section are written as `\\` so the string holds
# a literal backslash without relying on an invalid `\ ` escape sequence (which
# triggers a warning on current Python versions). The resulting text is unchanged.
# NOTE(review): this text contains no `{query}` placeholder and its literal JSON
# braces are not doubled for f-string style templating — confirm it is intended
# before using it as a prompt template.
template = """Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

```json
{
    "query": string \\ text string to compare to document contents
    "filter": string \\ logical condition statement for filtering documents
}
```

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: `comp(attr, val)`:
- `comp` (eq | ne | gt | gte | lt | lte): comparator
- `attr` (string):  name of attribute to apply the comparison to
- `val` (string): is the comparison value

A logical operation statement takes the form `op(statement1, statement2, ...)`:
- `op` (and | or): logical operator
- `statement1`, `statement2`, ... (comparison statements or logical operation statements): one or more statements to apply the operation to

Make sure that you only use the comparators and logical operators listed above and no others.
Make sure that filters only refer to attributes that exist in the data source.
Make sure that filters only use the attributed names with its function names if there are functions applied on them.
Make sure that filters only use format `YYYY-MM-DD` when handling date data typed values.
Make sure that filters take into account the descriptions of attributes and only make comparisons that are feasible given the type of data being stored.
Make sure that filters are only used as needed. If there are no filters that should be applied return "NO_FILTER" for the filter value.


Structured Request:
```json
{
    "query": "teenager love",
    "filter": "and(or(eq(\"artist\", \"Taylor Swift\"), eq(\"artist\", \"Katy Perry\")), lt(\"length\", 180), eq(\"genre\", \"pop\"))"
}
```
<< Example >>
Data Source:
```json
{
    "content": "Text content describing a Magic: The Gathering card, including its abilities and stats.",
    "attributes": {
    "name": {
        "description": "The name of the Magic: The Gathering card",
        "type": "string"
    },
    "type": {
        "description": "The type line of the card (e.g., 'Artifact', 'Instant', 'Creature \u2014 Elf Druid')",
        "type": "string"
    },
    "set": {
        "description": "The Magic: The Gathering set the card is from",
        "type": "string"
    },
    "power": {
        "description": "The power of the creature card",
        "type": "string"
    },
    "toughness": {
        "description": "The toughness of the creature card",
        "type": "string"
    },
    "mana_cost": {
        "description": "The mana cost symbols of the card",
        "type": "string"
    }
}
}
```

User Query:"""


# Prompt object built from the custom template text; kept available for inspection.
card_retriever_prompt = ChatPromptTemplate.from_template(template)

# Build the self-query retriever: the LLM turns a natural-language question into
# a semantic query plus a metadata filter over `card_metadata_field_info`.
# `SelfQueryRetriever.from_llm` has no `prompt` parameter, so a `prompt=` keyword
# never reaches the query constructor; it is therefore not passed here. To
# customise the query-constructor prompt, use `chain_kwargs` (for example
# `schema_prompt` or `examples`).
card_retriever = SelfQueryRetriever.from_llm(
    llm,
    vector_store,
    card_document_content_description,
    card_metadata_field_info,
    verbose=True,
)
# Sanity check that the expected collection was opened and is populated.
print(f"Vector store loaded. Collection '{vector_store._collection.name}' has approx {vector_store._collection.count()} items.")
def format_docs(docs):
    """Concatenate the text of retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, or an empty
        string when ``docs`` is empty.
    """
    separator = "\n\n"
    return separator.join(doc.page_content for doc in docs)

# Prompt for the answering step: the model is told to rely only on the
# retrieved context when answering the question.
template = (
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# "context" runs the retriever and flattens its documents to text;
# "question" forwards the chain input unchanged.
chain_inputs = {
    "context": card_retriever | format_docs,
    "question": RunnablePassthrough(),
}
chain = chain_inputs | prompt | llm | StrOutputParser()

chain.invoke("Find me a card that can deal 3 damage to target creature or player.")

Vector store loaded. Collection 'cards_collection_google' has approx 34504 items.


'Valakut Invoker can deal 3 damage to any target (which includes creatures and players) for eight generic mana.\n'