<a href="https://colab.research.google.com/github/dzivkovi/LLM_RBAC_experiments/blob/main/08_RBAC_using_VectorDB_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RBAC using Vector DB filtering of tagged data (metadata)

Based on the following LlamaIndex metadata-filter examples (Chroma, Weaviate, Qdrant and Pinecone):

https://github.com/run-llama/llama_index/blob/main/docs/examples/vector_stores/chroma_metadata_filter.ipynb

https://github.com/run-llama/llama_index/blob/main/docs/examples/vector_stores/WeaviateIndex_metadata_filter.ipynb

https://github.com/run-llama/llama_index/blob/main/docs/examples/vector_stores/Qdrant_metadata_filter.ipynb

https://github.com/run-llama/llama_index/blob/main/docs/examples/vector_stores/pinecone_metadata_filter.ipynb


In [1]:
!pip install llama-index



In [2]:
!pip install chromadb



## Creating a Chroma Index

In [3]:
import logging
import sys

# Send INFO-level (and above) log records to stdout so they appear in the cell output.
# Exactly one handler is installed: basicConfig(stream=...) already attaches a stdout
# StreamHandler to the root logger, so adding a second one by hand would print every
# record twice. force=True replaces whatever handlers the root logger already has;
# without it basicConfig is a no-op on an already-configured logger, and re-running
# this cell would otherwise keep stacking handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [4]:
import os

import openai

try:
    # `google.colab` is only importable inside Google Colab, where the key is read
    # from the notebook's user secrets.
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    openai.api_key = userdata.get('OPENAI_API_KEY')
else:
    # Outside Colab, fall back to the OPENAI_API_KEY environment variable so the
    # notebook still runs on a local Jupyter kernel.
    _env_api_key = os.environ.get('OPENAI_API_KEY')
    if not _env_api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set: add it to Colab secrets or export it "
            "as an environment variable before running this notebook."
        )
    openai.api_key = _env_api_key

In [5]:
import chromadb

In [6]:
# Ephemeral client: the collection is not persisted by this notebook.
chroma_client = chromadb.EphemeralClient()

# get_or_create_collection (instead of create_collection) keeps this cell re-runnable:
# create_collection raises if a collection with this name already exists, which is
# what happens when the cell is executed a second time in the same kernel.
chroma_collection = chroma_client.get_or_create_collection("collection-filtering")

In [7]:
from llama_index import VectorStoreIndex
from llama_index.vector_stores import ChromaVectorStore
from IPython.display import Markdown, display

## Mock NDL (Nasdaq Data Link) data catalog to be indexed

In [8]:
from llama_index.schema import TextNode

# Metadata keys attached to every node, in the order they are stored.
_METADATA_KEYS = ("symbol", "type", "coverage", "role", "year")

# Mock catalog, one row per data set: (text, symbol, type, coverage, role, year).
# `role` is the access tier ("vip", "free" or "paid") that the metadata filters in
# the query cells match against.
_CATALOG_ROWS = [
    # VIP Datasets
    ("Real-Time and Historical Market Data (Customized)", "FOO/FOO", "Market Data", "Global Stock Exchanges, Commodities, Forex", "vip", 2024),
    ("Exclusive Alternative Data Sets (Customized)", "BAR/BAR", "Alternative Data", "Consumer Behavior, Satellite Imagery, Social Media", "vip", 2023),
    # Free Datasets
    ("Stock Prices", "WIKI/PRICES", "Stock Data", "Historical Stock Prices", "free", 2023),
    ("Economic Indicators", "FRED/GDP", "Economic Data", "U.S. Gross Domestic Product", "free", 2022),
    ("Foreign Exchange Rates", "CURRFX/USDJPY", "Forex Data", "USD/JPY Exchange Rates", "free", 2024),
    ("Commodities Prices", "CHRIS/CME_GC1", "Commodity Data", "Gold Futures Prices", "free", 2023),
    ("Cryptocurrency Prices", "BCHAIN/ABTC", "Cryptocurrency Data", "Bitcoin Blockchain Statistics", "free", 2022),
    # Paid Datasets
    ("Core Financial Data", "SHARADAR/SF1", "Financial Data", "Core US Fundamentals", "paid", 2024),
    ("Real-Time Market Data", "XNAS/TLT", "Market Data", "Real-Time Trade Data", "paid", 2023),
    ("Global Economic Data", "WB/WDI", "Economic Data", "World Development Indicators", "paid", 2022),
    ("Alternative Data", "NSR/SENTIMENT", "Alternative Data", "News Sentiment", "paid", 2024),
    ("Fixed Income Data", "SIFMA/USABOND", "Bond Market Data", "U.S. Bond Market Trading Volume", "paid", 2023),
    ("Commodity Prices", "LBMA/GOLD", "Commodity Data", "Historical Gold Prices", "paid", 2022),
    ("FX Rates", "CURRFX/USDEUR", "Forex Data", "USD to EUR Exchange Rates", "paid", 2024),
    ("Fundamentals Data", "SHARADAR/DAILY", "Financial Data", "Daily US Stock Market Fundamentals", "paid", 2023),
]

# One TextNode per catalog row; the remaining row fields are zipped onto the
# metadata keys so each node carries symbol/type/coverage/role/year.
nodes = [
    TextNode(text=text, metadata=dict(zip(_METADATA_KEYS, fields)))
    for text, *fields in _CATALOG_ROWS
]

In [9]:
from llama_index.storage.storage_context import StorageContext

# Wrap the Chroma collection as a LlamaIndex vector store ...
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# ... and build the storage context that is handed to VectorStoreIndex.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [10]:
# Build the vector index over the mock catalog nodes, using the Chroma-backed
# storage context.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)

## One Exact Match Filter

In [11]:
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# "free" tier: only nodes whose `role` metadata is exactly "free" are eligible.
filters = MetadataFilters(filters=[ExactMatchFilter(key="role", value="free")])
query_engine = index.as_query_engine(filters=filters)

# Ask the same two questions used for every role so the answers can be compared.
for question in (
    "Historical data sets and their symbols",
    "alternative coverage symbols?",
):
    response = query_engine.query(question)
    display(Markdown(f"<b>{response}</b>"))


<b>The historical data set symbol is WIKI/PRICES.</b>

<b>There is no information provided about alternative coverage symbols in the given context.</b>

In [12]:
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# "paid" tier: only nodes whose `role` metadata is exactly "paid" are eligible.
filters = MetadataFilters(filters=[ExactMatchFilter(key="role", value="paid")])
query_engine = index.as_query_engine(filters=filters)

# Ask the same two questions used for every role so the answers can be compared.
for question in (
    "Historical data sets and their symbols",
    "alternative coverage symbols?",
):
    response = query_engine.query(question)
    display(Markdown(f"<b>{response}</b>"))


<b>The historical data set with the symbol "LBMA/GOLD" is related to commodity prices.</b>

<b>NSR/SENTIMENT</b>

In [13]:
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# "vip" tier: only nodes whose `role` metadata is exactly "vip" are eligible.
filters = MetadataFilters(filters=[ExactMatchFilter(key="role", value="vip")])
query_engine = index.as_query_engine(filters=filters)

# Ask the same two questions used for every role so the answers can be compared.
for question in (
    "Historical data sets and their symbols",
    "alternative coverage symbols?",
):
    response = query_engine.query(question)
    display(Markdown(f"<b>{response}</b>"))

<b>The historical data sets and their symbols are not provided in the given context information.</b>

<b>BAR/BAR</b>

## Multiple Filters

### VIP client has access to all Paid data sets, plus more.

In [14]:
from llama_index.vector_stores.types import (
    ExactMatchFilter,
    FilterCondition,
    MetadataFilters,
)

# A node is eligible when its `role` metadata is "vip" OR "paid": one exact-match
# filter per role, combined with the OR condition.
filters = MetadataFilters(
    filters=[ExactMatchFilter(key="role", value=role) for role in ("vip", "paid")],
    condition=FilterCondition.OR,
)

query_engine = index.as_query_engine(filters=filters)

# Ask the same two questions used in the single-role cells.
for question in (
    "Historical data sets and their symbols",
    "alternative coverage symbols?",
):
    response = query_engine.query(question)
    display(Markdown(f"<b>{response}</b>"))

<b>The historical data sets and their symbols are not provided in the given context information.</b>

<b>The alternative coverage symbols mentioned in the context are BAR/BAR and NSR/SENTIMENT.</b>