In [1]:
import os
import re
import warnings

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_chroma import Chroma
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

warnings.filterwarnings("ignore")

In [2]:
# Load environment variables from a local .env file
# (presumably the Google API key used by the models below -- confirm).
load_dotenv()

True

![Diagram](./images/Metadata_Filter.png)

# Enhancing RAG with Metadata

In the first part of this tutorial, we built a basic RAG pipeline. Now, we'll take it a step further by incorporating metadata.

__What is metadata?__ It's data about your data. For a webpage, this could include the author, publication date, title, or even a summary. By extracting and indexing this metadata alongside the document content, we can perform more targeted and efficient retrievals.


Let's dive in and make our RAG pipeline smarter! 


# Data Extraction

The first step is to get our data, but this time, we'll be more deliberate about what information we extract. Instead of just grabbing all the text, we'll create custom functions to pull out specific pieces of metadata and the main article content.

We'll define two functions:

1. `custom_metadata_extractor`: This function will parse the HTML to find the page title, description, language, and any associated "subjects" or tags. It uses BeautifulSoup to navigate the HTML structure and find the relevant elements. To keep filtering simple, only the first subject is kept (lowercased), and pages without one are labelled `other`.

2. `article_extractor`: This function's job is to find the main article content of the page, which is usually contained within an `<article>` tag.

Using these specific extractors ensures we get clean, relevant data for both content and metadata, which will be crucial for filtering later on.

In [3]:
def custom_metadata_extractor(raw_html: str, url: str) -> dict:
    """
    Builds the metadata dictionary for one crawled page.

    Args:
        raw_html: The raw HTML content of the page.
        url: The URL of the page.

    Returns:
        A dictionary with these keys:
          - "source": the page URL (always present).
          - "title": text of the <title> tag with the backslash + "Anthropic"
            part removed (only if the page has a <title>).
          - "description": the `content` attribute of the description <meta>
            tag, or None if the tag has no such attribute (only if the tag
            exists).
          - "language": the `lang` attribute of <html>, or None if missing
            (only if an <html> tag exists).
          - "subjects": a single lowercase string -- the first subject listed
            on the page, or "other" when none is found (always present).
    """
    page = BeautifulSoup(raw_html, "html.parser")
    extracted = {"source": url}

    title_tag = page.find("title")
    if title_tag:
        # The pattern matches a literal backslash, optional whitespace, then "Anthropic".
        title_text = re.sub(r'\\\s*Anthropic\s*', '', title_tag.get_text())
        extracted["title"] = title_text.strip()

    description_tag = page.find("meta", attrs={"name": "description"})
    if description_tag:
        extracted["description"] = description_tag.get("content", None)

    html_tag = page.find("html")
    if html_tag:
        extracted["language"] = html_tag.get("lang", None)

    # Collect the subject labels shown on the post, if the page has any.
    subject_names = []
    container = page.find('div', class_='PostDetail_post-detail-types-subjects__rYglE')
    if container:
        subject_names = [
            span.get_text()
            for span in container.find_all('span', class_='PostDetail_post-subject__Kpz7U')
        ]
    if not subject_names:
        subject_names = ['Other']

    # Only the first subject is kept (lowercased) so the value is a plain string.
    extracted["subjects"] = subject_names[0].lower()

    return extracted



def article_extractor(html: str, separator: str = "") -> str:
    """
    Extracts the main article text from the given HTML.

    Args:
        html: The HTML content as a string.
        separator: String inserted between the text of adjacent HTML elements
            when they are joined. The default ("") keeps the original
            behaviour, which runs neighbouring elements together (for example
            "Introducing Claude 4Press inquiries"); pass a space or a newline
            to keep them apart.

    Returns:
        The text of the first <article> element, with runs of two or more
        newlines collapsed into a single blank line and surrounding whitespace
        stripped. An empty string if the page has no <article> element.
    """
    soup = BeautifulSoup(html, "lxml")

    article_tag = soup.find('article')
    if not article_tag:
        return ''

    # Collapse runs of blank lines so each gap is a single empty line.
    return re.sub(r"\n\n+", "\n\n", article_tag.get_text(separator=separator)).strip()


# Crawl the newsroom with LangChain's RecursiveUrlLoader again, this time
# plugging in our custom functions: `extractor` produces each page's content
# and `metadata_extractor` produces its metadata.

loader = RecursiveUrlLoader(
    "https://www.anthropic.com/news",
    max_depth=2,
    extractor=article_extractor,
    metadata_extractor=custom_metadata_extractor,
)

docs = loader.load()

In [4]:
len(docs) 

142

In [5]:
# Inspect the metadata extracted for one of the crawled pages.
docs[10].metadata   

{'source': 'https://www.anthropic.com/news/lyft-announcement',
 'title': 'Lyft to bring Claude to more than 40 million riders and over 1 million drivers',
 'description': "Anthropic is an AI safety and research company that's working to build reliable, interpretable, and steerable AI systems.",
 'language': 'en',
 'subjects': 'announcements'}

In [6]:
def wrap_text(text, width=80):
    """Hard-wrap `text` for display.

    The text is cut every `width` characters, regardless of word boundaries
    or newlines already present, and the pieces are joined with newlines.
    """
    segments = []
    for start in range(0, len(text), width):
        segments.append(text[start:start + width])
    return '\n'.join(segments)

# Preview only the first few pages: printing every crawled document floods
# the notebook output and buries the narrative.
for doc in docs[:3]:
    print(wrap_text(doc.page_content))
    print("-"*100)

NewsroomAnnouncementsIntroducing Claude 4Press inquiries press@anthropic.comNon-
media inquiries support.anthropic.comMedia assets Download press kitFollow Anthr
opic FeaturedAnthropic raises Series E at $61.5B post-money valuationFeaturedInt
roducing the Anthropic Economic IndexNewsNo results found.Case StudyHow Anthropi
c teams use Claude CodeJul 24, 2025PolicyThoughts on America’s AI Action PlanJul
 23, 2025AnnouncementsAnthropic partners with the University of Chicago’s Becker
 Friedman Institute on AI economic researchJul 23, 2025PolicyBuild AI in America
Jul 21, 2025PolicyAnthropic to sign the EU Code of PracticeJul 21, 2025Announcem
entsPaul Smith to join Anthropic as Chief Commercial OfficerJul 15, 2025ProductC
laude for Financial ServicesJul 15, 2025AlignmentInvesting in energy to secure A
merica's AI future Jul 15, 2025ProductDiscover tools that work with Claude Jul 1
4, 2025AnnouncementsAnthropic and the Department of Defense to advance responsib
le AI in defense operationsJ

# Chunk the Data

Just like in the first tutorial, we need to split our documents into smaller chunks for effective processing by the LLM. We'll use the same RecursiveCharacterTextSplitter with a 1000-character chunk size and a 200-character overlap.

In [7]:
# Split every loaded page into overlapping character-based chunks.

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

splits = text_splitter.split_documents(docs)  # each chunk carries its page's metadata plus `start_index` (see the metadata printed below)

In [8]:
# Preview only the first few chunks: printing every chunk floods the
# notebook output.
for split in splits[:3]:
    print(wrap_text(split.page_content))
    print("-"*100)

NewsroomAnnouncementsIntroducing Claude 4Press inquiries press@anthropic.comNon-
media inquiries support.anthropic.comMedia assets Download press kitFollow Anthr
opic FeaturedAnthropic raises Series E at $61.5B post-money valuationFeaturedInt
roducing the Anthropic Economic IndexNewsNo results found.Case StudyHow Anthropi
c teams use Claude CodeJul 24, 2025PolicyThoughts on America’s AI Action PlanJul
 23, 2025AnnouncementsAnthropic partners with the University of Chicago’s Becker
 Friedman Institute on AI economic researchJul 23, 2025PolicyBuild AI in America
Jul 21, 2025PolicyAnthropic to sign the EU Code of PracticeJul 21, 2025Announcem
entsPaul Smith to join Anthropic as Chief Commercial OfficerJul 15, 2025ProductC
laude for Financial ServicesJul 15, 2025AlignmentInvesting in energy to secure A
merica's AI future Jul 15, 2025ProductDiscover tools that work with Claude Jul 1
4, 2025AnnouncementsAnthropic and the Department of Defense to advance responsib
le AI in defense operationsJ

In [9]:
splits[2].metadata

{'source': 'https://www.anthropic.com/news',
 'title': 'Newsroom',
 'description': "Anthropic is an AI safety and research company that's working to build reliable, interpretable, and steerable AI systems.",
 'language': 'en',
 'subjects': 'other',
 'start_index': 1585}

# Index

Next, we'll create our vector store and index the document chunks. We're using ChromaDB again with Google's embeddings. 

In [10]:
#Indexing

#define the embeddings model
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

#define the vector store
vector_store = Chroma(
    collection_name="2_metadata_filtering_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",  # Where to save data locally
)

# The collection is persisted on disk. Without explicit ids every run of this
# cell stores all chunks again under fresh random ids, so retrieval starts
# returning duplicates. Give each chunk a deterministic id
# ("<source url>#chunk-<n>", n = position of the chunk within its page) so
# that re-running the cell overwrites the existing entries instead.
# To rebuild from scratch (e.g. to drop entries left by earlier runs that
# used random ids), call vector_store.delete_collection() and re-create the store.
chunk_counts = {}
chunk_ids = []
for split in splits:
    source = split.metadata["source"]
    ordinal = chunk_counts.get(source, 0)
    chunk_counts[source] = ordinal + 1
    chunk_ids.append(f"{source}#chunk-{ordinal}")

#add the chunks in to db
document_ids = vector_store.add_documents(documents=splits, ids=chunk_ids)

# Retrieval & Generation

In [11]:

# Configure the chat model used to generate answers.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")  

# Prompt for the generation step: `{context}` receives the retrieved chunks
# and `{question}` the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

rag_prompt_template = PromptTemplate.from_template(template)

In [12]:
def generate_answer(user_question, k=5):
    """
    Answers a question with the unfiltered RAG pipeline.

    Retrieves the `k` chunks most similar to the question from the global
    `vector_store`, fills `rag_prompt_template` with them, and asks the
    global `llm` for an answer.

    Args:
        user_question: The question to answer.
        k: Number of chunks to retrieve as context (defaults to 5).

    Returns:
        A tuple `(retrieved_docs, answer)`: the retrieved chunks and the
        content of the LLM response.
    """
    #retrieve the relevant docs
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={"k": k})
    retrieved_docs = retriever.invoke(user_question)

    #generate
    # The chunks are separated by a blank line inside the prompt's context slot.
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = rag_prompt_template.invoke({"question": user_question, "context": docs_content})
    response = llm.invoke(prompt)

    return retrieved_docs, response.content

user_question = "What is Anthropic?"
retrieved_docs, answer = generate_answer(user_question)

In [13]:
print(answer)

Based on the provided text, Anthropic is an AI company that develops next-generation AI systems, such as Claude Code and Claude 3.7 Sonnet.  They recently raised $3.5 billion in funding.  Thanks for asking!


In [14]:
for doc in retrieved_docs:
    print(doc.page_content)
    print("-"*100)

human know this when relevant…”NewsHow Anthropic teams use Claude CodeJul 24, 2025NewsThoughts on America’s AI Action PlanJul 23, 2025NewsAnthropic partners with the University of Chicago’s Becker Friedman Institute on AI economic researchJul 23, 2025
----------------------------------------------------------------------------------------------------
https://www.sciencedirect.com/...NewsHow Anthropic teams use Claude CodeJul 24, 2025NewsThoughts on America’s AI Action PlanJul 23, 2025NewsAnthropic partners with the University of Chicago’s Becker Friedman Institute on AI economic researchJul 23, 2025
----------------------------------------------------------------------------------------------------
AnnouncementsAnthropic raises Series E at $61.5B post-money valuationMar 3, 2025●1 min readAnthropic has raised $3.5 billion at a $61.5 billion post-money valuation. The round was led by Lightspeed Venture Partners, with participation from Bessemer Venture Partners, Cisco Investments, D1 Cap

# Retrieval with Metadata Filtering

This is where the magic happens! Now that our metadata is indexed, we can use it to create more powerful and precise retrievers.

In [15]:
len(docs) 

142

In [16]:
# Collect the distinct subject values found across all loaded pages.
subjects = {doc.metadata['subjects'] for doc in docs}
subjects

{'alignment',
 'announcements',
 'case study',
 'education',
 'event',
 'other',
 'policy',
 'product',
 'societal impacts'}

### Manual Metadata Filtering

You can manually create a filter to narrow down your search. ChromaDB supports a variety of filter operators. In this example, we're creating a filter to only search for documents where the subject is "announcements".

In [17]:
# Metadata filter: keep only chunks whose `subjects` value is in the given list.
subject_filter = {"subjects": {"$in": ['announcements']}}
# Similarity search for the top 5 chunks, restricted to chunks that pass the filter.
retriever = vector_store.as_retriever(search_kwargs={"k": 5, 'filter': subject_filter }, search_type='similarity')
retrieved_docs = retriever.invoke(user_question)

In [18]:
for doc in retrieved_docs:
    print(doc.metadata['subjects'])

announcements
announcements
announcements
announcements
announcements


Manually creating filters is great, but what if we could have the LLM create the filter for us based on the user's query? 

_PS: You can also use LangChain's Self-Query Retriever._

In [19]:
user_question = "What did Anthropic announce about Economic Futures Program?"

In [20]:
# Use the LLM with structured output to pick the best subject filter for the query.

class Subject(BaseModel):
    """Structured-output schema: the single article category chosen for a user query."""
    subject: str = Field(description="The category of the user query")

# NOTE: this rebinds the global `llm` with temperature=0, so the cells below
# also generate their answers with this instance.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Wrap the model so its reply is parsed into a `Subject` instance.
structured_llm = llm.with_structured_output(Subject)

subject_template = """Classify the user query into one of the following article categories:
{subjects}.

user query: {question}
"""

subject_prompt_template  = PromptTemplate.from_template(subject_template)

# `subjects` was built as a set, whose iteration order is arbitrary; sort it
# so the prompt lists the categories in the same order on every run.
subjects = sorted(subjects)

prompt = subject_prompt_template.invoke({"question": user_question, "subjects": ", ".join(subjects)})
subject_response = structured_llm.invoke(prompt)

print(subject_response)

subject='announcements'


In [21]:
def generate_filtered_answer(user_question, subject_filter, k=5):
    """
    Answers a question using only chunks that belong to one subject.

    Retrieves the `k` chunks most similar to the question from the global
    `vector_store`, restricted to chunks whose `subjects` metadata equals
    `subject_filter`, then fills `rag_prompt_template` with them and asks the
    global `llm` for an answer.

    Args:
        user_question: The question to answer.
        subject_filter: Subject name to filter on (case-insensitive).
        k: Number of chunks to retrieve as context (defaults to 5).

    Returns:
        A tuple `(retrieved_docs, answer)`: the retrieved chunks and the
        content of the LLM response.
    """
    #retrieve the relevant docs
    # Subjects are stored lowercased by custom_metadata_extractor, so lowercase
    # the requested one too; otherwise a value such as "Announcements" would
    # silently match nothing.
    subject_filter_chromadb = {"subjects": {"$in": [subject_filter.lower()]}}
    retriever = vector_store.as_retriever(search_kwargs={"k": k, 'filter': subject_filter_chromadb }, search_type='similarity')
    retrieved_docs = retriever.invoke(user_question)

    #generate
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = rag_prompt_template.invoke({"question": user_question, "context": docs_content})
    response = llm.invoke(prompt)

    return retrieved_docs, response.content

retrieved_docs, answer = generate_filtered_answer(user_question, subject_response.subject)

In [22]:
print(answer)

Anthropic announced the launch of its Economic Futures Program,  a new initiative focused on researching and developing policies to address AI's economic impacts.  The program will provide research grants, foster evidence-based policy, and promote collaboration. Thanks for asking!


In [23]:
for doc in retrieved_docs:
    print(doc.metadata['subjects'])
    print("-"*100)

announcements
----------------------------------------------------------------------------------------------------
announcements
----------------------------------------------------------------------------------------------------
announcements
----------------------------------------------------------------------------------------------------
announcements
----------------------------------------------------------------------------------------------------
announcements
----------------------------------------------------------------------------------------------------


In [24]:
for doc in retrieved_docs:
    print(doc.page_content)
    print("-"*100)

AnnouncementsPolicyIntroducing the Anthropic Economic Futures ProgramJun 27, 2025●4 min readToday, we're announcing the Anthropic Economic Futures Program, a new initiative to support research and policy development focused on addressing AI’s economic impacts. We’re launching this initiative to understand how AI is reshaping the way we work and surface proposals on how to prepare for this shift. This program will serve as an extension of Anthropic’s Economic Index and its insights on AI usage across the workforce. Our goal for this program is to contribute to the development of new research and potential responses to the impacts of AI on the labor market and global economy. To accomplish this, the Economic Futures Program will focus on three pillars:Research Grants: Providing grants, API credits, and partnerships to enable researchers to investigate AI's effects on labor, productivity, and value creation;Evidence-Based Policy: Creating forums for researchers, policymakers, and
--------