In [1]:
import os
import re
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
import chromadb

In [2]:
load_dotenv()

True

![Diagram](./images/Basic_Rag.png)

# Data Extraction 

The first step in any RAG pipeline is to get the data you want to work with. In our case, we'll be extracting the text content from the Anthropic news website.

This code block defines a function bs4_extractor that uses BeautifulSoup to parse the HTML content of a webpage and extract the text. The RecursiveUrlLoader then uses this function to load the content from the specified URL. We set max_depth=2 to limit how deep the scraper will go into the website's links.

In [3]:
def bs4_extractor(html: str) -> str:
    soup = BeautifulSoup(html, "lxml")
    return re.sub(r"\n\n+", "\n\n", soup.text).strip() #parse wbpage text

# ref: https://python.langchain.com/docs/integrations/document_loaders/
loader = RecursiveUrlLoader("https://www.anthropic.com/news", extractor=bs4_extractor, max_depth=2)

docs = loader.load()

In [4]:
len(docs) # Number of documents loaded from the Anthropic News site

142

In [5]:
# Let's inspect the metadata of one of the loaded documents. 
# The metadata provides useful information about the source of the document, such as the URL, title, and description.

# docs[1].page_content
docs[10].metadata   

{'source': 'https://www.anthropic.com/news/the-case-for-targeted-regulation',
 'content_type': 'text/html; charset=utf-8',
 'title': 'The case for targeted regulation \\ Anthropic',
 'description': 'Increasingly powerful AI systems have the potential to accelerate scientific progress, unlock new medical treatments, and grow the economy. But along with the remarkable new capabilities of these AIs come significant risks. Governments should urgently take action on AI policy in the next eighteen months. The window for proactive risk prevention is closing fast.',
 'language': 'en'}

In [6]:
# To get a better sense of the content we've loaded, 
# this code block defines a helper function wrap_text to format the text and then prints the content of each document.

def wrap_text(text, width=80):
    return '\n'.join([text[i:i+width] for i in range(0, len(text), width)]) 

for doc in docs:
    print(wrap_text(doc.page_content))
    print("-"*100)

Newsroom \ AnthropicSkip to main contentSkip to footerClaudeAPISolutionsResearch
CommitmentsLearnNewsTry ClaudeNewsroomAnnouncementsIntroducing Claude 4Press inq
uiries press@anthropic.comNon-media inquiries support.anthropic.comMedia assets 
Download press kitFollow Anthropic FeaturedAnthropic raises Series E at $61.5B p
ost-money valuationFeaturedIntroducing the Anthropic Economic IndexNewsNo result
s found.Case StudyHow Anthropic teams use Claude CodeJul 24, 2025PolicyThoughts 
on America’s AI Action PlanJul 23, 2025AnnouncementsAnthropic partners with the 
University of Chicago’s Becker Friedman Institute on AI economic researchJul 23,
 2025PolicyBuild AI in AmericaJul 21, 2025PolicyAnthropic to sign the EU Code of
 PracticeJul 21, 2025AnnouncementsPaul Smith to join Anthropic as Chief Commerci
al OfficerJul 15, 2025ProductClaude for Financial ServicesJul 15, 2025AlignmentI
nvesting in energy to secure America's AI future Jul 15, 2025ProductDiscover too
ls that work with Claude Jul

# Chunk the data

Now that we have our documents, the next step is to split them into smaller chunks. This is important for a few reasons:

__Vector search efficiency__: Smaller chunks are easier to search and retrieve.

__Context window limitations__: LLMs have a limited context window, so we need to make sure the retrieved information fits within that window.

__Relevance__: Smaller chunks are more likely to be focused on a specific topic, which improves the relevance of the retrieved information.

We'll use the `RecursiveCharacterTextSplitter` to split our documents. This splitter tries to split text on a series of characters (like newlines, spaces, etc.) in a recursive manner.

- `chunk_size=1000`: This sets the maximum size of each chunk to 1000 characters.

- `chunk_overlap=200`: This creates an overlap of 200 characters between consecutive chunks. This helps to ensure that we don't lose any important context at the boundaries of our chunks.

- `add_start_index=True`: This will add the starting index of the chunk in the original document to the metadata.

In [7]:
# chunk the data

# ref: https://python.langchain.com/docs/concepts/text_splitters/
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

splits = text_splitter.split_documents(docs) 

In [8]:
for split in splits:
    print(wrap_text(split.page_content))
    print("-"*100)

Newsroom \ AnthropicSkip to main contentSkip to footerClaudeAPISolutionsResearch
CommitmentsLearnNewsTry ClaudeNewsroomAnnouncementsIntroducing Claude 4Press inq
uiries press@anthropic.comNon-media inquiries support.anthropic.comMedia assets 
Download press kitFollow Anthropic FeaturedAnthropic raises Series E at $61.5B p
ost-money valuationFeaturedIntroducing the Anthropic Economic IndexNewsNo result
s found.Case StudyHow Anthropic teams use Claude CodeJul 24, 2025PolicyThoughts 
on America’s AI Action PlanJul 23, 2025AnnouncementsAnthropic partners with the 
University of Chicago’s Becker Friedman Institute on AI economic researchJul 23,
 2025PolicyBuild AI in AmericaJul 21, 2025PolicyAnthropic to sign the EU Code of
 PracticeJul 21, 2025AnnouncementsPaul Smith to join Anthropic as Chief Commerci
al OfficerJul 15, 2025ProductClaude for Financial ServicesJul 15, 2025AlignmentI
nvesting in energy to secure America's AI future Jul 15, 2025ProductDiscover too
ls that work with Claude Jul

In [11]:
splits[2].metadata

{'source': 'https://www.anthropic.com/news',
 'content_type': 'text/html; charset=utf-8',
 'title': 'Newsroom \\ Anthropic',
 'description': "Anthropic is an AI safety and research company that's working to build reliable, interpretable, and steerable AI systems.",
 'language': 'en',
 'start_index': 1579}

# Indexing

Now that we have our document chunks, we need to create an index that we can search. We'll use a vector store for this, which allows us to perform semantic search on our documents.

In this block, we're setting up our vector store using ChromaDB and Google's Generative AI embeddings.

- `embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")`: This initializes the embedding model that will be used to convert our document chunks into numerical vectors.

- `vector_store = Chroma(...)`: This creates a ChromaDB vector store.

- `collection_name`: A name for our collection of documents.

- `embedding_function`: The embedding model to use.

- `persist_directory`: The directory where the vector store data will be saved locally.

In [9]:
#Indexing

#define the embeddings model
#ref: https://python.langchain.com/docs/integrations/text_embedding/
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

#disable telemetry
client_settings = chromadb.config.Settings(
    persist_directory="./chroma_db",
    anonymized_telemetry=False,  # Disables telemetry
)

#define the vector store
#ref: https://python.langchain.com/docs/concepts/vectorstores/
vector_store = Chroma(
    collection_name="1_basic_rag_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",  # Where to save data locally
    client_settings=client_settings
)

# vector_store.delete_collection()

In [10]:
#Now, we add our document chunks to the vector store. 
# This process will convert each chunk into a vector and store it in the database.

document_ids = vector_store.add_documents(documents=splits)
document_ids

['0501d245-8544-44f8-bc63-79641a4ff814',
 'cd21d1e9-14e1-4a40-9afa-c0f1915f776d',
 'eeadaf94-2b97-48c4-ab66-49ef72d7ea41',
 '3441833d-f683-4c55-b953-2cda54a1d41e',
 '3a9f1261-2de5-443c-a157-07eea51a0c90',
 'b8ff3b14-8b5b-4107-a9a7-9f62fd9dd071',
 '47dffd61-caba-4632-8258-597836631b00',
 '6951471b-7428-435c-8b51-44a7634b2791',
 '3e0efe2b-0a21-47f8-ad4e-89b6a962afa5',
 '9b0ad3ec-9a69-451d-b9a7-d65778e58f53',
 '34a19983-bc97-47c2-b38f-196cc95abb67',
 'a2ef28f4-12ef-43f9-aeff-815524fe9e5f',
 'a3b49e5d-c940-48ee-8c93-8dd7943e8ab3',
 'bcd775aa-f268-40d1-9219-19bde4f50efe',
 '4bb4de5c-9091-4034-857d-1cc236d62846',
 'cff7662b-3763-492d-84e8-307ebf360a7d',
 '6451e7ba-84a5-42dd-b4b8-4fc6b6614121',
 '23f0867a-b9b9-46c5-a4af-7666d41745ad',
 '260bf73d-5474-4515-9bf3-38cd916acf64',
 'e473cab3-e4b3-4363-aa47-0b14e1928808',
 '5189683e-8c84-4db5-af65-2836fef88ce7',
 '3822012b-cc0b-448f-ab7e-10abc0655926',
 '2bb74f5b-ecd7-42ce-9494-1efbe7aae17a',
 '1a026a40-a131-4570-bc88-97a2a5ba9c8c',
 'da93c43c-8fad-

In [12]:
# Let's retrieve a document from the vector store by its ID to confirm that it has been indexed correctly.
vector_store.get_by_ids([document_ids[1]])

[Document(id='cd21d1e9-14e1-4a40-9afa-c0f1915f776d', metadata={'source': 'https://www.anthropic.com/news', 'title': 'Newsroom \\ Anthropic', 'content_type': 'text/html; charset=utf-8', 'language': 'en', 'start_index': 803, 'description': "Anthropic is an AI safety and research company that's working to build reliable, interpretable, and steerable AI systems."}, page_content="OfficerJul 15, 2025ProductClaude for Financial ServicesJul 15, 2025AlignmentInvesting in energy to secure America's AI future Jul 15, 2025ProductDiscover tools that work with Claude Jul 14, 2025AnnouncementsAnthropic and the Department of Defense to advance responsible AI in defense operationsJul 14, 2025ProductAdvancing Claude for EducationJul 09, 2025AnnouncementsLawrence Livermore National Laboratory expands Claude for Enterprise use to empower scientists and researchersJul 09, 2025PolicyThe need for transparency in Frontier AIJul 07, 2025AnnouncementsIntroducing the Anthropic Economic Futures ProgramJun 27, 202

# Retrieval

With our documents indexed, we can now perform retrieval. The goal of this step is to find the most relevant document chunks for a given user question.

First, we'll set up our LLM and a prompt template.

- `llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")`: We'll use the Gemini 1.5 Flash model for generation.

- `template = ...`: This is the prompt template that we'll use to combine the retrieved context with the user's question. The {context} and {question} are placeholders that will be filled in later.

In [13]:
#configure the llm
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")  # web is search disabled by default

#set the prompt template
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

rag_prompt_template = PromptTemplate.from_template(template)


In [14]:
# Let's define a sample user question.
user_question = "What is anthropic?"

In [15]:
# Now, we'll use the vector store's similarity_search method to find the top 5 most similar documents to the user's question.
retrieved_docs = vector_store.similarity_search(user_question, k=5)

In [16]:
#review the retreived docs and see how relevant they are
for doc in retrieved_docs:
    print(doc.page_content)
    print("-"*100)

at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport centerTerms and policiesPrivacy choicesPrivacy policyResponsible disclosure policyTerms of service - consumerTerms of service - commercialUsage policy© 2025 Anthropic PBC
----------------------------------------------------------------------------------------------------
at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport centerTerms and policiesPrivacy choicesPrivacy policyResponsible disclosure policyTerms of service - consumerTerms of service - commercialUsage policy© 2025 Anthropic PBC
----------------------------------------------------------------------------------------------------
at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport 

In [17]:
#inspect metadata
retrieved_docs[0].metadata

{'content_type': 'text/html; charset=utf-8',
 'description': "Today we're announcing Integrations, a new way to connect your apps and tools to Claude. We're also expanding Claude's Research capabilities with an advanced mode that searches the web, your Google Workspace, and now your Integrations too.",
 'language': 'en',
 'start_index': 5422,
 'title': 'Claude can now connect to your world \\ Anthropic',
 'source': 'https://www.anthropic.com/news/integrations'}

In [18]:
# The similarity_search_with_score method returns the documents along with their similarity scores.
vector_store.similarity_search_with_score(user_question, k=5)

[(Document(id='80e19436-cc50-4db5-bd12-1cbac6cf65ea', metadata={'content_type': 'text/html; charset=utf-8', 'language': 'en', 'source': 'https://www.anthropic.com/news/integrations', 'title': 'Claude can now connect to your world \\ Anthropic', 'description': "Today we're announcing Integrations, a new way to connect your apps and tools to Claude. We're also expanding Claude's Research capabilities with an advanced mode that searches the web, your Google Workspace, and now your Integrations too.", 'start_index': 5422}, page_content='at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport centerTerms and policiesPrivacy choicesPrivacy policyResponsible disclosure policyTerms of service - consumerTerms of service - commercialUsage policy© 2025 Anthropic PBC'),
  0.43348556756973267),
 (Document(id='2222f5d7-566d-453d-9a68-2c095ec7c12a', metadata={'source': 'https://www.anthropic.com/news/how-p

# Generation

The final step is to use the retrieved documents to generate an answer to the user's question.

We'll combine the content of the retrieved documents and use our prompt template to create a final prompt for the LLM.


In [19]:
#generate answer

docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
prompt = rag_prompt_template.invoke({"question": user_question, "context": docs_content})
response = llm.invoke(prompt)

In [20]:
#generated response
response.content

'Based on the provided text, Anthropic is a company, as indicated by the copyright notice "© 2025 Anthropic PBC".  It appears to offer various services and resources, including a partner directory and support center.  Thanks for asking!'

In [21]:
# We can also include citations in our response by extracting the source from the metadata of the retrieved documents.

sources = [doc.metadata["source"] for doc in retrieved_docs]

print(f"Sources: {sources}\n\n")
print(f'Answer: {response.content}')

Sources: ['https://www.anthropic.com/news/integrations', 'https://www.anthropic.com/news/how-people-use-claude-for-support-advice-and-companionship', 'https://www.anthropic.com/news/build-artifacts', 'https://www.anthropic.com/news/token-saving-updates', 'https://www.anthropic.com/news/national-security-expert-richard-fontaine-appointed-to-anthropic-s-long-term-benefit-trust']


Answer: Based on the provided text, Anthropic is a company, as indicated by the copyright notice "© 2025 Anthropic PBC".  It appears to offer various services and resources, including a partner directory and support center.  Thanks for asking!


### Langchain Retreiver

LangChain provides a `Retriever` interface, which is a more general way to retrieve documents. A vector store can be used as the backbone of a retriever, but there are other types of retrievers as well.

Here, we're creating a retriever from our vector store. We can also specify search arguments like k (the number of documents to retrieve) and search_type.

In [22]:
# Ref: https://python.langchain.com/docs/concepts/retrievers/
retriever = vector_store.as_retriever(search_kwargs={"k": 100}, search_type='similarity')

retrieved_docs = retriever.invoke(user_question)
retrieved_docs


for doc in retrieved_docs:
    print(doc.page_content)
    print("-"*100)


at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport centerTerms and policiesPrivacy choicesPrivacy policyResponsible disclosure policyTerms of service - consumerTerms of service - commercialUsage policy© 2025 Anthropic PBC
----------------------------------------------------------------------------------------------------
at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport centerTerms and policiesPrivacy choicesPrivacy policyResponsible disclosure policyTerms of service - consumerTerms of service - commercialUsage policy© 2025 Anthropic PBC
----------------------------------------------------------------------------------------------------
at AnthropicMCP IntegrationsPartner DirectoryExploreAbout usBecome a partnerCareersEventsNewsStartups programHelp and securityStatusAvailabilitySupport 

In [23]:
# Now, let's put it all together in a single function.
def generate_answer(user_question):
    #retrieve the relevant docs
    retriever = vector_store.as_retriever(search_kwargs={"k": 100}, search_type='similarity')
    retrieved_docs = retriever.invoke(user_question)
    
    #generate
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt = rag_prompt_template.invoke({"question": user_question, "context": docs_content})
    response = llm.invoke(prompt)

    return response.content

user_question = "What is Anthropic?"
generate_answer(user_question)

'Based on the provided text, Anthropic is an AI safety and research company that develops and deploys advanced AI systems.  They also conduct research in mechanistic interpretability and alignment. Thanks for asking!'