In [1]:
sample_docs = [
    """
    Machine Learning Fundamentals
    
    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine learning: supervised learning, unsupervised learning, and reinforcement 
    learning. Supervised learning uses labeled data to train models, while unsupervised 
    learning finds patterns in unlabeled data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties.
    """,
    
    """
    Deep Learning and Neural Networks
    
    Deep learning is a subset of machine learning based on artificial neural networks. 
    These networks are inspired by the human brain and consist of layers of interconnected 
    nodes. Deep learning has revolutionized fields like computer vision, natural language 
    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly 
    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers 
    excel at sequential data processing.
    """,
    
    """
    Natural Language Processing (NLP)
    
    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, 
    machine translation, and question answering. Modern NLP heavily relies on transformer 
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand 
    context and relationships between words in text.
    """
]

sample_docs

['\n    Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through \n    interaction with an environment using rewards and penalties.\n    ',
 '\n    Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly \n    effective f

In [2]:
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"{temp_dir}/doc_{i}.txt","w") as f:
        f.write(doc)

print(f"Sample document create in : {temp_dir}")

Sample document create in : C:\Users\Debdutta Chatterjee\AppData\Local\Temp\tmpuiyh4tbc


In [3]:
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"doc_{i}.txt","w") as f:
        f.write(doc)

In [4]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader


loader = DirectoryLoader(
    path ='C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t',
    glob ='*.txt',
    loader_cls =TextLoader,
    loader_kwargs={'encoding':'utf-8'}
)

docs = loader.load()
len(docs)

3

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size =100,
    chunk_overlap =10
)
docs = loader.load()
split_docs = text_splitter.split_documents(docs)

len(split_docs)

20

In [6]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings  = GoogleGenerativeAIEmbeddings(
    model = 'models/text-embedding-004',
    google_api_key = 'AIzaSyCtTc-rIeCTfJsfIMHFqnSIjhPbSJpy5Yc'
)

em = embeddings.embed_query('Hi')
len(em)

  from .autonotebook import tqdm as notebook_tqdm


768

In [7]:
split_docs[0]

Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_0.txt'}, page_content='Machine Learning Fundamentals')

In [8]:
from langchain_community.vectorstores import Chroma

vs = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    persist_directory='./chrom_db',
    collection_name='rag_demo'
)

vs

<langchain_community.vectorstores.chroma.Chroma at 0x23aa8f10ce0>

In [9]:
vs.similarity_search('Machine learning',k=3)

[Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_2.txt'}, page_content='Natural Language Processing (NLP)'),
 Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_2.txt'}, page_content='Natural Language Processing (NLP)'),
 Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_2.txt'}, page_content='Natural Language Processing (NLP)')]

In [10]:
vs.similarity_search_with_score('Deep learning',k=5)

[(Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks'),
  0.18090315163135529),
 (Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks'),
  0.18090315163135529),
 (Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks'),
  0.18090315163135529),
 (Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_2.txt'}, page_content='Natural Language Processing (NLP)'),
  0.23174500465393066),
 (Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_2.txt'}, page_content='Natural Language Processing (NLP)'),
  0.23174500465393066)]

In [11]:
from langchain.chat_models import init_chat_model
from langchain_google_genai import ChatGoogleGenerativeAI
import os
os.environ['GOOGLE_API_KEY'] = "AIzaSyCtTc-rIeCTfJsfIMHFqnSIjhPbSJpy5Yc"

model = ChatGoogleGenerativeAI(
                model='gemini-2.0-flash',
                google_api_key="AIzaSyCtTc-rIeCTfJsfIMHFqnSIjhPbSJpy5Yc",
                temperature=0,
                max_output_tokens=1000
            )

model.invoke('Hello')

AIMessage(content='Hello! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': []}, id='run--469d9453-04aa-4134-9b96-2c7880ce3ac4-0', usage_metadata={'input_tokens': 1, 'output_tokens': 10, 'total_tokens': 11, 'input_token_details': {'cache_read': 0}})

In [12]:
retriever = vs.as_retriever(search_type='similarity',search_kwargs={'k':3})
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000023AA8F10CE0>, search_kwargs={'k': 3})

In [13]:
from langchain_core.prompts import ChatPromptTemplate

system_prompt="""You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context: {context}"""

prompt = ChatPromptTemplate.from_messages([
    
    ('system',system_prompt),
    ('human','{input}')
    
])

prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\n\nContext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [14]:
from langchain.chains.combine_documents import create_stuff_documents_chain
doc_chain =create_stuff_documents_chain(model,prompt)
doc_chain


RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\n\nContext: {context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, max_output_tokens=1

In [15]:
from langchain.chains import  create_retrieval_chain

rag_chain = create_retrieval_chain(
    retriever,
    doc_chain
)
rag_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000023AA8F10CE0>, search_kwargs={'k': 3}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the quest

In [16]:
response=rag_chain.invoke({"input":"What is Deep LEarning"})

In [17]:
response

{'input': 'What is Deep LEarning',
 'context': [Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep learning is a subset of machine learning based on artificial neural networks.'),
  Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep learning is a subset of machine learning based on artificial neural networks.'),
  Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep learning is a subset of machine learning based on artificial neural networks.')],
 'answer': 'Deep learning is a subset of machine learning. It is based on artificial neural networks.'}

In [18]:
#LCEl
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    """Use the following context to answer the question. 
If you don't know the answer based on the context, say you don't know.
Provide specific details from the context to support your answer.

Context:
{context}

Question: {question}

Answer:"""
)



In [19]:
retriever

VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000023AA8F10CE0>, search_kwargs={'k': 3})

In [20]:
def format_docs(docs):
    return '\n\n'.join(d.page_content for d in docs)

In [21]:
rag_chain =(
{
    'context':retriever|format_docs,
    'question': RunnablePassthrough()
}
|prompt
|model
|StrOutputParser()
)
rag_chain

{
  context: VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x0000023AA8F10CE0>, search_kwargs={'k': 3})
           | RunnableLambda(format_docs),
  question: RunnablePassthrough()
}
| ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="Use the following context to answer the question. \nIf you don't know the answer based on the context, say you don't know.\nProvide specific details from the context to support your answer.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"), additional_kwargs={})])
| ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, max_output_tokens=1000, client=<google.ai.generativelanguage_v1beta.services.generativ

In [22]:
response=rag_chain.invoke("What is Deep Learning")
response

'Deep learning is a subset of machine learning based on artificial neural networks.'

In [23]:
retriever.get_relevant_documents('deep learning')

  retriever.get_relevant_documents('deep learning')


[Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks'),
 Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks'),
 Document(metadata={'source': 'C:\\Users\\Debdutta Chatterjee\\AppData\\Local\\Temp\\tmpmi9w_c2t\\doc_1.txt'}, page_content='Deep Learning and Neural Networks')]

In [26]:
from langchain.schema import Document

d1 = Document(
    page_content = 'DL is a fraud',
    metadata = {'source':'Fake source'}
)

vs.add_documents([d1])

['968f4954-f0a3-4e3f-a606-774e70815697']

In [32]:
client = vs._collection
client.delete(ids =['968f4954-f0a3-4e3f-a606-774e70815697'])

In [33]:
from langchain.schema import Document

d1 = Document(
    page_content = 'MDD is a fraud',
    metadata = {'source':'Fake source'}
)

vs.add_documents([d1])

['b7b1bcbc-7392-48eb-94c4-26f96c25d9d7']

In [34]:
rag_chain.invoke('what is MDD')

'The context states "MDD is a fraud".'

In [38]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage,AIMessage

In [37]:
from langchain_core.prompts import ChatPromptTemplate

contextualize_q_system_prompt = """Given a chat history and the latest user question 
which might reference context in the chat history, formulate a standalone question 
which can be understood without the chat history. Do NOT answer the question, 
just reformulate it if needed and otherwise return it as is."""


prompt  = ChatPromptTemplate.from_messages([
    ('system',contextualize_q_system_prompt),
    MessagesPlaceholder('chat_history'),
    ('human','{input}')
])

prompt

ChatPromptTemplate(input_variables=['chat_history', 'input'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages.chat.ChatMessageChunk, Tag(tag='ChatMessageChunk')], typing.Annotated[langchain_core.messages.system.SystemMessageChunk, Tag(tag='SystemMessageChunk')], typing.Annotated[l

In [39]:
retriever = create_history_aware_retriever(
    model,retriever,prompt
)

In [40]:
qa_system_prompt = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.

Context: {context}"""

qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [42]:
question_answer_chain = create_stuff_documents_chain(model, qa_prompt)

# Create conversational RAG chain
conversational_rag_chain = create_retrieval_chain(
    retriever, 
    question_answer_chain
)
print("Conversational RAG chain created!")

Conversational RAG chain created!


In [47]:
history = []
result1 =conversational_rag_chain.invoke({
    'chat_history':history,
    'input':"What is machine learning?"
})

In [49]:
history.extend([
    HumanMessage(content="What is machine learning"),
    AIMessage(content=result1['answer'])
])