In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:
loader = TextLoader("../data/dataengineering.txt", encoding="utf-8")
documents = loader.load()
documents

[Document(metadata={'source': '../data/dataengineering.txt'}, page_content='Agentic AI refers to intelligent systems that can autonomously plan, decide, and execute actions toward specific predefined goals without constant human supervision.\nUnlike traditional software programs, an agent observes its environment, reasons about possible outcomes, and selects the most appropriate action dynamically.\nThe main purpose of agentic systems is reducing manual effort while improving decision-making efficiency in complex workflows.\nThese systems typically include memory components, reasoning engines, and the ability to call external tools or APIs.\nAn AI agent can interact with databases, web services, or enterprise applications to complete assigned objectives.\nPlanning capabilities allow the agent to decompose large tasks into smaller manageable steps before execution.\nReactive agents respond immediately to stimuli, whereas deliberative agents evaluate multiple possibilities before acting.

In [3]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)
chunks = splitter.split_documents(documents)
#chunks_str = [chunk.page_content for chunk in chunks]
#chunks

In [4]:
from dotenv import load_dotenv
import os
load_dotenv(dotenv_path="../.env")

True

In [5]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small", api_key=os.getenv("OPENAI_API_KEY"))
#embedded_chunks = embedding.embed_documents(chunks_str)
#embedded_chunks

In [6]:
if not os.path.exists("../chroma_db"):
    os.makedirs("../chroma_db")
    print("Chroma DB directory created.")
else:
    print("Chroma DB directory already exists.")

import chromadb

client = chromadb.PersistentClient(path="../chroma_db")
existing_collections = [c.name for c in client.list_collections()]
print(f"Existing Collections: {existing_collections}")

COLLECTION_NAME = "data_engineering"

if COLLECTION_NAME in existing_collections:
    client.delete_collection(COLLECTION_NAME)
    print(f"Deleted existing collection: {COLLECTION_NAME}")
else:
    print("Collection does not exist. Creating new one.")



Chroma DB directory already exists.
Existing Collections: ['data_engineering', 'langchain']
Deleted existing collection: data_engineering


In [11]:
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory="../chroma_db",
    collection_name=COLLECTION_NAME
)

results = vectorstore.similarity_search_with_score("what are the types of machine learning?", k=3)

for i in results:
    print(i)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

(Document(id='073da520-18fc-4184-9223-b8d5e81e79db', metadata={'source': '../data/dataengineering.txt'}, page_content='Machine learning is a branch of artiﬁcial intelligence focused on enabling systems to learn patterns from data.\nThe purpose of machine learning is allowing computers to improve performance without explicit rule-based programming.\nSupervised learning uses labeled datasets to train predictive models for classification or regression tasks.'), 0.8394509553909302)
(Document(id='93778f82-a22a-43b1-9e19-72577c9d0785', metadata={'source': '../data/dataengineering.txt'}, page_content='Machine learning is a branch of artiﬁcial intelligence focused on enabling systems to learn patterns from data.\nThe purpose of machine learning is allowing computers to improve performance without explicit rule-based programming.\nSupervised learning uses labeled datasets to train predictive models for classification or regression tasks.'), 0.8394509553909302)
(Document(id='780f7b1f-4ea2-4e29-a

In [8]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [9]:
prompt = ChatPromptTemplate.from_template("""
You are a helpful assistant.
Answer the question using ONLY the context below. if you don't find answer in context, say I don't know

Context:
{context}

Question:
{question}
""")

llm = ChatOpenAI(
    model="gpt-4.1-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0
)


rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [15]:
question = "what is the benefit of systemetic search?"
response = rag_chain.invoke(question)
print(response)

I don't know
