<a href="https://colab.research.google.com/github/dasakash26/TLDR_bot/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tasks
- semantic search
- LangGraph orchestration
----
- conversation-style chat
- chat memory
- Elasticsearch / BM25
- contextual embedding
- reranking of retrieved chunks
- refining output via LLM


In [None]:
!pip install --upgrade --quiet "langchain[google-genai]" langchain-huggingface langchain-chroma langchain-text-splitters langchain-community langgraph "unstructured[pdf]"

from google.colab import userdata
import os
from langchain.chat_models import init_chat_model
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


# Copy the Gemini API key from google.colab's userdata into the environment.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

# Chat model and embedding model shared by the rest of the notebook.
llm = init_chat_model("gemini-2.5-flash", model_provider="google_genai")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

# Chroma collection that embeds with the model above; its persist_directory
# is ./chroma_db.
vector_store = Chroma(
    persist_directory="./chroma_db",
    collection_name="rag_collection",
    embedding_function=embeddings,
)

# Report what was configured, one component per line.
print("> All components initialized successfully!")
for label, component in (
    ("> Chat Model:", llm.model),
    ("> Embeddings Model:", embeddings.model_name),
    ("> Vector Store:", vector_store),
):
    print(label, component)

> All components initialized successfully!
> Chat Model: models/gemini-2.5-flash
> Embeddings Model: sentence-transformers/all-mpnet-base-v2
> Vector Store: <langchain_chroma.vectorstores.Chroma object at 0x791d8dc7c770>


In [None]:
import bs4
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# load the doc and split into chunks

# bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))
# loader = WebBaseLoader(
#     web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
#     bs_kwargs={"parse_only": bs4_strainer},
# )
# docs = loader.load()

# NOTE(review): the path points into Google Drive, so Drive presumably has to
# be mounted at /content/drive before this cell runs — confirm.
path = "/content/drive/MyDrive/notes/SIH2025_HEARME.pdf"
doc = UnstructuredFileLoader(path)  # NB: this is the loader, not a document
print(doc.file_path)

# 1000-character chunks with a 200-character overlap; add_start_index puts
# each chunk's offset into its metadata (visible in the printed metadata).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

all_splits = doc.load_and_split(text_splitter)
if not all_splits:
    # Fail with a clear message instead of an IndexError on all_splits[0].
    raise ValueError(f"No text could be extracted from {path}")
print(all_splits[0].metadata)
print(f"Split document post into {len(all_splits)} sub-documents.")

# store in vector db as embeddings
# The collection is persisted on disk, so adding without ids would store a
# fresh copy of every chunk each time this cell is re-run. Deterministic ids
# (file path + chunk position) let the store overwrite the existing entries.
chunk_ids = [f"{path}#chunk-{position}" for position in range(len(all_splits))]
document_ids = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

/content/drive/MyDrive/notes/SIH2025_HEARME.pdf
{'source': '/content/drive/MyDrive/notes/SIH2025_HEARME.pdf', 'start_index': 0}
Split document post into 7 sub-documents.


In [1]:
# part 1: simpler model
# custom prompt
from langchain import hub
# Pull the "rlm/rag-prompt" template from the LangChain hub; generate() fills
# it with the "context" and "question" keys.
prompt = hub.pull("rlm/rag-prompt")

# control flow
from langchain_core.documents import Document
from typing_extensions import List, TypedDict

class State(TypedDict):
    """State shared by the nodes of the simple retrieve-then-generate graph."""
    question: str  # user question; read by retreve() and generate()
    context: List[Document]  # documents returned by retreve()
    answer: str  # model reply text returned by generate()

def retreve(state: State):
    """Search the vector store for documents similar to the question.

    Returns a partial state update holding the matches under "context".
    """
    return {"context": vector_store.similarity_search(state["question"])}

def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the page content of every document in ``state["context"]``, fills
    the RAG prompt with that text and the question, and sends the resulting
    messages to the chat model.

    Returns a partial state update holding the reply text under "answer".
    """
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({
        "context": docs_content,
        "question": state["question"],
    }).to_messages()
    # invoke() is the supported entry point for chat models;
    # predict_messages() is deprecated.
    res = llm.invoke(messages)
    return {"answer": res.content}

from langgraph.graph import START, StateGraph

# Linear pipeline: START -> retreve -> generate.
graph = (
    StateGraph(State)
    .add_sequence([retreve, generate])
    .add_edge(START, "retreve")
    .compile()
)



ModuleNotFoundError: No module named 'langgraph'

In [None]:
# res = graph.invoke({"question": "point out the contrast in the chat month wise judging from the time stamp over the last two year"})
res = graph.invoke({"question": "explain the mvp for this project"})

# print(f"Context: {res['context']}\n\n")
# for doc in res["context"]:
#     print(doc.page_content)


def format_answer(answer, max_tokens_per_line=10):
    """Render an answer as an indented bullet list, one bullet per sentence.

    The answer is split on ". ". Each sentence starts a "  - " bullet and
    wraps onto "    "-indented continuation lines. The bullet marker counts
    as one token, so the first line of a bullet holds
    ``max_tokens_per_line - 1`` words and continuation lines hold
    ``max_tokens_per_line`` words.

    Args:
        answer: Text to format.
        max_tokens_per_line: Maximum whitespace-separated tokens per line.

    Returns:
        The formatted text, each line terminated by a newline; an empty
        string when the answer contains no words.

    Raises:
        ValueError: If ``max_tokens_per_line`` is smaller than 2.
    """
    if max_tokens_per_line < 2:
        raise ValueError("max_tokens_per_line must be at least 2")

    first_line_words = max_tokens_per_line - 1
    pieces = answer.split(". ")
    lines = []
    for position, piece in enumerate(pieces):
        words = piece.split()
        if not words:
            continue  # skip empty fragments instead of emitting a bare "-"
        if position < len(pieces) - 1:
            # split(". ") consumed this sentence's period; put it back.
            words[-1] += "."
        lines.append("  - " + " ".join(words[:first_line_words]))
        for start in range(first_line_words, len(words), max_tokens_per_line):
            chunk = words[start:start + max_tokens_per_line]
            # Keep the leading indentation: only trailing space is unwanted.
            lines.append("    " + " ".join(chunk))
    return "".join(line + "\n" for line in lines)


formatted_answer = format_answer(res["answer"])
print(formatted_answer)


output_filename = "res.txt"

# Explicit encoding so non-ASCII characters in the answer are written the
# same way regardless of the platform default.
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(formatted_answer)

print(f"Answer saved to {output_filename}")

- The MVP for this project would include an Agentic
AI Chatbot to provide immediate first-aid support and alert authorities
in high-risk situations
- It would also feature a confidential booking system for
students to anonymously schedule appointments with on-campus professionals
- Finally, a localized resource hub offering psychoeducational content in
regional languages would provide essential self-help tools.

Answer saved to res.txt


In [None]:
# visualisation
from IPython.display import Image, display
# Render the compiled graph as a Mermaid PNG and show it inline.
display(Image(graph.get_graph().draw_mermaid_png()))

In [None]:
# Agentic conversational RAG
from langgraph.graph import MessagesState, StateGraph
from langchain_core.tools import tool
# SystemMessage is provided by langchain_core; there is no langgraph_core package.
from langchain_core.messages import SystemMessage

# Builder for the conversational agent graph; its state schema is MessagesState.
agent_builder = StateGraph(MessagesState)

# The docstring below doubles as the tool description, so it is kept short.
@tool(response_format="content_and_artifact")
def retrieve(query: str):
    """Retrieve information related to a query."""
    # Ask the vector store for the five most similar chunks.
    retrieved_docs = vector_store.similarity_search(query, k=5)
    # "content_and_artifact" expects a (content, artifact) pair: the text
    # rendering is the content, the Document objects are the artifact.
    serialized = "\n\n".join(
        f"Source: {doc.metadata}\nContent: {doc.page_content}"
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

def query_or_respond(state: MessagesState):
    """Generate a tool call for retrieval or respond directly.

    Binds the ``retrieve`` tool to the chat model and invokes it on the
    conversation so far.

    Returns a state update holding the model's message under "messages".
    """
    llm_with_tools = llm.bind_tools([retrieve])
    res = llm_with_tools.invoke(state["messages"])
    return {"messages": [res]}

# Imported here because this cell runs before the cell that imports ToolNode
# from langgraph.prebuilt; without it a fresh top-to-bottom run hits NameError.
from langgraph.prebuilt import ToolNode

# Graph node that executes the retrieve tool.
tools = ToolNode([retrieve])

def generate(state: MessagesState):
    """Generate answer to the question.

    Gathers the tool messages at the end of the conversation, places their
    content in a system prompt, and asks the chat model to answer using the
    human/system messages and the AI messages that carry no tool calls.

    Note: this shares its name with the ``generate`` node of the simple graph
    defined earlier in the notebook, and replaces that name from here on.

    Returns a state update holding the model's reply under "messages".
    """
    # Walk backwards and collect tool messages until the first non-tool one.
    recent_tool_messages = []
    for message in reversed(state["messages"]):
        if message.type != "tool":
            break
        recent_tool_messages.append(message)

    # Restore chronological order.
    tool_messages = recent_tool_messages[::-1]

    # Format into prompt
    docs_content = "\n\n".join(message.content for message in tool_messages)
    system_message_content = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use five sentences maximum and keep the "
        "answer concise and to the point."
        "\n\n"
        f"{docs_content}"
    )

    # Leave out tool messages and the AI messages that only carry tool calls.
    conversation_messages = [
        message
        for message in state["messages"]
        if message.type in ("human", "system")
        or (message.type == "ai" and not message.tool_calls)
    ]

    # Local name chosen so it does not shadow the module-level hub `prompt`.
    rag_messages = [SystemMessage(system_message_content)] + conversation_messages

    # Run
    response = llm.invoke(rag_messages)
    return {"messages": [response]}



In [None]:
# Build the agent
from langgraph.graph import END
from langgraph.prebuilt import ToolNode, tools_condition

# Register the three nodes; each node is named after its function / ToolNode.
agent_builder.add_node(query_or_respond).add_node(tools).add_node(generate)
agent_builder.set_entry_point("query_or_respond")
# add_conditional_edges takes the source node first, then the routing
# function. Per the LangGraph docs, tools_condition routes to "tools" when the
# last AI message contains tool calls and to END otherwise; the mapping spells
# out both targets.
agent_builder.add_conditional_edges(
    "query_or_respond",
    tools_condition,
    {END: END, "tools": "tools"},
)