In [19]:
!pip install langchain
!pip install langgraph
!pip install tavily
!pip install transformers
!pip install openai
!pip install beautifulsoup4
!pip install requests
!pip install tiktoken
!pip install --upgrade langchain
!pip install langchain_community
!pip install --upgrade langgraph
!pip install -q langchain langchain-community langgraph
!pip install -q langchain-google-genai
!pip install -q tavily-python
!pip install -q beautifulsoup4 requests
!pip install -q pandas
!pip install langfuse
!pip install langchain langgraph langchain_openai langchain_community
!pip install -qU langchain-openai
!pip install --upgrade langgraph

[31mERROR: Could not find a version that satisfies the requirement tavily (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tavily[0m[31m


In [20]:
import os
from google.colab import userdata #Access Secret key

# Copy each API key from the Colab secret store into the process environment;
# later cells read them back with os.getenv().
for secret_name, label in (("GOOGLE_API_KEY", "Google"), ("TAVILY_API_KEY", "Tavily")):
    try:
        os.environ[secret_name] = userdata.get(secret_name)
        print(f"{label} API Key loaded.")
    except Exception as exc:
        # `except Exception` (instead of a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate while still reporting any lookup failure.
        print(f"{label} API Key not found in Colab secrets. Please add it.")
        # Show the underlying error so a denied-access or bad-value problem is
        # not mistaken for a missing secret.
        print(f"   Reason: {type(exc).__name__}: {exc}")

Google API Key loaded.
Tavily API Key loaded.


In [21]:
# Import necessary modules
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Using Google Gemini
from langchain_google_genai import ChatGoogleGenerativeAI
# Import tool integrations
from langchain_community.tools.tavily_search import TavilySearchResults
from langgraph.graph import StateGraph, END

# Reached only if every import above resolved without raising.
print("Libraries imported successfully.")

Libraries imported successfully.


In [22]:
import os
import google.generativeai as genai # For direct Gemini testing
from tavily import TavilyClient # For direct Tavily testing
from google.colab import userdata # Colab specific for accessing secrets if not using os.environ directly after userdata.get

# Smoke test: make one small request against each external service so that a
# bad or missing key is reported here rather than in the middle of the workflow.
print("--- Starting API Key Test ---")

# --- Test Google Gemini API Key ---
print("\nTesting Google Gemini API...")
google_api_key = os.getenv("GOOGLE_API_KEY")

if not google_api_key:
    print("🔑 Google_API_KEY not found in environment variables. Skipping Gemini test.")
else:
    try:
        # Configure the client with the API key
        genai.configure(api_key=google_api_key)
        # Pull the first model from the listing instead of only calling
        # list_models(): if the listing is produced lazily, no request is sent
        # until it is iterated, and an invalid key would go unnoticed.
        # NOTE(review): confirm list_models() is lazy in the installed
        # google-generativeai version.
        next(iter(genai.list_models()), None)

        print("✅ Google Gemini API Key is valid and working.")

    except Exception as e:
        print(f"❌ Google Gemini API Key test failed: {e}")
        print("   Possible issues: Invalid key, network problems, or service outage.")


# --- Test Tavily API Key ---
print("\nTesting Tavily API...")
tavily_api_key = os.getenv("TAVILY_API_KEY")

if not tavily_api_key:
    print("🔑 TAVILY_API_KEY not found in environment variables. Skipping Tavily test.")
else:
    try:
        # Instantiate the Tavily client and run a single one-result search
        tavily_client = TavilyClient(api_key=tavily_api_key)
        response = tavily_client.search(query="test query", max_results=1)

        if response and 'results' in response:
            print("✅ Tavily API Key is valid and working.")
        else:
            # This case might happen if the key is valid but the search failed for another reason
            print(f"❌ Tavily API Key test failed: Search returned unexpected response. {response}")

    except Exception as e:
        print(f"❌ Tavily API Key test failed: {e}")
        print("   Possible issues: Invalid key, network problems, or service outage.")

print("\n--- API Key Test Complete ---")

--- Starting API Key Test ---

Testing Google Gemini API...
✅ Google Gemini API Key is valid and working.

Testing Tavily API...
✅ Tavily API Key is valid and working.

--- API Key Test Complete ---


In [23]:
from typing import List, Dict, Any, TypedDict
from langchain_core.messages import BaseMessage

# --- 1. Defining the State of Graph ---
class ResearchState(TypedDict):
    """Shared state passed between the nodes of the research-and-drafting graph.

    Each node function reads the keys it needs from this mapping and returns a
    dict containing the keys it produces.
    """
    # Original question from the user; read by the query analyst, drafting and reviewer nodes.
    user_query: str
    # Research questions produced by the query analyst node.
    research_plan: List[str]
    # Search-engine query strings produced by the research coordinator node.
    search_queries: List[str]
    # Search hits collected by the researcher node; downstream nodes read the 'url' and 'content' entries.
    raw_search_results: List[Dict[str, Any]]
    # Pages fetched by the crawler node, each stored as {"url": ..., "text": ...}.
    crawled_content: List[Dict[str, str]]
    # Synthesis written by the information processor node ("No information found." when there was no input).
    processed_information: str
    # Answer text written by the drafting node.
    draft_answer: str
    # Feedback text extracted from the reviewer node's output.
    review_feedback: str
    # Set by the reviewer node to "REVISE" or "END"; route_review treats anything other than "REVISE" as "END".
    review_decision: str

In [24]:
import os
from typing import List, Dict, Any, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.tools.tavily_search import TavilySearchResults
from google.colab import userdata
from langgraph.graph import StateGraph, END
import requests
from bs4 import BeautifulSoup
import time # For adding delays in crawling



# Chat model shared by every LLM-backed node below.
# temperature=0 presumably requests the least random sampling — confirm against the model documentation.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest", temperature=0)
# Search tool invoked once per query by researcher_node.
# max_results=5 presumably caps the hits returned per query — confirm against the tool documentation.
tavily_tool = TavilySearchResults(max_results=5)

def escape_curly_braces(text: Any) -> str:
    """Return ``text`` as a string with every ``{`` and ``}`` doubled.

    Doubling is how a literal brace is written in a ``str.format``-style
    template; callers in this file apply it to values that are embedded in
    prompt template strings. Non-string input is converted with ``str()``
    before escaping.
    """
    as_text = text if isinstance(text, str) else str(text)

    # Single pass: map each brace character to its doubled form.
    brace_table = str.maketrans({'{': '{{', '}': '}}'})
    return as_text.translate(brace_table)

# --- 2. Implementing Each Agent as a Node Function ---
# --- 2a. Query Analyst Agent  ---
def query_analyst_node(state: ResearchState) -> Dict[str, Any]:
    """Analyze the user query and create a research plan (list of questions).

    Args:
        state: Graph state; ``state['user_query']`` must be present.

    Returns:
        ``{"research_plan": [...]}``, one entry per line of the model reply
        that looks like a numbered item. The numbering prefix is kept.

    Raises:
        KeyError: If ``user_query`` is missing from ``state``.
    """
    print("\n---Executing Query Analyst---")
    user_query = state['user_query']

    # The query is supplied as a template variable instead of being spliced
    # into the template text, so braces inside the query are treated as plain
    # text rather than as (missing) template placeholders.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Query Analyst. Your role is to deeply analyze user queries.
        First, conduct a 5W1H analysis (Who, What, When, Where, Why, and How) of the user's query to fully understand its scope and requirements.
        Based on this 5W1H analysis, break down the complex user query into a list of specific, actionable research questions.
        Focus on identifying the key pieces of information needed to fully answer the user's request, informed by your 5W1H analysis.
        Respond with ONLY a numbered list of research questions. Do not include the 5W1H analysis itself, or any other text (like introductory/concluding remarks) before or after the list."""),
        ("user", "Analyze the following query and list the research questions, based on your 5W1H analysis: {user_query}")
    ])

    chain = prompt | llm | StrOutputParser()
    research_plan_str = chain.invoke({"user_query": user_query})

    research_plan = []
    for line in research_plan_str.split('\n'):
        line = line.strip()
        # Basic check for numbering: keep lines whose text before the first
        # '.' contains a digit (e.g. "1. What is ...").
        if line and any(ch.isdigit() for ch in line.split('.')[0]):
            research_plan.append(line)

    print(f"Generated Research Plan: {research_plan}")
    return {"research_plan": research_plan}


# --- 2b. Research Coordinator Agent ---
def research_coordinator_node(state: ResearchState) -> Dict[str, Any]:
    """Turn the research plan into search-engine query strings.

    Args:
        state: Graph state; ``research_plan`` is read and may be absent or empty.

    Returns:
        ``{"search_queries": [...]}`` with one stripped, non-empty line of the
        model reply per entry, or an empty list when there is no plan.
    """
    print("\n---Executing Research Coordinator---")
    research_plan = state.get('research_plan', [])  # Get plan safely

    if not research_plan:
        print("No research plan provided by Query Analyst. Skipping search query generation.")
        return {"search_queries": []}

    plan_text = "\n".join(f"- {q}" for q in research_plan)  # Format for prompt

    # The plan is model-generated text and may contain braces, so it is passed
    # as a template variable rather than being spliced into the template
    # string, where braces would be parsed as placeholders.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Research Coordinator. You receive a list of research questions or tasks.
        Your role is to convert these into concise and effective search engine query strings.
        Aim for diversity if needed to cover different angles.
        Respond with ONLY a list of search query strings, one per line. Do not include numbering or bullet points or any other text."""),
        ("user", "Convert these research questions/tasks into search queries:\n{plan_text}")
    ])

    chain = prompt | llm | StrOutputParser()
    search_queries_str = chain.invoke({"plan_text": plan_text})
    search_queries = [q.strip() for q in search_queries_str.split('\n') if q.strip()]

    print(f"Generated Search Queries: {search_queries}")
    return {"search_queries": search_queries}


# --- 2c. Researcher Agent (reads search_queries from the state) ---

def researcher_node(state: ResearchState) -> Dict[str, Any]:
    """Run every query in ``search_queries`` through the search tool.

    Args:
        state: Graph state; ``search_queries`` is read and may be absent or empty.

    Returns:
        ``{"raw_search_results": [...]}`` holding the dict-shaped hits of all
        queries, in query order. A failing query is reported and skipped; it
        does not abort the remaining queries.
    """
    print("\n---Executing Researcher---")
    search_queries = state.get('search_queries', [])

    if not search_queries:
        print("No search queries provided by coordinator. Skipping research.")
        return {"raw_search_results": []}

    raw_results = []
    print(f"Attempting searches for {len(search_queries)} queries.")

    for query in search_queries:
        try:
            results = tavily_tool.invoke({"query": query})
        except Exception as e:
            # Best effort: report the failure and move on to the next query.
            print(f"Error during search for '{query}': {e}")
            continue

        print(f"--- Results for query: '{query}' ---")
        if not results:
            print("  No results found.")
            continue

        if not isinstance(results, list):
            # The tool may hand back something other than a list of hits
            # (for example an error message string). Show it as-is instead of
            # iterating over it as if it were search results.
            print(f"  Unexpected (non-list) search response: {results}")
            continue

        # Downstream nodes call .get() on every entry, so keep only mappings.
        hits = [res for res in results if isinstance(res, dict)]
        for position, res in enumerate(hits, start=1):
            url = res.get('url', 'N/A')
            title = res.get('title', 'No Title')  # Also print title for context
            print(f"  {position}. [{title}] {url}")
        raw_results.extend(hits)

    print(f"\nCollected {len(raw_results)} total raw search results across all queries.")
    return {"raw_search_results": raw_results}


# --- 2d. Crawler Node (New Implementation) ---
def crawler_node(state: ResearchState) -> Dict[str, Any]:
    """Fetch page text for the URLs found in ``raw_search_results``.

    Args:
        state: Graph state; ``raw_search_results`` is read and may be absent
            or empty.

    Returns:
        ``{"crawled_content": [...]}`` with one ``{"url", "text"}`` dict per
        page that was fetched and parsed successfully. Pages that fail are
        reported and skipped.
    """
    print("\n---Executing Crawler---")
    raw_results = state.get('raw_search_results', [])

    if not raw_results:
        print("No raw search results with URLs to crawl. Skipping crawl.")
        return {"crawled_content": []}

    # Collect http(s) URLs, dropping duplicates. dict.fromkeys keeps first-seen
    # order, so the pages selected below are the same on every run and follow
    # the order of the search results (a set would make the selection arbitrary).
    candidate_urls = (res.get('url') for res in raw_results if isinstance(res, dict))
    urls = list(dict.fromkeys(
        url for url in candidate_urls
        if isinstance(url, str) and url.startswith(('http://', 'https://'))
    ))

    # Limit the number of pages to crawl to avoid excessive requests
    # Adjust this limit based on your needs and politeness
    max_pages_to_crawl = 5
    urls_to_crawl = urls[:max_pages_to_crawl]

    print(f"Found {len(urls)} unique URLs from search. Crawling up to {max_pages_to_crawl}...")

    crawled_content_list = []
    # Add a User-Agent header to make requests look more like a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for index, url in enumerate(urls_to_crawl):
        print(f"Crawling: {url}")
        try:
            # Pause between consecutive requests to be polite and avoid being
            # blocked; nothing precedes the first request, so it needs no delay.
            if index:
                time.sleep(1)  # Adjust delay as needed

            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)

            # Use BeautifulSoup to parse the HTML and extract text
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()

            # Collapse all runs of whitespace to single spaces; this helps
            # reduce token count for the LLM.
            cleaned_text = ' '.join(text.split())

            crawled_content_list.append({"url": url, "text": cleaned_text})
            print(f"Successfully crawled and extracted text from {url}")

        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")
        except Exception as e:
            print(f"Error processing {url}: {e}")

    print(f"Finished crawling. Collected content from {len(crawled_content_list)} pages.")
    # Update the state with the fetched content
    return {"crawled_content": crawled_content_list}

# --- 2e. Information Processor / Synthesizer Agent (Modified) ---
def information_processor_node(state: ResearchState) -> Dict[str, Any]:
    """Synthesize search snippets and crawled pages into structured information.

    Args:
        state: Graph state; ``raw_search_results``, ``crawled_content`` and
            ``research_plan`` are read and may each be absent or empty.

    Returns:
        ``{"processed_information": text}``. The text is the model's synthesis,
        or ``"No information found."`` when there are neither search results
        nor crawled pages.
    """
    print("\n---Executing Information Processor---")
    raw_results = state.get('raw_search_results', [])
    crawled_content = state.get('crawled_content', [])

    if not raw_results and not crawled_content:
        print("No raw results or crawled content to process.")
        return {"processed_information": "No information found."}

    # The system prompt asks the model to answer the research questions, so
    # the questions have to be part of the prompt as well.
    research_plan = state.get('research_plan', [])
    formatted_plan = "\n".join(f"- {question}" for question in research_plan) or "(none provided)"

    formatted_search_results = "\n---\n".join(
        f"Search Snippet - Source: {res.get('url', 'N/A')}\nContent: {res.get('content', 'N/A')}"
        for res in raw_results
    )

    # Only the first part of each page is forwarded, to bound the prompt size.
    max_chars_per_page = 2000
    formatted_crawled_content = "\n---\n".join(
        f"Full Page Content - Source: {page.get('url', 'N/A')}\n"
        f"Content: {page.get('text', 'No text extracted.')[:max_chars_per_page]}..."
        for page in crawled_content
    )

    # All gathered text is passed in as template variables, so braces inside
    # web content are inserted literally and need no manual escaping.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are an Information Processor. Synthesize information from both the provided search snippets and the full page content fetched from relevant pages.
        Use the search snippets for context and diverse sources, and the full page content for detailed information.
        Extract key facts and answer the research questions listed in the user message (if any are provided).
        Structure the information logically. If conflicting information is found between sources, note it.
        Present the synthesized information clearly and concisely."""),
        ("user", """Research Questions:
{research_plan}

---

Raw Search Snippets:
{search_results}

---

Full Page Content:
{crawled_content}""")
    ])

    chain = prompt | llm | StrOutputParser()
    processed_info = chain.invoke({
        "research_plan": formatted_plan,
        "search_results": formatted_search_results,
        "crawled_content": formatted_crawled_content,
    })

    print("Information processing complete.")
    return {"processed_information": processed_info}


# --- 2e. Drafting Agent (Your Code) ---

def drafting_node(state: ResearchState) -> Dict[str, Any]:
    """Draft (or revise) the answer from the processed information.

    On the first pass the draft is written from ``processed_information``
    alone. When the reviewer has set ``review_decision`` to ``"REVISE"`` and a
    previous draft exists, that draft and the reviewer's feedback are added to
    the prompt, so the new draft can respond to the review rather than being
    regenerated from the same input.

    Args:
        state: Graph state; ``user_query``, ``processed_information``,
            ``draft_answer``, ``review_feedback`` and ``review_decision`` are
            read and may each be absent.

    Returns:
        ``{"draft_answer": text}``. The text is a fixed fallback sentence when
        no usable processed information is available.
    """
    print("\n---Executing Drafting Agent---")
    user_query = state.get('user_query', 'N/A - Original query missing.')
    processed_info = state.get('processed_information', '')

    if not processed_info or processed_info == "No information found.":
        print("Drafting complete.")
        return {"draft_answer": "Could not gather sufficient information to answer your query."}

    previous_draft = state.get('draft_answer', '')
    review_feedback = state.get('review_feedback', '')
    is_revision = state.get('review_decision') == "REVISE" and bool(previous_draft)

    # All dynamic text is passed in as template variables, so braces inside
    # the query or the gathered content are inserted literally.
    user_template = "Processed Information:\n{processed_info}"
    prompt_inputs = {"user_query": user_query, "processed_info": processed_info}
    if is_revision:
        user_template += (
            "\n\n---\nPrevious Draft (needs revision):\n{previous_draft}"
            "\n\nReviewer Feedback:\n{review_feedback}"
            "\n\nRevise the previous draft so that it addresses the reviewer feedback."
        )
        prompt_inputs["previous_draft"] = previous_draft
        prompt_inputs["review_feedback"] = review_feedback

    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Drafting Agent. Write a comprehensive answer to the user's original query based ONLY on the processed information provided.
            Ensure the answer is clear, well-structured, and directly addresses the user's request.
            User Query: {user_query}"""),
        ("user", user_template)
    ])

    chain = prompt | llm | StrOutputParser()

    draft = chain.invoke(prompt_inputs)
    print("Drafting complete.")
    return {"draft_answer": draft}


# --- 2f. Reviewer Node (Implementation Provided Previously) ---
def reviewer_node(state: ResearchState) -> Dict[str, Any]:
    """Review the draft answer and decide whether it must be revised.

    Args:
        state: Graph state; ``user_query``, ``draft_answer`` and
            ``processed_information`` are read and may each be absent.

    Returns:
        ``{"review_feedback": text, "review_decision": decision}`` where
        ``decision`` is ``"REVISE"`` only if the first line of the model reply
        is ``DECISION: REVISE ...``. Every other outcome, including a reply
        that ignores the required format, yields ``"END"``.
    """
    print("\n---Executing Reviewer---")
    user_query = state.get('user_query', 'N/A - Original query missing.')
    draft_answer = state.get('draft_answer', '')
    # Processed info is given to the model as the reference for accuracy checks.
    processed_info = state.get('processed_information', 'N/A - Could not retrieve processed info.')

    if not draft_answer or draft_answer == "Could not gather sufficient information to answer your query.":
        print("No valid draft to review. Ending process.")
        return {"review_feedback": "Review skipped: No valid draft generated.", "review_decision": "END"}

    # Dynamic text is passed in as template variables, so braces inside it are
    # inserted literally. Developer notes are kept out of the prompt string
    # itself so they are not sent to the model.
    prompt = ChatPromptTemplate.from_messages([
        ("system", """You are a Reviewer Agent. Your task is to review a draft answer based on the user's original query and the processed information used to create the draft.
        Provide constructive feedback on clarity, accuracy, completeness, grammar, and style.
        Based on your review, decide if the draft requires significant revisions or is acceptable as a final answer.
        Your final decision determines the next step in the workflow.

        **IMPORTANT:** Your response MUST start with one of the following exact phrases on the first line, followed by your feedback:
        DECISION: REVISE - The draft needs significant changes.
        DECISION: ACCEPT - The draft is acceptable.

        ---
        Original Query: {user_query}
        Processed Information (Reference):
        {processed_info}
        ---"""),
        ("user", "Draft Answer to Review:\n{draft_answer}\n---")
    ])

    chain = prompt | llm | StrOutputParser()

    review_output = chain.invoke({
        "user_query": user_query,
        "processed_info": processed_info,
        "draft_answer": draft_answer,
    })

    print(f"Raw Reviewer Output:\n{review_output}")

    # Split the reply into its first line (the decision) and the rest (the feedback).
    first_line, _, remainder = review_output.strip().partition('\n')
    decision = "END"  # Default decision: anything that is not an explicit REVISE ends the workflow
    if first_line.startswith("DECISION:"):
        verdict = first_line[len("DECISION:"):].strip().upper()
        # Match the verdict word at the start of the line only, so an ACCEPT
        # line that merely mentions "revise" later on is not misread as REVISE.
        if verdict.startswith("REVISE"):
            decision = "REVISE"
        feedback = remainder.strip() or "No specific feedback provided."
    else:
        feedback = f"Reviewer did not follow format. Full output: {review_output}"

    print(f"Parsed Review Decision: {decision}")
    print(f"Parsed Review Feedback: {feedback}")

    return {"review_feedback": feedback, "review_decision": decision}


def route_review(state: ResearchState) -> str:
    """
    Conditional-edge router evaluated after the reviewer node.

    Reads ``review_decision`` from the state (treated as ``'END'`` when
    absent) and returns the key the graph's edge mapping uses to pick the
    next step: ``"REVISE"`` for another drafting pass, ``"END"`` for
    everything else.
    """
    print("\n---Executing Review Router (Function)---")
    verdict = state.get('review_decision', 'END')
    print(f"Reviewer decision received: {verdict}")

    # Anything other than an explicit REVISE terminates the workflow.
    if verdict != "REVISE":
        print("Review accepted or decision unclear. Ending workflow.")
        return "END"

    print("Routing back to Drafting Agent for revision.")
    return "REVISE"




In [25]:
# Upgrades packages that the first cell already installed and that earlier cells have already imported.
# NOTE(review): a running kernel presumably keeps using the previously imported versions until it is
# restarted — confirm, and consider folding this into the first install cell.
!pip install -qU langchain langgraph langchain-community langchain-google-genai tavily-python requests beautifulsoup4

In [26]:
# --- 3. Define the Workflow Graph (LangGraph) ---

workflow = StateGraph(ResearchState)

# Agent nodes, listed in pipeline order. The router function (route_review) is
# deliberately absent: it is attached as a conditional edge, not as a node.
agent_nodes = [
    ("query_analyst", query_analyst_node),
    ("research_coordinator", research_coordinator_node),
    ("researcher", researcher_node),
    ("crawler", crawler_node),
    ("information_processor", information_processor_node),
    ("drafting_agent", drafting_node),
    ("reviewer", reviewer_node),
]
for node_name, node_fn in agent_nodes:
    workflow.add_node(node_name, node_fn)

# The graph starts at the first agent.
workflow.set_entry_point("query_analyst")

# Chain each agent to the one listed after it:
# analyst -> coordinator -> researcher -> crawler -> processor -> drafter -> reviewer.
pipeline_order = [node_name for node_name, _ in agent_nodes]
for upstream, downstream in zip(pipeline_order, pipeline_order[1:]):
    workflow.add_edge(upstream, downstream)

# After the review, either loop back to the drafter or finish, depending on
# the key returned by route_review.
review_routes = {"REVISE": "drafting_agent", "END": END}
workflow.add_conditional_edges("reviewer", route_review, review_routes)

# --- 4. Compile the graph ---
app = workflow.compile()

print("\nLangGraph workflow compiled with all agents and conditional routing.")


LangGraph workflow compiled with all agents and conditional routing.


In [28]:
# --- 5. Run the Graph (Example Usage) ---

user_query = "Explain the concept of Deep Learning and advancment in it"

# Seed the state with the query only; every other field starts empty and is
# filled in by the nodes as the graph runs.
initial_state = dict(
    user_query=user_query,
    research_plan=[],
    search_queries=[],
    raw_search_results=[],
    crawled_content=[],
    processed_information="",
    draft_answer="",
    review_feedback="",
    review_decision="",
)

print(f"\nStarting workflow for query: {user_query}")

# invoke() runs the graph to completion and returns the final state.
final_state = app.invoke(initial_state)

# --- Access the final output ---
print("\n--- Workflow Finished ---")

# (heading, state key, text shown when the key is missing)
report_sections = (
    ("\n--- Final Answer Draft ---", 'draft_answer', 'No draft generated.'),
    ("\n--- Final Review Feedback ---", 'review_feedback', 'No review feedback.'),
    ("\n--- Final Review Decision ---", 'review_decision', 'N/A'),
)
for heading, state_key, fallback in report_sections:
    print(heading)
    print(final_state.get(state_key, fallback))


Starting workflow for query: Explain the concept of Deep Learning and advancment in it

---Executing Query Analyst---
Generated Research Plan: ['1. What is the fundamental definition of Deep Learning?', '2. What are the core components and architecture of a Deep Learning model (e.g., layers, neurons, activation functions)?', '3. What are the different types of Deep Learning architectures (e.g., Convolutional Neural Networks (CNNs), Recurrent Neural Networks (RNNs), Generative Adversarial Networks (GANs)) and their respective applications?', '4. What are the key advancements in Deep Learning in the last 5-10 years?', '5. What are the major breakthroughs and milestones achieved in Deep Learning research?', '6. How has the computational power and availability of large datasets influenced the advancements in Deep Learning?', '7. What are the current limitations and challenges in Deep Learning?', '8. What are the ethical considerations surrounding the advancements and applications of Deep 