# Web Research Assistant Workflow
🧩 **Workflow Architecture**: User Query → Web Search → Content Loading → Document Summarization → Report Writing → Citation Caching

This notebook implements a comprehensive research workflow using LangGraph.

In [67]:
# Import all required dependencies
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_tavily import TavilySearch
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_community.document_loaders import WebBaseLoader
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import MemorySaver
from typing_extensions import TypedDict
from typing import List, Dict, Any, Optional
from dotenv import load_dotenv
import os
import json
import time

In [68]:
# Load environment variables
load_dotenv()

# Initialize the LLM
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    temperature=0.1
)

# Initialize Tavily Search
search_tool = TavilySearch(
    max_results=5,
    topic="general"
)

In [69]:
# Define the workflow state schema
class ResearchState(TypedDict):
    """State schema for the research workflow"""
    query: str                          # User's research query
    search_results: List[Dict[str, Any]]  # Raw search results from Tavily
    page_contents: List[Dict[str, Any]]   # Extracted content from web pages
    summaries: List[Dict[str, str]]       # Individual page summaries
    final_report: str                     # Combined research report
    citations: List[Dict[str, str]]       # Citations and references
    error_message: Optional[str]          # Error handling

In [70]:
# Web Search Agent Node
def web_search_agent(state: ResearchState) -> ResearchState:
    """
    Search multiple sources using TavilySearch and return structured results
    """
    try:
        query = state["query"]
        print(f"🔍 Searching for: {query}")
        
        # Perform web search
        search_results = search_tool.invoke(query)
        
        # Extract and structure the results
        structured_results = []
        if isinstance(search_results, list):
            for result in search_results:
                structured_results.append({
                    'title': result.get('title', 'No title'),
                    'url': result.get('url', ''),
                    'content': result.get('content', ''),
                    'score': result.get('score', 0)
                })
        else:
            # Handle case where search_results is a dict with 'results' key
            results_list = search_results.get('results', [])
            for result in results_list:
                structured_results.append({
                    'title': result.get('title', 'No title'),
                    'url': result.get('url', ''),
                    'content': result.get('content', ''),
                    'score': result.get('score', 0)
                })
        
        print(f"✅ Found {len(structured_results)} search results")
        return {**state, "search_results": structured_results}
        
    except Exception as e:
        print(f"❌ Search error: {str(e)}")
        return {**state, "error_message": f"Search failed: {str(e)}"}

In [71]:
# Content Loader Node
def content_loader(state: ResearchState) -> ResearchState:
    """
    Extract and clean content from search result URLs using LangChain WebBaseLoader
    """
    try:
        search_results = state.get("search_results", [])
        page_contents = []
        
        print(f"📄 Loading content from {len(search_results)} pages using WebBaseLoader...")
        
        for i, result in enumerate(search_results):
            try:
                url = result.get('url', '')
                if not url:
                    continue
                    
                print(f"  Loading page {i+1}: {url[:50]}...")
                
                # Use LangChain's WebBaseLoader for robust web content extraction
                loader = WebBaseLoader(
                    web_paths=[url],
                    header_template={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                    },
                    verify_ssl=False  # Handle SSL issues gracefully
                )
                
                # Load and parse the document
                documents = loader.load()
                
                if documents:
                    # Get the first document (should only be one for single URL)
                    doc = documents[0]
                    content = doc.page_content
                    
                    # Clean up the content - remove excessive whitespace
                    lines = (line.strip() for line in content.splitlines())
                    clean_content = '\n'.join(line for line in lines if line)
                    
                    # Limit content length to avoid token limits
                    if len(clean_content) > 4000:
                        clean_content = clean_content[:4000] + "..."
                    
                    page_contents.append({
                        'title': result.get('title', 'No title'),
                        'url': url,
                        'content': clean_content,
                        'original_content': result.get('content', ''),  # Keep Tavily's content as backup
                        'score': result.get('score', 0),
                        'metadata': doc.metadata  # Include metadata from loader
                    })
                    
                    print(f"    ✅ Successfully loaded {len(clean_content)} characters")
                    
                else:
                    raise Exception("No documents returned from WebBaseLoader")
                
            except Exception as e:
                print(f"  ⚠️  Failed to load {url}: {str(e)}")
                # Use Tavily's content as fallback
                fallback_content = result.get('content', 'Content not available')
                page_contents.append({
                    'title': result.get('title', 'No title'),
                    'url': result.get('url', ''),
                    'content': fallback_content,
                    'original_content': result.get('content', ''),
                    'score': result.get('score', 0),
                    'metadata': {'source': 'tavily_fallback'}
                })
                print(f"    ↳ Using Tavily fallback content ({len(fallback_content)} chars)")
            
            # Small delay to be respectful to servers
            time.sleep(0.5)
        
        print(f"✅ Loaded content from {len(page_contents)} pages")
        return {**state, "page_contents": page_contents}
        
    except Exception as e:
        print(f"❌ Content loading error: {str(e)}")
        return {**state, "error_message": f"Content loading failed: {str(e)}"}

In [72]:
# Document Summarizer Node
def document_summarizer(state: ResearchState) -> ResearchState:
    """
    LLM node to summarize individual documents/pages with source tracking
    """
    try:
        page_contents = state.get("page_contents", [])
        query = state.get("query", "")
        summaries = []
        
        print(f"📝 Summarizing {len(page_contents)} documents...")
        
        # Create summarization prompt
        summarize_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert research assistant. Your task is to summarize web page content 
            in relation to a specific research query. Focus on extracting the most relevant information 
            that directly addresses the research question.
            
            Guidelines:
            - Extract key facts, statistics, and insights relevant to the query
            - Maintain factual accuracy and avoid adding interpretations
            - Keep summaries concise but informative (200-300 words)
            - Include specific details like dates, numbers, names when relevant
            - If the content is not relevant to the query, mention that clearly"""),
            ("human", """Research Query: {query}
            
            Web Page Title: {title}
            Web Page URL: {url}
            
            Content to Summarize:
            {content}
            
            Please provide a focused summary of this content in relation to the research query.""")
        ])
        
        for i, page in enumerate(page_contents):
            try:
                print(f"  Summarizing page {i+1}: {page['title'][:40]}...")
                
                # Create summary using LLM
                formatted_prompt = summarize_prompt.format_messages(
                    query=query,
                    title=page['title'],
                    url=page['url'],
                    content=page['content']
                )
                
                response = model.invoke(formatted_prompt)
                summary_text = response.content
                
                summaries.append({
                    'title': page['title'],
                    'url': page['url'],
                    'summary': summary_text,
                    'score': page.get('score', 0)
                })
                
            except Exception as e:
                print(f"  ⚠️  Failed to summarize {page['title']}: {str(e)}")
                summaries.append({
                    'title': page['title'],
                    'url': page['url'],
                    'summary': f"Summary failed: {str(e)}",
                    'score': page.get('score', 0)
                })
        
        print(f"✅ Generated {len(summaries)} summaries")
        return {**state, "summaries": summaries}
        
    except Exception as e:
        print(f"❌ Summarization error: {str(e)}")
        return {**state, "error_message": f"Summarization failed: {str(e)}"}

In [73]:
# Report Writer Node
def report_writer(state: ResearchState) -> ResearchState:
    """
    LLM node to combine all summaries into coherent structured report
    """
    try:
        summaries = state.get("summaries", [])
        query = state.get("query", "")
        
        print(f"📊 Writing comprehensive report...")
        
        # Prepare summaries text for the prompt
        summaries_text = ""
        for i, summary in enumerate(summaries, 1):
            summaries_text += f"\n\n--- Source {i} ---\n"
            summaries_text += f"Title: {summary['title']}\n"
            summaries_text += f"URL: {summary['url']}\n"
            summaries_text += f"Summary: {summary['summary']}\n"
        
        # Create report writing prompt
        report_prompt = ChatPromptTemplate.from_messages([
            ("system", """You are an expert research analyst tasked with creating comprehensive, 
            well-structured research reports. Your goal is to synthesize information from multiple 
            sources into a coherent, insightful report.
            
            Report Structure:
            1. **Executive Summary** - Brief overview of key findings
            2. **Introduction** - Context and background of the research topic
            3. **Key Findings** - Main insights organized by themes/topics
            4. **Analysis** - Deeper analysis, trends, and implications
            5. **Conclusion** - Summary of main takeaways and implications
            6. **Sources** - List all sources with their URLs
            
            Guidelines:
            - Use clear, professional language
            - Organize information logically with proper headings
            - Cite sources appropriately [1], [2], etc.
            - Highlight key statistics, dates, and facts
            - Draw connections between different sources
            - Maintain objectivity and factual accuracy"""),
            ("human", """Research Query: {query}
            
            Source Summaries:
            {summaries}
            
            Please create a comprehensive research report that synthesizes all the information above 
            into a well-structured, professional document that thoroughly addresses the research query.""")
        ])
        
        # Generate the report
        formatted_prompt = report_prompt.format_messages(
            query=query,
            summaries=summaries_text
        )
        
        response = model.invoke(formatted_prompt)
        final_report = response.content
        
        print(f"✅ Generated comprehensive report ({len(final_report)} characters)")
        return {**state, "final_report": final_report}
        
    except Exception as e:
        print(f"❌ Report writing error: {str(e)}")
        return {**state, "error_message": f"Report writing failed: {str(e)}"}

In [74]:
# Citation Cache Node
def citation_cache(state: ResearchState) -> ResearchState:
    """
    Function to manage citations and cache results for future use
    """
    try:
        summaries = state.get("summaries", [])
        query = state.get("query", "")
        
        print(f"📚 Processing citations and caching results...")
        
        # Create structured citations
        citations = []
        for i, summary in enumerate(summaries, 1):
            citation = {
                'id': i,
                'title': summary['title'],
                'url': summary['url'],
                'access_date': time.strftime("%Y-%m-%d"),
                'relevance_score': summary.get('score', 0),
                'citation_format': f"[{i}] {summary['title']}. Retrieved {time.strftime('%Y-%m-%d')}. {summary['url']}"
            }
            citations.append(citation)
        
        # Cache the research session (in a real application, this would go to a database)
        cache_entry = {
            'query': query,
            'timestamp': time.strftime("%Y-%m-%d %H:%M:%S"),
            'sources_count': len(summaries),
            'citations': citations,
            'report_length': len(state.get("final_report", "")),
            'status': 'completed'
        }
        
        # In a real application, you would save this to a database or file
        print(f"✅ Cached research session with {len(citations)} citations")
        
        return {**state, "citations": citations}
        
    except Exception as e:
        print(f"❌ Citation caching error: {str(e)}")
        return {**state, "error_message": f"Citation caching failed: {str(e)}"}

In [75]:
# Construct LangGraph Workflow
def create_research_workflow():
    """
    Wire all nodes together in proper sequence with conditional logic
    """
    # Create the workflow graph
    workflow = StateGraph(ResearchState)
    
    # Add all nodes to the workflow
    workflow.add_node("web_search", web_search_agent)
    workflow.add_node("content_loader", content_loader)
    workflow.add_node("summarizer", document_summarizer)
    workflow.add_node("report_writer", report_writer)
    workflow.add_node("citation_cache", citation_cache)
    
    # Define the workflow edges (sequence)
    workflow.add_edge(START, "web_search")
    workflow.add_edge("web_search", "content_loader")
    workflow.add_edge("content_loader", "summarizer")
    workflow.add_edge("summarizer", "report_writer")
    workflow.add_edge("report_writer", "citation_cache")
    workflow.add_edge("citation_cache", END)
    
    # Compile the workflow with memory
    memory = MemorySaver()
    compiled_workflow = workflow.compile(checkpointer=memory)
    
    return compiled_workflow

# Create the workflow instance
research_workflow = create_research_workflow()

print("🚀 Research workflow created successfully!")
print("\nWorkflow nodes:")
print("  1. web_search - Search multiple sources")
print("  2. content_loader - Extract page content")
print("  3. summarizer - Summarize each document")
print("  4. report_writer - Create comprehensive report")
print("  5. citation_cache - Process citations and cache")

🚀 Research workflow created successfully!

Workflow nodes:
  1. web_search - Search multiple sources
  2. content_loader - Extract page content
  3. summarizer - Summarize each document
  4. report_writer - Create comprehensive report
  5. citation_cache - Process citations and cache


In [76]:
# Test the Complete Workflow
def run_research(query: str, thread_id: str = "test_session"):
    """
    Run end-to-end research workflow
    """
    print(f"🔬 Starting research workflow for: '{query}'")
    print("=" * 60)
    
    # Initial state
    initial_state = {
        "query": query,
        "search_results": [],
        "page_contents": [],
        "summaries": [],
        "final_report": "",
        "citations": [],
        "error_message": None
    }
    
    # Configure thread
    config = {"configurable": {"thread_id": thread_id}}
    
    try:
        # Run the workflow
        result = research_workflow.invoke(initial_state, config)
        
        print("\n" + "=" * 60)
        print("🎉 RESEARCH COMPLETED SUCCESSFULLY!")
        print("=" * 60)
        
        return result
        
    except Exception as e:
        print(f"\n❌ Workflow failed: {str(e)}")
        return None

# Example usage
print("Ready to run research! Use the function like this:")
print("result = run_research('Your research question here')")

Ready to run research! Use the function like this:
result = run_research('Your research question here')


In [77]:
# Utility function to display results
def display_research_results(result):
    """
    Display research results in a nice format
    """
    if not result:
        print("❌ No results to display")
        return
    
    print("\n📊 RESEARCH SUMMARY")
    print("=" * 50)
    print(f"Query: {result.get('query', 'N/A')}")
    print(f"Sources found: {len(result.get('search_results', []))}")
    print(f"Pages processed: {len(result.get('page_contents', []))}")
    print(f"Summaries generated: {len(result.get('summaries', []))}")
    print(f"Citations: {len(result.get('citations', []))}")
    
    if result.get('error_message'):
        print(f"⚠️  Error: {result['error_message']}")
    
    print("\n📝 FINAL REPORT")
    print("=" * 50)
    report = result.get('final_report', 'No report generated')
    print(report)
    
    print("\n📚 CITATIONS")
    print("=" * 50)
    citations = result.get('citations', [])
    for citation in citations:
        print(citation.get('citation_format', 'Citation format error'))
    
    print("\n✅ Research completed!")

In [78]:
# Run a test research
# Uncomment and run the lines below to test the workflow

research_result = run_research("Latest advancements in renewable energy technologies 2024")
display_research_results(research_result)



🔬 Starting research workflow for: 'Latest advancements in renewable energy technologies 2024'
🔍 Searching for: Latest advancements in renewable energy technologies 2024
✅ Found 5 search results
📄 Loading content from 5 pages using WebBaseLoader...
  Loading page 1: https://bcse.org/market-trends-2024/2024-key-trend...
✅ Found 5 search results
📄 Loading content from 5 pages using WebBaseLoader...
  Loading page 1: https://bcse.org/market-trends-2024/2024-key-trend...




    ✅ Successfully loaded 4003 characters
  Loading page 2: https://www.iea.org/reports/renewables-2024...
  Loading page 2: https://www.iea.org/reports/renewables-2024...




    ✅ Successfully loaded 4003 characters
  Loading page 3: https://premierscience.com/wp-content/uploads/2024...
  Loading page 3: https://premierscience.com/wp-content/uploads/2024...




    ✅ Successfully loaded 4003 characters
  Loading page 4: https://www.sciencedirect.com/science/article/abs/...
  Loading page 4: https://www.sciencedirect.com/science/article/abs/...




    ✅ Successfully loaded 682 characters
  Loading page 5: https://firstignite.com/exploring-the-latest-green...
  Loading page 5: https://firstignite.com/exploring-the-latest-green...




    ✅ Successfully loaded 4003 characters
✅ Loaded content from 5 pages
📝 Summarizing 5 documents...
  Summarizing page 1: 2024 Key Trends - Business Council for S...
✅ Loaded content from 5 pages
📝 Summarizing 5 documents...
  Summarizing page 1: 2024 Key Trends - Business Council for S...
  Summarizing page 2: Renewables 2024 – Analysis - IEA...
  Summarizing page 2: Renewables 2024 – Analysis - IEA...
  Summarizing page 3: [PDF] Advancements in Renewable Energy T...
  Summarizing page 3: [PDF] Advancements in Renewable Energy T...
  Summarizing page 4: Advancements in Renewable Energy: Innova...
  Summarizing page 4: Advancements in Renewable Energy: Innova...
  Summarizing page 5: Green Energy Technology Advances 2024: A...
  Summarizing page 5: Green Energy Technology Advances 2024: A...
✅ Generated 5 summaries
📊 Writing comprehensive report...
✅ Generated 5 summaries
📊 Writing comprehensive report...
✅ Generated comprehensive report (6818 characters)
📚 Processing citations and ca

In [88]:
# Sample Research Queries and Usage Examples
print("🔬 SAMPLE RESEARCH QUERIES:")
print("=" * 50)

sample_queries = [
    "Latest developments in artificial intelligence 2024",
    "Climate change impacts on global agriculture",
    "Quantum computing breakthroughs and applications",
    "Sustainable transportation innovations",
    "Cybersecurity trends and threats 2024"
]

for i, query in enumerate(sample_queries, 1):
    print(f"{i}. {query}")

print(f"\n💡 Usage:")
print("research_result = run_research('Your research question here')")
print("show_complete_results()  # To display the full report")

print(f"\n🔧 Workflow Features:")
print("✅ LangChain WebBaseLoader for robust content extraction")
print("✅ Automatic error handling and SSL fallback")
print("✅ Multi-source document summarization") 
print("✅ Comprehensive report generation with citations")
print("✅ Professional academic-style output formatting")

🔬 SAMPLE RESEARCH QUERIES:
1. Latest developments in artificial intelligence 2024
2. Climate change impacts on global agriculture
3. Quantum computing breakthroughs and applications
4. Sustainable transportation innovations
5. Cybersecurity trends and threats 2024

💡 Usage:
research_result = run_research('Your research question here')
show_complete_results()  # To display the full report

🔧 Workflow Features:
✅ LangChain WebBaseLoader for robust content extraction
✅ Automatic error handling and SSL fallback
✅ Multi-source document summarization
✅ Comprehensive report generation with citations
✅ Professional academic-style output formatting


In [89]:
# Complete Research Results Display
def show_complete_results():
    """Display comprehensive research results"""
    try:
        if 'research_result' in locals() and research_result:
            print("📊 RESEARCH WORKFLOW RESULTS:")
            print("=" * 60)
            
            # Basic stats
            query = research_result.get('query', 'N/A')
            sources = len(research_result.get('search_results', []))
            pages = len(research_result.get('page_contents', []))
            summaries = len(research_result.get('summaries', []))
            citations = len(research_result.get('citations', []))
            report = research_result.get('final_report', '')
            
            print(f"🎯 Query: {query}")
            print(f"🔍 Sources: {sources} | 📄 Pages: {pages} | 📝 Summaries: {summaries}")
            print(f"📚 Citations: {citations} | 📋 Report Length: {len(report):,} chars")
            
            # Display full report
            print(f"\n📖 COMPLETE RESEARCH REPORT:")
            print("-" * 60)
            print(report)
            print("-" * 60)
            
            # Display citations
            print(f"\n📚 CITATIONS:")
            print("-" * 40)
            citations_list = research_result.get('citations', [])
            for citation in citations_list:
                print(f"  {citation.get('citation_format', 'Citation error')}")
            
            print(f"\n✅ Research completed successfully!")
            
        else:
            print("❌ No research results found.")
            print("Run: research_result = run_research('Your query here')")
            
    except NameError:
        print("❌ Research results not available")
        print("Run: research_result = run_research('Your query here')")

# Show the results
show_complete_results()

❌ No research results found.
Run: research_result = run_research('Your query here')
