In [1]:
from dotenv import load_dotenv
import os
import json
# Load variables from a .env file into the process environment (python-dotenv);
# tavily_search below reads TAVILY_API_KEY via os.getenv.
load_dotenv()

True

In [2]:
import operator 
from dataclasses import dataclass, field
from typing_extensions import TypedDict, Annotated, Literal

# All fields must be passed as keyword arguments.
@dataclass(kw_only=True)
class SummaryState:
    """Working state threaded through the research steps defined in this notebook.

    Every field has a default, so ``SummaryState()`` is valid; the pipeline
    creates it with only ``research_topic`` and fills in the rest as it runs.
    """
    # NOTE(review): the two list fields carry an `operator.add` annotation, presumably
    # intended as an append-style reducer for a graph framework - the plain Python
    # pipeline in this notebook does not interpret it.
    research_topic: str = field(default=None) # Report topic (None until provided)
    search_query : str = field(default=None) # Current web search query
    web_research_result : Annotated[list, operator.add] = field(default_factory=list) # Formatted source-text strings produced by web_search (returned there under the key "web_research_results")
    sources_gathered : Annotated[list, operator.add] = field(default_factory=list) # Bullet-list strings of "title : url" produced by format_sources
    research_loop_count: int = field(default=0) # Number of web searches performed so far
    running_summary: str = field(default=None) # Summary text; becomes the final report after finalize_summary


# Input schema: only the report topic is supplied by the caller.
# NOTE(review): stacking @dataclass on a TypedDict subclass is unusual; this class is
# not referenced by any other cell visible here - confirm it is still needed.
@dataclass(kw_only=True)
class SummaryStateInput(TypedDict):
    research_topic: str = field(default=None) # Report topic

# Output schema: only the final report text is exposed.
# NOTE(review): stacking @dataclass on a TypedDict subclass is unusual; this class is
# not referenced by any other cell visible here - confirm it is still needed.
@dataclass(kw_only=True)
class SummaryStateOutput(TypedDict):
    running_summary: str = field(default=None) # Final report

In [3]:
# System prompt for generate_query. It is filled in with str.format(research_topic=...),
# so the doubled braces ({{ }}) are escapes that render as literal JSON braces.
# generate_query reads only the "query" key of the JSON the model returns.
query_writer_instructions="""Your goal is to generate targeted web search query.

The query will gather information related to a specific topic.

Topic:
{research_topic}

Return your query as a JSON object:
{{
    "query": "string",
    "aspect": "string",
    "rationale": "string"
}}
"""

# System prompt for summarize_sources. It is passed to the model verbatim (no
# str.format call), and covers both creating a new summary and extending an existing one.
summarizer_instructions="""Your goal is to generate a high-quality summary of the web search results.

When EXTENDING an existing summary:
1. Seamlessly integrate new information without repeating what's already covered
2. Maintain consistency with the existing content's style and depth
3. Only add new, non-redundant information
4. Ensure smooth transitions between existing and new content

When creating a NEW summary:
1. Highlight the most relevant information from each source
2. Provide a concise overview of the key points related to the report topic
3. Emphasize significant findings or insights
4. Ensure a coherent flow of information

CRITICAL REQUIREMENTS:
- Start IMMEDIATELY with the summary content - no introductions or meta-commentary
- DO NOT include ANY of the following:
  * Phrases about your thought process ("Let me start by...", "I should...", "I'll...")
  * Explanations of what you're going to do
  * Statements about understanding or analyzing the sources
  * Mentions of summary extension or integration
- Focus ONLY on factual, objective information
- Maintain a consistent technical depth
- Avoid redundancy and repetition
- DO NOT use phrases like "based on the new results" or "according to additional sources"
- DO NOT add a References or Works Cited section
- DO NOT use any XML-style tags like <think> or <answer>
- Begin directly with the summary text without any tags, prefixes, or meta-commentary
"""

# System prompt for reflect_on_summary. It is filled in with
# str.format(research_topic=...), so the doubled braces ({{ }}) are escapes for literal
# JSON braces. reflect_on_summary reads only the "follow_up_query" key of the reply.
reflection_instructions = """You are an expert research assistant analyzing a summary about {research_topic}.

Your tasks:
1. Identify knowledge gaps or areas that need deeper exploration
2. Generate a follow-up question that would help expand your understanding
3. Focus on technical details, implementation specifics, or emerging trends that weren't fully covered

Ensure the follow-up question is self-contained and includes necessary context for web search.

Return your analysis as a JSON object:
{{ 
    "knowledge_gap": "string",
    "follow_up_query": "string"
}}"""

In [4]:
import json

In [5]:
import os
from dataclasses import dataclass, field, fields
from typing import Any, Optional

from langchain_core.runnables import RunnableConfig
from typing_extensions import Annotated
from dataclasses import dataclass

@dataclass(kw_only=True)
class Configuration:
    """The configurable fields for the research assistant."""
    max_web_research_loops: int = 3
    local_llm: str = "deepseek-r1:1.5b"

    @classmethod  # use method without creating class instance
    def from_runnable_config(
        cls, config: Optional[RunnableConfig] = None
    ) -> "Configuration":
        """Create a Configuration instance from a RunnableConfig."""
        configurable = (
            config["configurable"] if config and "configurable" in config else {}
        )
        values: dict[str, Any] = {
            f.name: os.environ.get(f.name.upper(), configurable.get(f.name))
            for f in fields(cls)
            if f.init
        }
        return cls(**{k: v for k, v in values.items() if v})

In [6]:
from tavily import TavilyClient

def tavily_search(query, include_raw_content=True, max_results=3):
    """Run a web search through the Tavily API.

    The API key is read from the ``TAVILY_API_KEY`` environment variable.

    Args:
        query (str): Search query text.
        include_raw_content (bool): Forwarded to Tavily; asks for the full page
            content in addition to the snippet.
        max_results (int): Maximum number of results to request.

    Returns:
        dict: Tavily search response containing:
            - results (list): List of search result dictionaries, each containing:
                - title (str): Title of the search result
                - url (str): URL of the search result
                - content (str): Snippet/summary of the content
                - raw_content (str): Full content of the page if available
    """
    api_key = os.getenv("TAVILY_API_KEY")
    tavily_client = TavilyClient(api_key=api_key)
    return tavily_client.search(
        query,
        max_results=max_results,
        include_raw_content=include_raw_content,
    )


def deduplicate_and_format_sources(search_response, max_tokens_per_source, include_raw_content=True):
    """Flatten, de-duplicate and render search results as one text block.

    Results are de-duplicated by their ``url`` (the first occurrence wins) and
    rendered in encounter order under a ``Sources:`` heading.

    Args:
        search_response: Either a dict with a ``'results'`` list, or a list whose
            items are such dicts or plain lists of result dicts.
        max_tokens_per_source: Approximate token budget for each source's raw
            content; enforced as ``max_tokens_per_source * 4`` characters.
        include_raw_content: When true, append each source's (possibly
            truncated) ``raw_content`` after its snippet.

    Returns:
        str: The formatted sources with surrounding whitespace stripped.

    Raises:
        ValueError: If ``search_response`` is neither a dict nor a list.
    """
    # Normalise the input into one flat sequence of result dicts.
    if isinstance(search_response, dict):
        collected = search_response['results']
    elif isinstance(search_response, list):
        collected = []
        for entry in search_response:
            wraps_results = isinstance(entry, dict) and 'results' in entry
            collected.extend(entry['results'] if wraps_results else entry)
    else:
        raise ValueError("Input must be either a dict with 'results' or a list of search results")

    # First occurrence of each URL wins; dict insertion order keeps the ranking.
    by_url = {}
    for item in collected:
        by_url.setdefault(item['url'], item)

    pieces = ["Sources:\n\n"]
    for item in by_url.values():
        pieces.append(f"Source {item['title']}:\n===\n")
        pieces.append(f"URL: {item['url']}\n===\n")
        pieces.append(f"Most relevant content from source: {item['content']}\n===\n")
        if not include_raw_content:
            continue

        # Rough estimate: one token is about four characters.
        budget = max_tokens_per_source * 4
        page_text = item.get('raw_content', '')
        if page_text is None:
            # An explicit None means no page text was provided for this result.
            page_text = ''
            print(f"Warning: No raw_content found for source {item['url']}")
        if len(page_text) > budget:
            page_text = page_text[:budget] + "... [truncated]"
        pieces.append(f"Full source content limited to {max_tokens_per_source} tokens: {page_text}\n\n")

    return "".join(pieces).strip()

def format_sources(search_results):
    """Render each search result as a ``* title : url`` bullet line.

    Args:
        search_results (dict): Search response with a ``'results'`` list whose
            items provide ``'title'`` and ``'url'``.

    Returns:
        str: One bullet per result, joined by newlines (empty string when there
        are no results).
    """
    bullet_lines = []
    for entry in search_results['results']:
        bullet_lines.append(f"* {entry['title']} : {entry['url']}")
    return '\n'.join(bullet_lines)

In [7]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_ollama import ChatOllama


def generate_query(state : SummaryState, model_name="deepseek-r1:1.5b"):
    """Ask the local LLM for an initial web search query about the research topic.

    Args:
        state: Current state; only ``state.research_topic`` is read.
        model_name: Ollama model name passed to ``ChatOllama``.

    Returns:
        dict: ``{"search_query": <query>}``. If the model's reply is not a JSON
        object with a non-empty ``"query"`` entry, the research topic itself is
        used as the query so the pipeline can still proceed.
    """
    query_writer_instructions_formatted = query_writer_instructions.format(research_topic=state.research_topic)

    llm_json_mode = ChatOllama(model=model_name, temperature=0, format="json")
    result = llm_json_mode.invoke(
        [SystemMessage(content=query_writer_instructions_formatted),
         HumanMessage(content="Generate a query for web search:")]
    )

    # JSON mode can still yield malformed or unexpected output, so parse
    # defensively, mirroring the fallback used by reflect_on_summary.
    try:
        parsed = json.loads(result.content)
    except (json.JSONDecodeError, TypeError):
        parsed = None

    query = parsed.get('query') if isinstance(parsed, dict) else None
    if not query:
        # Fall back to searching for the topic verbatim.
        return {"search_query": state.research_topic}

    return {"search_query": query}



def web_search(state : SummaryState):
    """Search the web for the current query and package the outcome as a state update.

    Args:
        state: Current state; ``state.search_query`` and
            ``state.research_loop_count`` are read.

    Returns:
        dict: ``sources_gathered`` (one-element list with the bullet list of
        sources), ``research_loop_count`` (previous count plus one) and
        ``web_research_results`` (one-element list with the formatted source
        text). Note that the last key is plural while the corresponding
        ``SummaryState`` field is named ``web_research_result``.
    """
    tavily_response = tavily_search(state.search_query, include_raw_content=True, max_results=1)
    detailed_text = deduplicate_and_format_sources(tavily_response, max_tokens_per_source=2000)
    source_bullets = format_sources(tavily_response)

    update = {}
    update["sources_gathered"] = [source_bullets]
    update["research_loop_count"] = state.research_loop_count + 1
    update["web_research_results"] = [detailed_text]
    return update


def summarize_sources(state: SummaryState, model_name="deepseek-r1:1.5b"):
    """Summarize the most recent search result, extending any existing summary.

    Args:
        state: Current state; reads ``running_summary``, the last element of
            ``web_research_result`` and ``research_topic``.
        model_name: Ollama model name passed to ``ChatOllama``.

    Returns:
        dict: ``running_summary`` (the model output with ``<think>...</think>``
        blocks removed for deepseek models) and ``think_tag`` (the last removed
        block, or an empty string when nothing was removed).

    Raises:
        IndexError: If ``state.web_research_result`` is empty.
    """
    existing_summary = state.running_summary
    most_recent_web_search = state.web_research_result[-1]

    if existing_summary:
        print("SUMMARY IS NOT EMPTY")
        human_message_content = (
            f"Extend the existing summary: {existing_summary}\n\n"
            f"Include new search results: {most_recent_web_search} "
            f"That addresses the following topic: {state.research_topic}"
        )
    else:
        print("SUMMARY IS EMPTY")
        human_message_content = (
            f"Generate a summary of these search results: {most_recent_web_search} "
            f"That addresses the following topic: {state.research_topic}"
        )

    llm = ChatOllama(model=model_name, temperature=0)
    result = llm.invoke(
        [SystemMessage(content=summarizer_instructions),
        HumanMessage(content=human_message_content)]
    )

    running_summary = result.content
    think_tag = ""

    # Filter out the think tags only for deepseek models.
    # Skip this step if you want to inspect the model's thought process.
    if 'deepseek' in model_name:
        open_tag, close_tag = "<think>", "</think>"
        while True:
            start = running_summary.find(open_tag)
            if start == -1:
                break
            # Look for the closing tag only AFTER the opening tag. Matching a
            # stray earlier "</think>" would produce a backwards slice that
            # grows the text and never terminates.
            close_at = running_summary.find(close_tag, start + len(open_tag))
            if close_at == -1:
                break
            end = close_at + len(close_tag)
            think_tag = running_summary[start:end]
            running_summary = running_summary[:start] + running_summary[end:]

    return {"running_summary" : running_summary, "think_tag" : think_tag}

def reflect_on_summary(state: SummaryState,model_name="deepseek-r1:1.5b"):
    """Reflect on the summary and generate a follow-up query.

    Args:
        state: Current state; reads ``research_topic`` and ``running_summary``.
        model_name: Ollama model name passed to ``ChatOllama``.

    Returns:
        dict: ``{"search_query": <follow-up query>}``. When the model's reply is
        not a JSON object with a non-empty ``"follow_up_query"`` entry, a generic
        "Tell me more about <topic>" query is returned instead.
    """
    llm_json_mode = ChatOllama(model=model_name, temperature=0, format="json")
    result = llm_json_mode.invoke(
        [SystemMessage(content=reflection_instructions.format(research_topic=state.research_topic)),
        HumanMessage(content=f"Identify a knowledge gap and generate a follow-up web search query based on our existing knowledge: {state.running_summary}")]
    )

    # JSON mode can fail in some cases: the reply may be invalid JSON, or valid
    # JSON that is not an object. Treat both like a missing key.
    try:
        follow_up_query = json.loads(result.content)
    except (json.JSONDecodeError, TypeError):
        follow_up_query = None

    query = follow_up_query.get('follow_up_query') if isinstance(follow_up_query, dict) else None

    if not query:
        # Fallback to a placeholder query
        return {"search_query": f"Tell me more about {state.research_topic}"}

    # Update search query with follow-up query
    return {"search_query": query}


def finalize_summary(state : SummaryState):
    """Build the final report from the running summary and the gathered sources.

    The state is left untouched; the caller applies the returned update. This
    keeps the function idempotent and consistent with the other step functions,
    which also return updates instead of mutating ``state``.

    Args:
        state: Current state; reads ``running_summary`` and ``sources_gathered``.

    Returns:
        dict: ``{"running_summary": <report>}`` where the report is the summary
        under a "## Summary" heading followed by a "### Sources:" list.
    """
    all_sources = "\n".join(state.sources_gathered)
    final_report = f"## Summary\n\n{state.running_summary}\n\n ### Sources:\n{all_sources} "
    return {"running_summary" : final_report}











In [8]:
def research_pipeline(topic="Latest AI advancements",no_of_iterations=2,model_name="deepseek-r1:1.5b"):
    """Run the search -> summarize -> reflect loop and print the final report.

    Args:
        topic: Research topic for the report.
        no_of_iterations: Number of web-search/summarize rounds to run.
        model_name: Ollama model name used for every LLM call.

    Returns:
        None. Progress and the final report are printed.
    """
    state = SummaryState(research_topic=topic)

    query_result = generate_query(state, model_name)
    state.search_query = query_result["search_query"]
    print(state.search_query)

    for iteration in range(no_of_iterations):
        web_result = web_search(state)
        state.sources_gathered += web_result["sources_gathered"]
        # Accumulate results the same way as sources_gathered;
        # summarize_sources only reads the most recent entry.
        state.web_research_result += web_result["web_research_results"]
        # Keep the loop counter in sync with what web_search reports.
        state.research_loop_count = web_result["research_loop_count"]
        print(state.sources_gathered)

        summarize_result = summarize_sources(state, model_name)
        state.running_summary = summarize_result["running_summary"]
        print(summarize_result["think_tag"])
        print(state.running_summary)

        # A follow-up query is only useful if another search round will run,
        # so skip the extra LLM call after the final round.
        if iteration == no_of_iterations - 1:
            break

        reflect_result = reflect_on_summary(state, model_name)
        state.search_query = reflect_result["search_query"]
        print(f"NEW SEARCH QUERY :- {state.search_query}")

    final_result = finalize_summary(state)
    state.running_summary = final_result["running_summary"]

    print("\n\n\n")
    print(state.running_summary)




        


    

    

In [12]:
# Run the pipeline with the deepseek-r1:8b model, using no_of_iterations=3.
research_pipeline(topic="Ethical implications of AI-generated art",no_of_iterations=3,model_name="deepseek-r1:8b")

What are the ethical implications of AI-generated art?
['* The Ethical Implication Of AI Generated Art : https://www.theartist.me/art/the-ethical-implication-of-ai-generated-art/']
SUMMARY IS EMPTY
<think>
Okay, so I need to generate a summary based on the provided sources about the ethical implications of AI-generated art. Let me start by reading through the content carefully.

The main points from the source are:

1. **Authorship and Attribution**: AI contributes significantly to the artwork, making it hard to determine who should be credited. This blurs traditional notions of authorship.
2. **Originality and Authenticity**: AI can replicate styles or mimic artists, leading to concerns about uniqueness and authenticity.
3. **Artistic Intention and Creativity**: Since AI lacks subjective consciousness, there's a question about whether it can have true artistic vision or emotional depth.
4. **Impact on Human Artists**: There's fear that AI might displace human artists, affecting their 

In [14]:
# Same topic and iteration count with the llama3.1:8b model, for comparison.
research_pipeline(topic="Ethical implications of AI-generated art",no_of_iterations=3,model_name="llama3.1:8b")

ethics of artificial intelligence generated artwork
['* The Ethics of AI-Generated Art: Authorship, Ownership, and Creative ... : https://ai-oracles.com/2025/01/25/the-ethics-of-ai-generated-art-authorship-ownership-and-creative-freedoms/']
SUMMARY IS EMPTY

The emergence of Artificial Intelligence (AI) as a creative force has raised numerous questions and ethical considerations in the world of art. The concept of authorship is a thorny issue, with proponents arguing that AI can be seen as a tool to enhance human creativity, while critics contend that it challenges the very notion of authorship. Dr. [Expert Name] emphasizes the importance of recognizing the human element in the creative process and safeguarding the rights and recognition of human artists.

The issue of ownership and intellectual property rights is another complex area, with varying interpretations and potential challenges. Some legal scholars argue that the creator of the AI system should be considered the owner of the