## LLMs

In [1]:
from llama_index.llms.ollama import Ollama  

# Chat model served by Ollama; used by every LLM call in the cells below.
chat_model = Ollama(
    model="qwen2:7b",
    context_window=80000,
    request_timeout=300,
)


In [2]:
# Smoke-test the model with a trivial prompt.
res = chat_model.complete("Hey")
# Evaluated but not shown: the captured output below is only the last expression.
res.raw["message"]["content"]
# Shown below: the attribute dict of the completion response (text, raw, ...).
res.__dict__

{'text': 'Hello! How can I assist you today?',
 'additional_kwargs': {'tool_calls': None, 'thinking': None},
 'raw': {'model': 'qwen2:7b',
  'created_at': '2025-08-04T21:51:42.680495Z',
  'done': True,
  'done_reason': 'stop',
  'total_duration': 981048958,
  'load_duration': 29762875,
  'prompt_eval_count': 20,
  'prompt_eval_duration': 730119875,
  'eval_count': 10,
  'eval_duration': 220454708,
  'message': Message(role='assistant', content='Hello! How can I assist you today?', thinking=None, images=None, tool_calls=None),
  'usage': {'prompt_tokens': 20, 'completion_tokens': 10, 'total_tokens': 30}},
 'logprobs': None,
 'delta': None}

## Researcher: An Agent Workflow

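Let's create an agent workflow that plans the sections of a blog post for a given query, searches the web for sources, downloads them, and drafts a section from what it found.

Definitions: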
     

In [3]:
from bs4 import BeautifulSoup

def extract_text_with_beautifulsoup(raw_html: str) -> str:
    """
    Strip markup and page chrome from raw HTML and return the text on one line.

    Args:
        raw_html (str): Raw HTML content

    Returns:
        str: The non-empty text fragments of the page joined by single spaces
    """
    boilerplate_tags = ["script", "style", "nav", "footer", "header", "aside"]

    parsed = BeautifulSoup(raw_html, 'html.parser')

    # Drop non-content elements (code, styling, navigation chrome) entirely.
    for element in parsed(boilerplate_tags):
        element.decompose()

    # Strip every line, break it on double spaces, and keep the non-empty pieces.
    fragments = []
    for raw_line in parsed.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

    return ' '.join(fragments)


In [4]:
from typing import TypedDict, Optional, Annotated
from datetime import datetime
import urllib

from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage
from langgraph.graph.message import add_messages

# import mlflow
# mlflow.set_experiment(experiment_name=datetime.now().isoformat())
# mlflow.set_tracking_uri('http://localhost:5000')
# mlflow.llama_index.autolog()

# Number of search results requested per section; the download and generation
# steps also read only this many of the most recent URLs / text sources.
MAX_NUMBER_URL_SOURCES = 1 

class SearchState(TypedDict):
    """
    State dictionary shared by the functions of the researcher workflow.

    Each workflow function receives this dictionary, reads the keys it needs
    and writes its results back into it.
    """
    # Query (topic) to research
    query: str 

    # Conversation history kept for a ReAct (Reason - Act - Observe - Repeat) style agent;
    # annotated with the `add_messages` reducer
    messages: Annotated[list[AnyMessage], add_messages]

    # Names of the blogpost sections that are still to be written
    sections: list[str]

    # Prompt describing the section that is currently being written
    section: str

    # URLs of web pages (resources/references) found for the sections
    web_urls: Optional[list[str]] 

    # URL that is currently being downloaded and processed
    url: Optional[str] 

    # Text extracted from the downloaded web pages
    text_sources: Optional[list[str]]  

    # Final output blogpost
    post: str

def create_plan(state: SearchState):
    """
    Create an execution plan for the agent: the subsections the blogpost will have.

    Asks the chat model for a bracketed, comma-separated list of section names,
    appends the raw reply to the message history and stores the parsed names in
    ``state["sections"]``.

    Args:
        state (SearchState): Workflow state. ``query`` is required; ``messages``
            is used when present.

    Returns:
        SearchState: The same state object with ``messages`` and ``sections`` updated.
    """
    query = state["query"]
    # The history may be absent on the very first call; treat that as empty.
    msgs = state.get("messages") or []
    prompt = f"Based on the previous conversation: {msgs}; Create a list of subsection that a blogpost on this {query} topic must have. Just output a list of subsections so that it can be parsed in python format, [section_name_1, section_name_2,.. etc], do not out put new line charcters, nothing else, just plain list."
    res = chat_model.complete(prompt)
    reply = res.raw["message"]["content"]

    # Build a new list instead of extending the caller's list in place.
    state["messages"] = msgs + [AIMessage(content=reply)]

    # Only remove the surrounding brackets when they are really there, then
    # clean every item: the model often adds spaces and quotes around names.
    # Limitation: a section name that itself contains a comma is still split.
    listing = reply.strip().removeprefix("[").removesuffix("]")
    cleaned = (item.strip().strip("'\"").strip() for item in listing.split(","))
    state["sections"] = [name for name in cleaned if name]

    print("create_plan: ", state["sections"])

    return state

def check_plan(state: SearchState):
    """
    Ask the user for optional feedback on the plan and pick the next step.

    Args:
        state (SearchState): Workflow state; ``messages`` is extended when
            feedback is given.

    Returns:
        str: "iterate" when the user typed anything, otherwise "proceed".
    """
    human_feedback = input("Please provide your input, if any: ")

    # Nothing typed: the plan is accepted as is.
    if not human_feedback:
        return "proceed"

    # NOTE(review): this function is used as a routing function; confirm that
    # the graph framework keeps state changes made here for the next node.
    state["messages"] += [HumanMessage(content=human_feedback)]
    return "iterate"

def start_section(state: SearchState):
    """
    Select the next section to write and store its prompt in the state.

    Takes the first entry of ``state["sections"]``, removes it from the list
    and writes the section prompt to ``state["section"]``. When no sections
    are left, ``state["section"]`` is set to an empty string.

    The selection happens here rather than in ``check_start_section`` because
    that function is used as a conditional-edge router, and state changes made
    in a router are not written back to the graph state.

    Args:
        state (SearchState): Workflow state; ``query`` is read when a section
            is available.

    Returns:
        SearchState: The same state object with ``sections`` and ``section`` updated.
    """
    print("Starting to process a section..")

    pending = state.get("sections") or []
    if pending:
        section, *remaining = pending
        state["sections"] = remaining
        state["section"] = f"You are writting a section about {section} that is a part of the blogpost on {state['query']}" 
    else:
        # Empty marker that check_start_section turns into "stop".
        state["section"] = ""

    return state 

def check_start_section(state: SearchState):
    """
    Route after ``start_section``: continue when a section was selected.

    Reads the state without modifying it.

    Args:
        state (SearchState): Workflow state as returned by ``start_section``.

    Returns:
        str: "proceed" when ``state["section"]`` is set and non-empty,
        otherwise "stop".
    """
    if state.get("section"):
        return "proceed"

    print("No sectons to process..")
    return "stop"

def get_relevant_webpages(state: SearchState):
    """
    Search the web for the current section and record the URLs of the results.

    Runs a ``ddgs`` text search with ``state["section"]`` as the query, asking
    for at most ``MAX_NUMBER_URL_SOURCES`` results, and appends the ``href`` of
    every result to ``state["web_urls"]``.

    Args:
        state (SearchState): Workflow state; ``section`` must be set.

    Returns:
        SearchState: The same state object with ``web_urls`` extended.
    """
    # Imported here, so ``ddgs`` is only required when this step actually runs.
    from ddgs import DDGS

    results = DDGS().text(query=state["section"], max_results=MAX_NUMBER_URL_SOURCES)
    found_urls = [res["href"] for res in results or []]

    # ``web_urls`` is declared Optional: start from an empty list when it is
    # None or missing instead of failing on ``+=``.
    state["web_urls"] = (state.get("web_urls") or []) + found_urls

    print("get_relevant_webpages: ", state["web_urls"])

    return state

def download_webpages(state: SearchState) -> SearchState:
    """
    Download the most recently found web pages and store their extracted text.

    Fetches the last ``MAX_NUMBER_URL_SOURCES`` entries of ``state["web_urls"]``,
    extracts the text of each page with ``extract_text_with_beautifulsoup`` and
    appends it to ``state["text_sources"]``. Failures are printed and the page
    is skipped, so one bad URL does not abort the workflow.

    Args:
        state (SearchState): Workflow state; ``web_urls`` is read,
            ``text_sources`` is extended.

    Returns:
        SearchState: The same state object with ``text_sources`` extended.
    """
    # ``import urllib`` alone does not load these submodules; import them explicitly.
    import urllib.error
    import urllib.parse
    import urllib.request

    # Without a timeout a stalled server would block the workflow forever.
    timeout_seconds = 30

    # ``text_sources`` is declared Optional: make sure there is a list to append to.
    if state.get("text_sources") is None:
        state["text_sources"] = []

    for url in (state.get("web_urls") or [])[-MAX_NUMBER_URL_SOURCES:]:
        # URLs come from web search results; only fetch real web pages and
        # never let urlopen follow file:// or other schemes.
        if urllib.parse.urlparse(url).scheme not in ("http", "https"):
            print("Error getting the page: ", f"unsupported URL scheme in {url!r}")
            continue

        try: 
            with urllib.request.urlopen(url, timeout=timeout_seconds) as response:
                html_text = response.read()
                state["text_sources"].append(extract_text_with_beautifulsoup(html_text))
        except urllib.error.URLError as e:
            print("Error getting the page: ", e)
        except Exception as e:
            # Deliberate best-effort: report the problem and go on with the next URL.
            print("Something happened: ", e)
    
    return state

def generate_blogpost_section(state: SearchState) -> SearchState:
    """
    Generate a blogpost section in the markdown format from the downloaded sources.

    Builds a prompt from the last ``MAX_NUMBER_URL_SOURCES`` text sources and
    the current section, asks the chat model to write the section and appends
    the generated text to ``state["post"]``.

    Args:
        state (SearchState): Workflow state; ``section`` must be set,
            ``text_sources`` and ``post`` are used when present.

    Returns:
        SearchState: The same state object with ``post`` extended.
    """
    # TODO: how to use all the resources? 
    sources = (state.get("text_sources") or [])[-MAX_NUMBER_URL_SOURCES:]
    task = f"Based on the this information {sources}, generate a extensive blogpost section about the topic {state['section']} in the markdown format."
    completion = chat_model.complete(task)

    # ``post`` is a plain string: append the generated text, not the response
    # object (str + response object raises TypeError).
    existing_post = state.get("post") or ""
    # Keep consecutive sections apart so their markdown does not run together.
    separator = "\n\n" if existing_post else ""
    state["post"] = existing_post + separator + completion.text

    return state

In [None]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

# Assemble the researcher workflow as a state graph over SearchState.
researcher_graph = StateGraph(SearchState)

# Register each node under the name of the function that implements it.
for node_fn in (
    create_plan,
    start_section,
    get_relevant_webpages,
    download_webpages,
    generate_blogpost_section,
):
    researcher_graph.add_node(node_fn.__name__, node_fn)

# Entry point: always start by creating the plan.
researcher_graph.add_edge(START, "create_plan")

# After planning, check_plan decides: re-plan with feedback or start writing.
plan_routes = {
    "proceed": "start_section",
    "iterate": "create_plan",
}
researcher_graph.add_conditional_edges("create_plan", check_plan, plan_routes)

# After start_section, check_start_section decides: research the section or finish.
section_routes = {
    "proceed": "get_relevant_webpages",
    "stop": END,
}
researcher_graph.add_conditional_edges("start_section", check_start_section, section_routes)

# Linear tail: search -> download -> write -> end.
# NOTE(review): generate_blogpost_section leads to END, so a single run writes
# only one section - confirm whether a loop back to start_section is intended.
for source_node, target_node in (
    ("get_relevant_webpages", "download_webpages"),
    ("download_webpages", "generate_blogpost_section"),
    ("generate_blogpost_section", END),
):
    researcher_graph.add_edge(source_node, target_node)

# Compile the graph and render its diagram in the notebook.
dag = researcher_graph.compile()
display(Image(dag.get_graph().draw_mermaid_png()))

In [None]:
# Seed the state with the query and empty accumulators, then run the workflow.
init_state = SearchState(
    query="History of Prussia",
    text_sources=[],
    web_urls=[],
    post="",
)
res = dag.invoke(init_state)

In [None]:
# import pprint
# # Set width to control line length (e.g., 80 characters)
# pp = pprint.PrettyPrinter(width=80, depth=4)
# pp.pprint(res)

In [None]:
# ``post`` is declared as a string in SearchState, but may also be a completion
# object exposing ``.text``; accept both so this cell does not fail on a plain str.
post = res["post"]
post_text = post if isinstance(post, str) else post.text
print(post_text)

# Write explicitly as UTF-8: generated text can contain non-ASCII characters
# that the platform default encoding may not be able to encode.
output_name = f"post_v2_{datetime.now().strftime('%Y_%m_%d_%H_%M')}.md"
with open(output_name, "w", encoding="utf-8") as f:
    f.write(post_text)
