## LLMs

In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Embedding model for the notebook, selected by its Hugging Face model name.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
emb_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from llama_index.llms.ollama import Ollama

# Chat LLM served through Ollama; the tunable values are named so they are easy to find.
CHAT_MODEL_NAME = "qwen2:7b"
CHAT_CONTEXT_WINDOW = 89000
chat_model = Ollama(
    model=CHAT_MODEL_NAME,
    context_window=CHAT_CONTEXT_WINDOW,
)

## Researcher: An Agent Workflow

Let's create an agent workflow that would: 
 1) Take a topic to write a blogpost about and subdivide it into sections
 2) The first section should include variables, definitions, concepts and terminology that are going to be used for explaining (mathematically) the problem   
    - Briefly explain those concepts 
 3) Look up websites that explain the topic  
    - Download the selected websites and process their data (make a note that you need to reference them)
    - Select websites that best fit the subsections identified before  
    - Decide which website is the best at explaining the topic based on how it covers the subsections
 4) Analyse the website and rearrange its content to fit the desired subsection structure and the variables and definitions  
     

In [3]:
import urllib.error
import urllib.error
import urllib.request

from llama_index.core.workflow import (
    InputRequiredEvent,
    HumanResponseEvent,
)
from llama_index.core.tools import FunctionTool
from llama_index.core.workflow import Context
import mlflow

from datetime import datetime

# The tracking URI must be configured BEFORE the experiment is selected:
# set_experiment() looks up / creates the experiment in whichever store is
# active at call time, so calling it first would register the experiment in
# the default store instead of on the tracking server.
mlflow.set_tracking_uri('http://localhost:5000')
# One experiment per notebook session, named after the session start timestamp.
mlflow.set_experiment(experiment_name=datetime.now().isoformat())
# Enable MLflow's automatic logging for LlamaIndex.
mlflow.llama_index.autolog()


def get_relevant_webpages(ctx: Context, query: str) -> str:
    """
    Get the URL of the most relevant webpage for the query.

    Args:
        ctx (Context): workflow context; not used by this function.
        query (str): what to search for on the web.

    Returns:
        str: URL of the top search result.

    Raises:
        ValueError: if the search returns no results, or the top result
            carries no "href" link.
    """
    # Imported locally so the dependency is only needed when the tool is called.
    from ddgs import DDGS

    results = DDGS().text(query=query, max_results=1)
    # An empty result set previously surfaced as an opaque IndexError;
    # fail with a message that names the query instead.
    if not results:
        raise ValueError(f"No search results found for query: {query!r}")

    url = results[0].get("href")
    if not url:
        raise ValueError(f"Top search result for query {query!r} has no link.")
    return url

async def download_webpage(ctx: Context, url: str) -> str:
    """
    Load the raw webpage of the url and store it in the context.

    The function is a coroutine because the context store's ``set`` has to be
    awaited; calling it without ``await`` stores nothing.

    Args:
        ctx (Context): workflow context; the decoded page is saved in its
            store under the key "html_resource".
        url (str): http(s) url of the page.

    Returns:
        str: decoded html of the whole webpage, or a human-readable error
            message when the page could not be downloaded.
    """
    # The url may come from an LLM or a search result: only allow web schemes
    # so that e.g. file:// urls cannot be used to read local files.
    if not url.lower().startswith(("http://", "https://")):
        message = f"Error getting the page: unsupported url {url!r}."
        print(message)
        return message

    try:
        # A timeout keeps an unresponsive server from stalling the workflow.
        with urllib.request.urlopen(url, timeout=30) as response:
            raw_bytes = response.read()
            charset = response.headers.get_content_charset() or "utf-8"
    except urllib.error.URLError as e:
        message = f"Error getting the page {url}: {e.reason}"
        print(message)
        return message
    except Exception as e:
        message = f"Something happened while getting the page {url}: {e!r}"
        print(message)
        return message

    # response.read() yields bytes; decode so callers really receive a str.
    try:
        html_text = raw_bytes.decode(charset, errors="replace")
    except LookupError:
        # The server declared a charset Python does not know.
        html_text = raw_bytes.decode("utf-8", errors="replace")

    await ctx.store.set("html_resource", html_text)
    # Report the size only; dumping the whole page floods the cell output.
    print(f"Downloaded {len(html_text)} characters from {url}")
    return html_text

async def generate_blogpost(ctx: Context, text: str) -> str:
    """
    Generate a blogpost in markdown format based on the raw html of a resource.

    After every draft the user is asked for feedback through an
    InputRequiredEvent / HumanResponseEvent round trip. An empty answer
    accepts the current draft; any other answer triggers a new draft that
    takes the feedback into account.

    The function is a coroutine because ``ctx.wait_for_event`` has to be
    awaited to obtain the human response event.

    Args:
        ctx (Context): workflow context used to wait for the human response.
        text (str): raw html of the resource the blogpost is based on.

    Returns:
        str: the accepted blogpost in markdown format.
    """
    base_task = f"Based on the this resource html {text}, generate a blogpost about the topic in markdown format."
    question = "do you like what you see?"

    resp = chat_model.complete(base_task)

    while True:
        # Show every draft (including regenerated ones) before asking about it.
        print(resp.text)

        human_feedback = await ctx.wait_for_event(
            HumanResponseEvent,
            waiter_id=question,
            waiter_event=InputRequiredEvent(
                prefix=question
            )
        )
        # The event itself has no length; the user's answer is in `.response`.
        feedback_text = (human_feedback.response or "").strip()
        if not feedback_text:
            return resp.text

        task = f"{base_task} Take into account previous feedback which was: {feedback_text}."
        resp = chat_model.complete(task)


2025/07/24 19:01:53 INFO mlflow.tracking.fluent: Experiment with name '2025-07-24T19:01:53.344213' does not exist. Creating a new experiment.


In [None]:
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.core import Settings
from llama_index.core.agent.workflow import ReActAgent, AgentWorkflow, FunctionAgent


# Install a debug handler, configured with print_trace_on_end=True, as the
# global LlamaIndex callback manager.
llama_debug_handler = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager(handlers=[llama_debug_handler])
Settings.callback_manager = callback_manager

def _make_tool(fn, description):
    """Wrap `fn` as a FunctionTool that is exposed under the function's own name."""
    return FunctionTool.from_defaults(fn=fn, name=fn.__name__, description=description)


# One tool per workflow stage: search -> download -> write.
web_search_tool = _make_tool(
    get_relevant_webpages,
    "Useful for getting a url link to a relevant page for a particular query.",
)
page_download_tool = _make_tool(
    download_webpage,
    "Useful for dowloading raw html of the page, storing it in the agent's context and returning the content of that page.",
)
blogpost_write_tool = _make_tool(
    generate_blogpost,
    "Generate a blogpost based on the provided resource's html webpage.",
)

# Options shared by every agent in the pipeline.
_COMMON_AGENT_OPTIONS = {"llm": chat_model, "verbose": True}

# Stage 1: find a link for the query, then hand off to the downloader.
search_agent = FunctionAgent(
    name="WebSearcher",
    tools=[web_search_tool],
    can_handoff_to=["WebpageDownloader"],
    description="Search the web and provide a relevant link. Once you have the link, you must hand off control to the WebpageDownloader agent to download the page.",
    system_prompt="You are a WebSearcher. Your goal is to find relevant webpages for a given query. Once you have a sufficient list of URLs, you MUST hand off to the 'WebpageDownloader' agent to proceed with downloading the content. Your output should clearly indicate the list of URLs found.",
    **_COMMON_AGENT_OPTIONS,
)

# Stage 2: download the page html, then hand off to the writer.
download_page_agent = FunctionAgent(
    name="WebpageDownloader",
    tools=[page_download_tool],
    can_handoff_to=["BlogpostWriter"],
    description="Download a web page's html given a URL. Once the webpage is successfully downloaded, hand off control to the BlogpostWriter agent.",
    system_prompt="You are the WebpageDownloader. Your task is to download the raw HTML content of a given URL. After successfully downloading the page, you MUST hand off to the 'BlogpostWriter' agent, providing the downloaded HTML content for blog post generation.",
    **_COMMON_AGENT_OPTIONS,
)

# Stage 3: write the blogpost. Final agent, so it declares no hand-off target.
blogpost_writer_agent = FunctionAgent(
    name="BlogpostWriter",
    tools=[blogpost_write_tool],
    description="Write a blogpost in markdown based on a resource's web page html. This is the final agent in the workflow.",
    system_prompt="You are the BlogpostWriter. Your job is to generate a comprehensive, modern, factual, and concise blogpost in markdown format using the provided raw HTML content of a webpage. Once the blogpost is generated and validated, you will provide the final answer.",
    **_COMMON_AGENT_OPTIONS,
)

# Chain the three agents, starting from the web searcher.
pipeline_agents = [search_agent, download_page_agent, blogpost_writer_agent]
workflow = AgentWorkflow(
    agents=pipeline_agents,
    root_agent="WebSearcher",
)

# Start the run with an explicit context; the returned handler is kept so its
# events can be streamed afterwards.
ctx = Context(workflow=workflow)
handler = workflow.run("History of Prussia", ctx=ctx)

# async for event in handler.stream_events():
#     if isinstance(event, InputRequiredEvent):
#         response = input(event.prefix)
#         handler.ctx.send_event(
#             HumanResponseEvent(
#                 response=response
#             )
#         )

# answer = await handler        
# print(answer)

In [None]:
async for event in handler.stream_events():
    print(event)