## LLMs

In [None]:
from llama_index.llms.ollama import Ollama  

# Ollama-backed LLM client used for text generation in this notebook.
chat_model = Ollama(
    model="qwen2:7b",
    context_window=80000,
    request_timeout=300,
)

## Researcher: An Agent Workflow

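Let's create an agent workflow that would:
1. search the web for pages relevant to a query,
2. download those pages and extract their text,
3. generate a Markdown blog post from the extracted text.

Definitions: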
     

In [None]:
from bs4 import BeautifulSoup

def extract_text_with_beautifulsoup(raw_html: str) -> str:
    """
    Reduce an HTML document to a single line of plain text.

    Script, style, nav, footer, header and aside elements are removed before
    the text is collected. Each line of the remaining text is stripped and
    split on double spaces; the non-empty fragments are joined with single
    spaces.

    Args:
        raw_html (str): HTML markup to parse.

    Returns:
        str: The remaining page text as one space-separated string.
    """
    parsed = BeautifulSoup(raw_html, 'html.parser')

    # Drop elements that carry no article content (code, styling, page chrome)
    for element in parsed.find_all(["script", "style", "nav", "footer", "header", "aside"]):
        element.decompose()

    # Collect the non-empty text fragments line by line
    fragments = []
    for raw_line in parsed.get_text().splitlines():
        for piece in raw_line.strip().split("  "):
            piece = piece.strip()
            if piece:
                fragments.append(piece)

    return ' '.join(fragments)


In [None]:
from typing import TypedDict, Optional
from datetime import datetime

import urllib

# import mlflow
# mlflow.set_experiment(experiment_name=datetime.now().isoformat())
# mlflow.set_tracking_uri('http://localhost:5000')
# mlflow.llama_index.autolog()


class SearchState(TypedDict):
    """State dictionary passed between the nodes of the researcher workflow."""

    # Query to research
    query: str

    # URLs of web pages (resources/references) relevant to the query
    web_urls: Optional[list[str]]

    # URL that is currently being downloaded and processed
    url: Optional[str]

    # Text retrieved from the respective web sources
    text_sources: Optional[list[str]]

    # Final output blog post
    # NOTE(review): generate_blogpost stores the return value of
    # chat_model.complete() here and the notebook later reads `.text` from it,
    # so the stored value does not look like a plain str — confirm the annotation.
    post: str


def get_relevant_webpages(state: SearchState):
    """
    Search the web for the query and record the links of the top results.

    Args:
        state (SearchState): workflow state; its "query" entry is the search text.

    Returns:
        SearchState: the same state with "web_urls" set to the result links.
    """
    from ddgs import DDGS

    hits = DDGS().text(query=state["query"], max_results=2)

    # Keep only the link of each hit (at most two results are requested)
    links = []
    for hit in hits:
        links.append(hit["href"])
    state["web_urls"] = links

    return state

def check_urls(state: SearchState):
    """
    Pass-through node: return the state unchanged.

    There could be complex comparison logic here, but the actual check is
    implemented in `route_move_to_download`.
    """
    return state
    
def route_move_to_download(state: SearchState):
    """
    Decide whether the workflow has any URLs to download.

    Args:
        state (SearchState): workflow state; its "web_urls" entry is inspected.

    Returns:
        str: "proceed" when at least one URL is present, otherwise "stop"
        (including when "web_urls" is missing or None).
    """
    # "web_urls" is Optional and may be absent from a partially filled state;
    # truthiness covers None, a missing key and an empty list alike.
    if state.get("web_urls"):
        return "proceed"
    return "stop"

def download_webpages(state: SearchState) -> SearchState:
    """
    Download every URL in the state and store the extracted page text.

    Each page in ``state["web_urls"]`` is fetched and reduced to plain text
    with ``extract_text_with_beautifulsoup``; the results are appended to
    ``state["text_sources"]``. Pages that fail to download or parse are
    reported on stdout and skipped, so one bad URL does not abort the run.

    Args:
        state (SearchState): workflow state holding the URLs to fetch.

    Returns:
        SearchState: the same state with "text_sources" extended.
    """
    # A bare `import urllib` does not guarantee these submodules are loaded.
    import urllib.error
    import urllib.request

    # "text_sources" is Optional: make sure there is a list to append to,
    # otherwise every append would fail and be swallowed by the handler below.
    if state.get("text_sources") is None:
        state["text_sources"] = []

    for url in state.get("web_urls") or []:
        try:
            with urllib.request.urlopen(url) as response:
                html_text = response.read()
            state["text_sources"].append(extract_text_with_beautifulsoup(html_text))
        except urllib.error.URLError as e:
            print("Error getting the page: ", e)
        except Exception as e:
            # Deliberately broad: downloading is best-effort per page.
            print("Something happened: ", e)

    return state

def generate_blogpost(state: SearchState) -> SearchState:
    """
    Generate a blog post in Markdown format from the downloaded source text.

    Only the first non-empty entry of ``state["text_sources"]`` is used as
    background material. The result of ``chat_model.complete`` is stored
    under ``state["post"]``.

    Args:
        state (SearchState): workflow state with "query" and "text_sources".

    Returns:
        SearchState: the same state with "post" set.

    Raises:
        ValueError: if no non-empty source text is available.
    """
    # TODO: how to use all the resources?
    # Skip pages that yielded no text, and fail with a clear message instead
    # of a bare IndexError/TypeError when nothing usable was downloaded.
    sources = next((text for text in state.get("text_sources") or [] if text), None)
    if sources is None:
        raise ValueError(
            f"No source text available to write about {state.get('query')!r}."
        )

    task = f"Based on the this information {sources}, generate a blogpost about the topic {state['query']} in the markdown format."
    state["post"] = chat_model.complete(task)

    return state

In [None]:
from langgraph.graph import StateGraph, START, END
from IPython.display import Image, display

# Assemble the researcher workflow as a state graph over SearchState
researcher_graph = StateGraph(SearchState)

# Register each step as a node named after its function
for node_fn in (get_relevant_webpages, check_urls, download_webpages, generate_blogpost):
    researcher_graph.add_node(node_fn.__name__, node_fn)

# Entry path up to the URL check
researcher_graph.add_edge(START, "get_relevant_webpages")
researcher_graph.add_edge("get_relevant_webpages", "check_urls")

# Branch on the router's answer: keep going or finish early
routes = {"proceed": "download_webpages", "stop": END}
researcher_graph.add_conditional_edges("check_urls", route_move_to_download, routes)

# Remaining path to the end of the workflow
researcher_graph.add_edge("download_webpages", "generate_blogpost")
researcher_graph.add_edge("generate_blogpost", END)

# Compile the graph and render it as a Mermaid PNG
dag = researcher_graph.compile()
graph_png = dag.get_graph().draw_mermaid_png()
display(Image(graph_png))

In [None]:
# Seed the state with the research topic and an empty list for page texts,
# then run the compiled workflow.
init_state = SearchState(query="History of Prussia", text_sources=[])
res = dag.invoke(init_state)

In [None]:
# import pprint
# # Set width to control line length (e.g., 80 characters)
# pp = pprint.PrettyPrinter(width=80, depth=4)
# pp.pprint(res)

In [None]:
# Show the generated post and save it to a timestamped Markdown file.
post_text = res["post"].text
print(post_text)
# Explicit UTF-8: generated text may contain characters that the platform's
# default encoding cannot represent.
with open(f"post_{datetime.now().strftime('%Y_%m_%d_%H_%M')}.md", "w", encoding="utf-8") as f:
    f.write(post_text)
