In [55]:
# Imports and Setup
import os
import re
import json
import requests
import pandas as pd
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
from enum import Enum
import textwrap

# LangGraph and LangChain imports
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain_text_splitters import CharacterTextSplitter

from langchain.tools import tool

# LLM imports
from llama_index.llms.ollama import Ollama

import json
import urllib.request
import urllib.error
import urllib.parse
from urllib.parse import quote, urlsplit, urlunsplit
import urllib.request
import urllib.error

from dataclasses import dataclass
from typing import List, Dict, Any, Optional

from bs4 import BeautifulSoup
from ddgs import DDGS

In [56]:
# Configuration and LLM Setup
REQUEST_TIMEOUT = 180
MODEL_NAME = "qwen2.5:14b"

# Initialize Ollama LLM
llm = Ollama(
    model=MODEL_NAME, 
    request_timeout=REQUEST_TIMEOUT
)

# Test LLM connection
test_response = llm.complete("Hello")
print(f"LLM initialized: {test_response.text[:50]}...")


def pprint(text):
    wrapped_lines = textwrap.wrap(text, width=130)
    for line in wrapped_lines:
        print(line)

LLM initialized: Hello! How can I assist you today?...


In [57]:
def create_plan(task: str) -> str:
    """
    Given a task, determine a step-by-step action plan of what needs to be done to accomplish this task and output the answer/result. 
    The most important actions that are taken: 
     1. Define the goal: what result is asked to be produced.
     2. List the steps: provide a short explanation for each action that needs to be taken.       
    """
    
    prompt = f"""
You are an expert project planner. Your task is to create a concise, step-by-step action plan to accomplish the user's goal.

User's Goal:
---
{task}
---

Instructions:
1. Clarify the Core Objective: Start by rephrasing the user's goal as a single, clear, and specific objective.
2. Develop a Chronological Action Plan: Break down the objective into a logical sequence of high-level steps.

Guiding Principles for the Plan:
- Tool-Agnostic: Focus on the action required, not the specific tool to perform it (e.g., use "Gather data on market trends" instead of "Search Google for market trends").
- Information First: The initial step should almost always be to gather and analyze the necessary information before taking further action.
- S.M.A.R. Steps: Each step must be Specific, Measurable, Achievable, and Relevant. The focus is on the logical sequence, not specific deadlines.
- Concise: Include only the critical steps needed to reach the objective.

Example Output Format (ALWAS **JSON** ):
{{
  "objective": "Plan and execute a one-day offsite event for a team of 10 people focused on team building and strategic planning.",
  "plan": [
    "Gather requirements including budget, potential dates, and key goals for the offsite from team leadership",
    "Research and shortlist suitable venues and activity options that fit the budget and goals",
    "Create a detailed agenda and budget proposal for approval",
    "Book the selected venue, catering, and activities upon approval",
    "Send out official invitations and manage attendee confirmations and dietary requirements",
    "Finalize all logistical details and communicate the full itinerary to the team"
  ]
}}
where
  "objective" 's value in the json is a clear, one-sentence summary of the end goal,
  "plan" 's value in the json is a list **ALWAYS SEPARATED BY PYTHON NEWLINE CHARCTER** like 
  [
    A short explanation of the first logical step", 
    A short explanation of the next step that follows from the first",
    And so on..."
  ]
"""
    task_response = llm.complete(prompt)

    return task_response.text

@dataclass
class Task:
    question: str
    objective: str
    plan: list[str]


def define_task(task:str) -> str: 
    task_plan_response = create_plan(task)

    # Assume llm outputs smth json-like with the correct keys.
    result = json.loads(task_plan_response)

    return Task(
      question=task,
      objective=result["objective"],
      plan=result["plan"],
    )

In [58]:
import json
from dataclasses import dataclass
from typing import List, Dict, Literal
from typing import Literal, NewType
from enum import Enum

class AgentCapability(str, Enum):
    DEEP_WEB_SEARCHER = "deep_web_search"
    VIDEO_PROCESSOR = "video_processing"  
    AUDIO_PROCESSOR = "audio_processing"
    IMAGE_PROCESSOR = "image_processing"
    STRUCTURED_DATA_PROCESSOR = "structured_data_processing"
    UNSTRUCTURED_DATA_PROCESSOR = "unstructured_data_processing"
    CODE_MATH_WRITTER = "code_math_writing"

@dataclass
class CapabilityPlan:
    """A structured plan outlining the sequence of capabilities and actions."""
    subplan: List[Dict[str, str | str]]

def create_capability_plan(task: str, attachments: List[str] = None) -> CapabilityPlan:
    """
    Analyzes a task and generates a sequential execution plan using available capabilities.

    Args:
        task (str): The description of the task to be performed.
        attachments (list[str]): A list of file names related to the task.

    Returns:
        CapabilityPlan: A dataclass containing the ordered list of sub-tasks.
    """
    attachment_info = ""
    if attachments:
        attachment_info = f"\n\nAttachments provided: {', '.join(attachments)}"

    # TODO: is this fine to map capability to an agent one-to-one? 
    planning_prompt = f"""
You are a highly intelligent planning agent. Your primary function is to analyze a user's task and create a precise, step-by-step execution plan using a predefined set of capabilities.

**Your Task:**
Analyze the provided task and create a sequential plan to accomplish it. The plan should be a list of steps, where each step defines the capability to use and the specific activity to perform.

**Capabilities:**
- `{AgentCapability.DEEP_WEB_SEARCHER.value}`: Find, evaluate, and download web content (e.g., articles, documents). This capability is for search and downloading web resources only, not for processing the content or getting any answers on the content.
- `{AgentCapability.VIDEO_PROCESSOR.value}`: Download video, extract frames or audio from a video file for further analysis.
- `{AgentCapability.AUDIO_PROCESSOR.value}`: Download audio, transcribe speech, identify sounds, or analyze properties of an audio file.
- `{AgentCapability.IMAGE_PROCESSOR.value}`: Download image, analyze an image to identify objects, read text (OCR), or understand its content.
- `{AgentCapability.STRUCTURED_DATA_PROCESSOR.value}`: Analyze, query, or visualize data from structured files like Parquet, CSV, JSON, or databases.
- `{AgentCapability.UNSTRUCTURED_DATA_PROCESSOR.value}`: Analyze, summarize, extract information from, or answer questions about raw text or documents (e.g., PDFs, TXT files, retrieved web content).
- `{AgentCapability.CODE_MATH_WRITTER.value}`: Generate or execute code, solve mathematical problems, or perform complex logical operations and computations.

Instructions:
Deconstruct the Task -> Assign Capabilities for each step -> Define the Activity for each step (i.e.,write a clear and concise description of the specific action to be performed using the chosen capability)

Example 1: Simple Fact Lookup
Task: "What is the boiling point of water at sea level?"
Output:
{{
  "subplan": [
    {{
      "capability": "{AgentCapability.DEEP_WEB_SEARCHER.value}",
      "activity": "Search for the boiling point of water at sea level"
    }},
    {{
      "capability": "{AgentCapability.UNSTRUCTURED_DATA_PROCESSOR.value}",
      "activity": "Analyze the downloaded web resources and find the reference to the boiling point temperature."
    }}
  ]
}}

Example 2: Multi-step Information Retrieval and Analysis

Task: "Find the Q2 2025 earnings report for NVIDIA and tell me what their 'Gaming' division revenue was."
Output:
{{
  "subplan": [
    {{
      "capability": "{AgentCapability.DEEP_WEB_SEARCHER.value}",
      "activity": "Search for and download NVIDIA's official Q2 2025 earnings report document and download it."
    }},
    {{
      "capability": "{AgentCapability.UNSTRUCTURED_DATA_PROCESSOR.value}",
      "activity": "Analyze the downloaded earnings report to find and extract the revenue figure for the 'Gaming' division."
    }}
  ]
}}

---
Begin Plan Generation

Task: "{task}"
Attachments: "{attachment_info}"

Respond in this exact JSON format:
{{
  "subplan": [
    {{
      "capability": "...",
      "activity": "..."
    }},
    {{
      "capability": "...",
      "activity": "..."
    }}
  ]
}}
"""
    response = llm.complete(planning_prompt)
    response_text = response.text.strip()

    result = json.loads(response_text)
    return CapabilityPlan(subplan=result["subplan"])

In [59]:
question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M."

task = define_task(question)
pprint(task.__str__())

Task(question='Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M.',
objective="Identify the actor who played Ray in the Polish-language version of 'Everybody Loves Raymond' and determine their role
in the film 'Magda M.'", plan=["Gather information about the Polish-language cast of 'Everybody Loves Raymond'", 'Identify the
actor who played Ray in this adaptation', "Research the actor's roles in other films, focusing on 'Magda M.'"])


In [60]:
# Iterate over the plan
# Get a single task of the plan together with the objective 
current_plan_step = task.plan[0]
task_step = f"You need to FOCUS ON: {current_plan_step}. When your global objective which you SHOULD NOT FOCUS ON: {task.objective}."  

# What is required for this to be executed?
capability_plan = create_capability_plan(task=task_step)
pprint(capability_plan.__str__())

CapabilityPlan(subplan=[{'capability': 'deep_web_search', 'activity': "Search for and download information about the Polish-
language cast of 'Everybody Loves Raymond'."}, {'capability': 'unstructured_data_processing', 'activity': 'Analyze the downloaded
information to gather details about the Polish-language cast members.'}])


In [92]:
##################
# web_search_agent 
##################

LINK_KEY = "link"

@dataclass
class WebResource:
    """A unified dataclass for handling a web resource.

    Attributes:
        content: The main text content of the web page. Can be None if not yet downloaded.
        link: The unique URL for the resource.
        metadata: A dictionary containing additional information, such as search result data.
    """
    content: Optional[str]
    link: str
    metadata: Dict[str, Any]


def question_to_queries(question: str, max_queries: int = 2) -> List[str]:
    """Converts a user question into a list of optimized search engine queries.

    Note:
        This function requires a Large Language Model (LLM) to generate queries.
        The `llm.complete()` call is a placeholder for your model's inference logic.

    Args:
        question: The user's input question.
        max_queries: The maximum number of search queries to generate.

    Returns:
        A list of string queries optimized for a search engine.
    """
    prompt = f"""
    Create a list of general search engine queries for the following question: "{question}".

    Make sure that:
    - Your output is a list separated by a "|" character and nothing else.
    - You provide a MAXIMUM of {max_queries} search engine queries.
    - Each query is SHORT and precise.

    Example Output:
    Large urban population areas in Europe|Biggest cities in Europe
    """
    llm_response = llm.complete(prompt)
    response_text = llm_response.text

    return response_text.strip().split("|")


def duckduckgo_search(query: str, max_results: int = 2) -> List[WebResource]:
    """Performs a DuckDuckGo search and returns results as WebResource objects.

    Args:
        query: The search query string.
        max_results: The maximum number of search results to retrieve.

    Returns:
        A list of WebResource objects, where 'content' is None and 'metadata'
        contains the search result details.
    """
    found_resources = []
    with DDGS() as ddgs:
        results = list(ddgs.text(query, max_results=max_results))
        if not results:
            print(f"⚠️ No search results found for '{query}'")
            return []

        for i, result in enumerate(results):
            resource = WebResource(
                content=None,
                link=result.get('href', 'No URL'),
                metadata={
                    "search_order": i,
                    "web_page_title": result.get('title', 'No title'),
                    "web_page_summary": result.get('body', 'No description'),
                    "query": query
                }
            )
            found_resources.append(resource)

    return found_resources


def drop_non_unique_resources(resources: List[WebResource]) -> List[WebResource]:
    """Removes duplicate WebResource objects based on their 'link' attribute.

    Args:
        resources: A list of WebResource objects.

    Returns:
        A new list of WebResource objects with duplicates removed.
    """
    seen_links = set()
    unique_resources = []
    for resource in resources:
        if resource.link and resource.link not in seen_links:
            unique_resources.append(resource)
            seen_links.add(resource.link)
    return unique_resources


def extract_clean_text(raw_html: str) -> str:
    """Extracts clean, readable text from raw HTML content.

    This function removes scripts, styles, navigation, and other non-content
    elements, then cleans up whitespace.

    Args:
        raw_html: The raw HTML content of a webpage.

    Returns:
        The extracted and cleaned plain text.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')
    # Remove elements that typically do not contain main content
    for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
        element.decompose()

    # Extract text and clean up whitespace
    text = soup.get_text(separator=" ")
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    return ' '.join(chunk for chunk in chunks if chunk)


# This is a new helper function to safely encode URLs
def _encode_url_path(url: str) -> str:
    """
    Encodes the path and query components of a URL to be URL-safe.
    
    This prevents errors when a URL contains non-ASCII characters.
    """
    # Split the URL into its components
    scheme, netloc, path, query, fragment = urlsplit(url)
    
    # Encode the path and query parts
    path = quote(path)
    query = quote(query)
    
    # Reassemble the URL
    return urlunsplit((scheme, netloc, path, query, fragment))


import httpx
from urllib.parse import quote, urlsplit, urlunsplit

def download_content(resource: WebResource) -> WebResource:
    """
    Downloads the HTML from a resource's link and populates its 'content' field.
    This version includes robust URL encoding and safe error printing.
    """
    if not resource.link or not resource.link.startswith('http'):
        return resource

    # For this example, let's assume _encode_url_path is defined elsewhere
    # encoded_link = _encode_url_path(resource.link)
    response = httpx.get(resource.link, timeout=15)
    charset = response.encoding or 'utf-8'
    html_bytes = response.content
    html_content = html_bytes.decode(charset)
    resource.content = extract_clean_text(html_content)

    return resource

def web_search(question: str, links_per_query: int = 2) -> List[WebResource]:
    """Orchestrates the full web search process for a given question.

    This process includes:
    1. Converting the question into search queries.
    2. Searching the web to find resources.
    3. Downloading and extracting text content from each resource.

    Args:
        question: The user's question.
        links_per_query: The number of web links to retrieve for each search query.

    Returns:
        A list of WebResource objects, with their 'content' field populated.
    """
    # 1. Generate search queries from the question
    candidate_queries = question_to_queries(question)
    print(f"\nGenerated queries: {candidate_queries}")

    # 2. Search for relevant sources for each query
    all_sources = []
    for query in candidate_queries:
        sources_for_query = duckduckgo_search(query, links_per_query)
        all_sources.extend(sources_for_query)

    # 3. Filter out any duplicate resources found by different queries
    unique_sources = drop_non_unique_resources(all_sources)
    print(f"\nFound {len(unique_sources)} unique web resources.")

    # 4. Download content for each unique resource
    final_resources = []
    for source in unique_sources:
        populated_resource = download_content(source)
        if populated_resource.content: # Only keep resources where content was successfully downloaded
            final_resources.append(populated_resource)

    return final_resources

In [103]:
for capability_step in capability_plan.subplan:
    match capability_step["capability"]:
        case AgentCapability.DEEP_WEB_SEARCHER:
            web_resources = web_search(capability_step["activity"])
        case _:
            print("Finished processing")

for web_resource in web_resources:
    print(web_resource.link)


Generated queries: ['Polish cast Everybody Loves Raymond', 'Everybody Loves Raymond Polish voices']

Found 3 unique web resources.
Finished processing
https://en.wikipedia.org/wiki/Everybody_Loves_Raymond
https://myflixer-to.tube/tv/watch-everybody-loves-raymond-movies-free-myflixer-38914
https://en.wikipedia.org/wiki/Wszyscy_kochają_Romana


In [104]:
##################################
# unstructered_text_processor 
##################################

def contruct_final_answer(task:str, context:str) -> str:
    prompt = f"""

    You are presented with a list of expert answers from different source that you need summarise.

    LIST:
    {context}
    
    Based **ONLY** on that list and without any addition assumptions from your side, perform the the task specified. 
    
    TASK:
    {task}
    
    Your answer should be in json format like so:
    {{
        "answer": <a single number, word of a phrase which si the answer to the question>,
        "clarification": <very short mention of what the answer is based on>,
    }}

    Rules:
        - If the text contains the complete answer → put the exact answer in "answer".
        - If the text contains no relevant information → put "answer": "not found".
        - If the text contains some but not all information → put "answer": "not found".
        - The "clarification" must mention the relevant part of the text and explain briefly.

    Examples:
    Q: "Who won the 2022 FIFA World Cup?"
    {{
    "answer": "not found",
    "clarification": "The text mentions the location of the tournament but not the winner."
    }}
    Q: "How many colours there is in the rainbow"
    {{
    "answer": "12",
    "clarification": "Red,Orange,Yellow,Chartreuse green,Green,Blue-green,Cyan,Azure,Violet,Purple,Magenta,Red"
    }}
    Q:"What's the name of Russian Santa?"
    {{
    "answer": "Ded Moroz",
    "clarification": "Easter Slavic Father Frost"
    }}
    """
    answer = llm.complete(prompt)

    return answer.text

def task_with_text_llm(task: str, text:str) -> str:
    prompt = f"""
    Perform the instruction/task in the user's question. 
    Use only the information provided in the context. 
    
    TASK 
    {task}

    CONTEXT
    {text}

    **IMPORTANT** If the text does not include the SPECIFIC information about the task, output "NOT FOUND"
    Output
    """

    llm_result = llm.complete(prompt)
    
    return llm_result.text

def text_process_llm(task: str, text:str):
    text_splitter = CharacterTextSplitter(
        chunk_size=10000,        
        chunk_overlap=500,     
        separator=""            
    )
    chunks = text_splitter.split_text(text)

    responses = list()
    for chunk in chunks: 
        answer_response = task_with_text_llm(task, chunk)

        responses.append(answer_response)
        
    return responses

In [None]:
text = [web_resource.content for web_resource in web_resources]
text_process_llm(task=capability_plan.subplan[1]["activity"], text=text)

Created a chunk of size 47551, which is longer than the specified 10000


['NOT FOUND',
 'The Polish-language cast members of the TV series "Wszyscy kochają Romana" are:\n\n- Bartłomiej Kasprzykowski as Roman (Ray)\n- Aneta Todorczuk-Perchuć as Dorota (Debra)\n- Anna Seniuk as Maryla (Marie)\n- Joachim Lamża as Zygmunt (Frank)\n- Tede as Robert (Robert)']

In [98]:
# Based on capabilities let an agent do the task 

# Combine task and objective and hand over the execution to the agent 

In [None]:
# Agent's work flow:
#  1. Compose query 
#  2. Do search & collect results 
#  3. Download and process results
#  4. Save results 
#  5. Produce summery of how the data looks like and where to find it.
 

In [None]:
#  Define agents 
from llama_index.core.agent import ReActAgent, FunctionCallingAgent
from llama_index.core.tools import FunctionTool

In [None]:
# question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M."

# task = define_task(question)
# pprint(task.__str__())
