In [None]:
# Imports and Setup
import os
import re
import json
import requests
import pandas as pd
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
from enum import Enum
import textwrap

# LangGraph and LangChain imports
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain_text_splitters import CharacterTextSplitter

from langchain.tools import tool

# LLM imports
from llama_index.llms.ollama import Ollama

# Additional tools
import yt_dlp
import whisper
from PIL import Image
import subprocess

In [None]:
# Configuration and LLM Setup
# The three values below are handed to the Ollama client constructor further down.
# NOTE(review): REQUEST_TIMEOUT is presumably in seconds and CONTEXT_WINDOW in tokens — confirm against the client docs.
REQUEST_TIMEOUT = 180
CONTEXT_WINDOW = 80000
MODEL_NAME = "qwen2:7b"
# Key under which a raw DuckDuckGo result stores its link; also the default
# de-duplication key of drop_non_unique_dicts.
WEB_SOURCE_URL_KEY = "href"

# Initialize Ollama LLM
llm = Ollama(
    model=MODEL_NAME, 
    context_window=CONTEXT_WINDOW, 
    request_timeout=REQUEST_TIMEOUT
)

# Test LLM connection
# This issues a real completion request, so running the cell fails early if the
# model cannot be reached.
test_response = llm.complete("Hello")
print(f"LLM initialized: {test_response.text[:50]}...")

# Configuration
MAX_RETRIES = 3
# Scratch directory; created up front (no error if it already exists).
TEMP_DIR = "./temp_files"
os.makedirs(TEMP_DIR, exist_ok=True)

### Utility functions

In [None]:
def pprint(text, width=130):
    """
    Print `text` wrapped to at most `width` characters per line.

    Each line of the input is wrapped on its own, so line breaks that are
    already present in `text` survive (passing the whole string to
    `textwrap.wrap` would turn them into spaces). Blank input lines are
    printed as blank lines.

    Args:
        text (str): text to print
        width (int): maximum length of a printed line
    """
    for paragraph in text.splitlines():
        wrapped_lines = textwrap.wrap(paragraph, width=width)
        if not wrapped_lines:
            # Whitespace-only line: keep the visual separation.
            print()
            continue
        for line in wrapped_lines:
            print(line)

## Generalist Nodes

The following cells define simple base functions that implement the capabilities of the generalist agent.

### Query classification

In [None]:
# Question Classification System
@dataclass
class QuestionAnalysis:
    """
    Outcome of routing a question: a free-text plan plus one flag per capability.

    The field names, including the spellings `audio_prcessing` and
    `unstructered_data_processing`, are the JSON keys requested in the
    classification prompt of `classify_question`; keep the two in sync.
    """
    question_analysis: str  # free-text description of what answering the question requires
    deep_web_search: bool  # iterative web search is needed
    video_processing: bool  # a video file has to be processed
    audio_prcessing: bool  # an audio file has to be processed
    image_processing: bool  # an image has to be analysed visually
    structured_data_processing: bool  # structured files (CSV, Parquet, JSON, ...) have to be analysed
    unstructered_data_processing: bool  # provided raw text / documents have to be analysed
    code_math_writing: bool  # code generation/execution or computation is needed

def _parse_flag(value: Any) -> bool:
    """Convert a JSON flag (a bool, or a string such as "true"/"false") into a real bool."""
    if isinstance(value, str):
        # bool("false") is True, so string flags must be compared explicitly.
        return value.strip().lower() in ("true", "yes", "1")
    return bool(value)


def _lookup_flag(result: Dict[str, Any], *keys: str) -> bool:
    """Return the flag stored under the first of `keys` present in `result`; raise KeyError if none is."""
    for key in keys:
        if key in result:
            return _parse_flag(result[key])
    raise KeyError(keys[0])


def classify_question(question: str, attachments: Optional[List[str]] = None) -> QuestionAnalysis:
    """
    Use LLM to analyze a question and determine what capabilities/steps are needed.

    Args:
        question (str): text of the question
        attachments (list of str, optional): files that are related to the question

    Returns:
        QuestionAnalysis: dataclass that describes what answering this question
        requires. The capability flags are real booleans, even though the model
        is asked to answer with the strings "true"/"false".

    Raises:
        json.JSONDecodeError: the model output contains no parseable JSON object.
        KeyError: a required key is missing from the parsed object.
    """
    attachment_info = ", ".join(attachments) if attachments else "none"

    classification_prompt = f"""
You are a highly intelligent routing agent. Your primary function is to analyze a user's question and determine the precise capabilities required to answer it accurately and efficiently.

Question: {question}
Attachments: {attachment_info}

**Your Task:**
Analyze the provided question and determine the most logical and efficient plan to answer it using the capabilities listed below. Your analysis must be detailed in a step-by-step plan and then summarized in a series of boolean flags.

**Capabilities:**
- `deep_web_search`: Executing iterative search queries to find websources that answer the query the best, i.e., find and evaluate web content. This steps also often includes SIMPLE synthesize/understanding information from web pages. Use this for questions requiring up-to-date or niche knowledge.
- `video_processing`: Processing a video file to analyze its content, typically by extracting frames for image analysis or audio for transcription/analysis.
- `audio_processing`: Processing an audio file to transcribe speech, identify sounds, or analyze acoustic properties.
- `image_processing`: Visually analyzing an image to identify objects, read text, or understand its content.
- `structured_data_processing`: Analyzing, querying, or visualizing data from structured files like Parquet, CSV, JSON, or databases.
- `unstructured_data_processing`: Performing detailed analysis on a provided block of raw text or multiple (retrieved) documents (e.g., summarization, sentiment analysis, entity extraction, processing multiple pieces of text). This is for analyzing *provided* text.
- `code_math_writing`: Generating or executing code, solving mathematical problems, or performing complex computations.

**Instructions:**
1.  **Analyze and Plan:** First, create a clear, direct and concise description of AI capabilities needed to answer this question, put the answer in the `question_analysis` field.
2.  **Select Minimum Capabilities:** Based on your description/plan, set the corresponding boolean flags to `true`. Only activate the capabilities that are *absolutely necessary* for your plan. For example, a simple fact-lookup might not require `unstructured_data_processing` on top of `deep_web_search`.
3.  **Ensure Consistency:** The capabilities mentioned in your `question_analysis` text MUST perfectly match the boolean flags set to `true`.
4.  **Respond in JSON:** Your entire output must be in the exact JSON format specified below.

**Better Examples:**

Question: "What is the boiling point of water at sea level?"
Analysis:
{{
    "question_analysis": "This is a direct factual query. It requires a single deep web search to look up a well-known scientific constant.",
    "deep_web_search": "true",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "false"
}}

Question: "Summarize the attached meeting notes for me." (with a .txt file attached)
Analysis:
{{
    "question_analysis": "The user has provided a text document and wants a summary. This requires unstructured data processing to read the attached text and generate a concise summary of its key points.",
    "deep_web_search": "false",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "true",
    "code_math_writing": "false"
}}

Question: "Count the number of cars in this video and plot their positions on a heatmap." (with a video file attached)
Analysis:
{{
    "question_analysis": "This is a multi-step task. First, it requires video processing to extract frames from the attached video file. Second, it needs image processing to be run on those frames to detect and count objects identified as 'cars' and log their coordinates. Finally, it requires code and mathematical computations to aggregate these coordinates and generate a heatmap visualization.",
    "deep_web_search": "false",
    "video_processing": "true",
    "audio_prcessing": "false",
    "image_processing": "true",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "true"
}}

---
**Begin Analysis**

Question: "{question}"
Attachments: "{attachment_info}"

Respond in this exact JSON format:
{{
    "question_analysis": "Breakdown of what this question requires, including all necessary capabilities and processes",
    "deep_web_search": "true"/"false",
    "video_processing": "true"/"false",
    "audio_prcessing": "true"/"false",
    "image_processing": "true"/"false",
    "structured_data_processing": "true"/"false",
    "unstructered_data_processing": "true"/"false",
    "code_math_writing": "true"/"false"
}}
"""
    response = llm.complete(classification_prompt)

    response_text = response.text.strip()
    # Models frequently wrap the object in a Markdown fence or add commentary
    # around it, so only the outermost {...} span is handed to the JSON parser.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        response_text = response_text[start:end + 1]
    result = json.loads(response_text)

    # The capability list in the prompt spells two names correctly while the
    # requested JSON keys are misspelled; accept either spelling from the model.
    return QuestionAnalysis(
        question_analysis=result["question_analysis"],
        deep_web_search=_lookup_flag(result, "deep_web_search"),
        video_processing=_lookup_flag(result, "video_processing"),
        audio_prcessing=_lookup_flag(result, "audio_prcessing", "audio_processing"),
        image_processing=_lookup_flag(result, "image_processing"),
        structured_data_processing=_lookup_flag(result, "structured_data_processing"),
        unstructered_data_processing=_lookup_flag(
            result, "unstructered_data_processing", "unstructured_data_processing"
        ),
        code_math_writing=_lookup_flag(result, "code_math_writing"),
    )

### Web Search 

In [None]:
from ddgs import DDGS
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse


def parse_into_list(text: str, separator: str = "|") -> list[str]:
    """
    Split a delimited LLM answer into a list of clean items.

    Surrounding whitespace and enclosing square brackets are removed from the
    whole text; every item is then stripped and empty items are discarded, so
    "[a | b]" yields ["a", "b"] and an empty answer yields [].

    Args:
        text (str): raw answer, e.g. "query one | query two"
        separator (str): delimiter between the items

    Returns:
        list[str]: the non-empty, whitespace-trimmed items in their original order
    """
    body = text.strip().strip("[]")
    trimmed = (item.strip() for item in body.split(separator))
    return [item for item in trimmed if item]


def question_to_query(question: str) -> list[str]:
    """
    Ask the LLM to rephrase a question as search-engine queries.

    The model is instructed to answer with a "|"-separated list; that answer is
    split by `parse_into_list` and returned.

    Args:
        question (str): the question to turn into queries

    Returns:
        list[str]: the candidate queries parsed from the model's answer
    """
    prompt = f"""
    Create a set of general search engine queries for the following question: "{question}". 
    
    Make sure that:
    - Your output is a list separated by "|" sing and nothing else
    - Give maximum of two options where each query should be uniquely phrased 
    - Never use double or single quates anywhere in the answer
    - Do not mention any specific website where the information should be searched
    """

    print("question_to_query prompt: ", prompt)

    # Hand the prompt to the model and split its raw answer into queries
    completion = llm.complete(prompt)
    return parse_into_list(completion.text)

# @tool
def duckduckgo_search(query: str, max_results: int = 5) -> Union[List[Dict[str, Any]], str]:
    """
    Search the web for information using DuckDuckGo search.

    Args:
        query (str): search-engine query
        max_results (int): maximum number of hits to request

    Returns:
        On success, a list with one dict per hit, holding the keys
        "search_order" (1-based rank), "web_page_title", "web_page_summary"
        and "url".
        When nothing is found or the search fails, a human-readable message
        string instead; callers must check the type before iterating.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))

            if not results:
                return f"No search results found for '{query}'"

            return [
                {
                    "search_order": rank,
                    "web_page_title": result.get('title', 'No title'),
                    "web_page_summary": result.get('body', 'No description'),
                    "url": result.get(WEB_SOURCE_URL_KEY, 'No URL'),
                }
                for rank, result in enumerate(results, 1)
            ]

    except Exception as e:
        return f"Error performing web search: {e}"

def drop_non_unique_dicts(lst:list, unique_key:str=WEB_SOURCE_URL_KEY):
    """
    Keep the first dict seen for every distinct value of `unique_key`.

    The value is read with `.get`, so dicts that lack the key all share the
    value None and only the first of them is kept. Input order is preserved.
    """
    first_by_value = {}
    for entry in lst:
        # setdefault stores an entry only if its key value has not been seen yet
        first_by_value.setdefault(entry.get(unique_key), entry)
    return list(first_by_value.values())

def web_search_question(question: str) -> List[Dict[str, Any]]:
    """
    Turn a question into search queries and collect the de-duplicated hits.

    Args:
        question (str): the question to research

    Returns:
        list of dict: search hits as produced by `duckduckgo_search`, with at
        most one entry per URL. Queries that fail or return nothing contribute
        no entries.
    """
    # Disambiguate the question into a query
    candidate_queries = question_to_query(question)

    print("Candidate queries for the question are: ", candidate_queries)

    # Search for relevant sources
    sources = []
    for query in candidate_queries:
        sources_query = duckduckgo_search(query)
        # duckduckgo_search reports "no results" and errors as a plain string;
        # extending the list with a string would add its characters one by one.
        if isinstance(sources_query, str):
            print(f"Skipping query '{query}': {sources_query}")
            continue
        sources.extend(sources_query)

    print("Found resources are: ", sources)
    # The formatted hits store their link under "url", so de-duplicate on that
    # key; the default key is absent from them and would collapse every hit
    # into the first one.
    return drop_non_unique_dicts(sources, unique_key="url")


def extract_text_with_links(raw_html: str, base_url: str) -> str:
    """
    Turn raw HTML into plain text in which hyperlinks appear as Markdown links.

    Script, style and page-chrome elements are dropped. Every `<a href>` with
    visible text becomes `[text](url)`; links without text are removed. Only
    hrefs that start with "/" are resolved against `base_url`, all others are
    kept as written.

    Args:
        raw_html (str): Raw HTML content
        base_url (str): Base URL used to resolve hrefs that start with "/"

    Returns:
        str: Whitespace-normalised text, or "" if the HTML could not be processed
    """
    try:
        document = BeautifulSoup(raw_html, 'html.parser')

        # Drop elements that carry no readable page content
        for unwanted in document(["script", "style", "nav", "footer", "header", "aside"]):
            unwanted.decompose()

        # Swap every anchor for its Markdown representation
        for anchor in document.find_all('a', href=True):
            label = anchor.get_text(separator=" ", strip=True)
            if not label:
                # Nothing to show for this link
                anchor.replace_with("")
                continue

            target = anchor['href']
            if target.startswith('/'):
                target = urllib.parse.urljoin(base_url, target)

            anchor.replace_with(f"[{label}]({target})")

        # Collect the text, which now contains the Markdown links
        raw_text = document.get_text(separator=" ")

        # Normalise whitespace: trim lines, split on double spaces, drop empties
        fragments = []
        for text_line in raw_text.splitlines():
            for phrase in text_line.strip().split("  "):
                phrase = phrase.strip()
                if phrase:
                    fragments.append(phrase)

        return ' '.join(fragments)

    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""

from bs4 import BeautifulSoup

def extract_clean_text(raw_html: str) -> str:
    """
    Turn raw HTML into plain, whitespace-normalised text.

    Script, style and page-chrome elements are dropped before the text is
    collected; link markup disappears because only text nodes are kept.

    Args:
        raw_html (str): Raw HTML content

    Returns:
        str: The extracted text, or "" if the HTML could not be processed
    """
    try:
        document = BeautifulSoup(raw_html, 'html.parser')

        # Drop elements that carry no readable page content
        for unwanted in document(["script", "style", "nav", "footer", "header", "aside"]):
            unwanted.decompose()

        raw_text = document.get_text(separator=" ")

        # Normalise whitespace: trim lines, split on double spaces, drop empties
        fragments = []
        for text_line in raw_text.splitlines():
            for phrase in text_line.strip().split("  "):
                phrase = phrase.strip()
                if phrase:
                    fragments.append(phrase)

        return ' '.join(fragments)

    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""


def get_base_url(url: str) -> str:
    """
    Return the scheme and network location of a URL, joined by "://".

    Example:
        https://en.wikipedia.org/wiki/Prussia  ->  https://en.wikipedia.org

    A URL without a scheme has neither part, so the result is just "://".
    """
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)

def download_text_content(url: str) -> Optional[str]:
    """
    Download a web page and return its readable text.

    Args:
        url (str): address of the page to fetch

    Returns:
        The page text as produced by `extract_clean_text`, or None when the
        download failed (the error is printed, not raised).
    """
    text_content = None

    try:
        print(f"⬇️ Starting to download: {url}")
        # Some sites reject urllib's default "Python-urllib/x.y" agent, so an
        # explicit User-Agent is sent with the request.
        request = urllib.request.Request(
            url,
            headers={"User-Agent": "Mozilla/5.0 (compatible; generalist-agent/0.1)"},
        )
        with urllib.request.urlopen(request, timeout=30) as response:
            html_content = response.read()
            # Links are dropped here; extract_text_with_links(html_content,
            # base_url=get_base_url(url)) is the link-preserving alternative.
            text_content = extract_clean_text(html_content)
    # ToDo: create custom errors
    except urllib.error.URLError as e:
        print(f"❌ URL Error for {url}: {e}")
    except Exception as e:
        print(f"❌ Unexpected error for {url}: {e}")

    return text_content

In [None]:
def inspect_text_with_relation_to_question(question:str, text: str) -> str:
    """
    Ask the LLM whether a piece of page text answers the question.

    The text is echoed to stdout first. The model is told to reply with a JSON
    object whose "status" is "ANSWER_FOUND" or "DEAD_END"; its raw reply is
    returned without being parsed or validated.

    Args:
        question (str): the question being researched
        text (str): page content (or a chunk of it) to inspect

    Returns:
        str: the model's raw answer text
    """
    # Show the content that is about to be analysed
    pprint(f"==========================\nPAGE CONTENT:  {text} \n")

    prompt = f"""
    You are an expert AI research assistant performing a deep analysis of a webpage.
    Your goal is to determine if the provided `page_content` conclusively answers the user's `query`.

    --- USER QUERY ---
    {question}

    --- PAGE CONTENT ---
    {text}

    --- INSTRUCTIONS ---
    Carefully analyze the `page_content` in relation to the `user_query` and follow this process:
    1.  **Decision:** Based on your analysis, decide on one of two possible outcomes:
        a. **ANSWER_FOUND:** The content provides a direct, complete, and trustworthy answer to the query.
        b. **DEAD_END:** The content is irrelevant, low-quality, or does not contain any useful hyperlinks to continue the search.
    2.  **Output:** Respond with a single JSON object with two keys: "status" and "result". The content of "result" depends on the status.

        - If the status is **"ANSWER_FOUND"**:
        The "result" key must contain a string with the final, extracted answer.
        Example: {{"status": "ANSWER_FOUND", "result": "The final answer is X."}}

        - If the status is **"DEAD_END"**:
        The "result" key must contain a string briefly explaining why the search has hit a dead end on this page.
        Example: {{"status": "DEAD_END", "result": "Page is a login form with no relevant information."}}

    **CRITICAL RULE:** Your entire response must be a single, valid JSON object and nothing else. Do not add any text before or after the JSON.
    """
    verdict = llm.complete(prompt)
    return verdict.text

def synthesize_answer(question: str, url: str) -> List[str]:
    """Analyzes a webpage's content, chunk by chunk, to find an answer to a question.

    This function is a core reasoning step in an iterative web search agent.
    The page is downloaded, split into overlapping chunks, and every chunk is
    assessed separately for a conclusive answer to the user's question.

    Args:
        question (str): The original, high-level question the user is trying to
            answer. This provides the context for the analysis.
        url (str): The URL of the specific webpage to download and analyze.

    Returns:
        List of str, one element per text chunk of the page. Each element is the
        model's raw reply, which is requested to be a JSON-formatted string
        representing one of two outcomes:
        1.  **Answer Found**: A JSON object containing the status "ANSWER_FOUND"
            and the extracted answer.
            Example: '{"status": "ANSWER_FOUND", "result": "Mercedes Sosa released 2 studio albums between 2000 and 2009."}'

        2.  **Dead End**: A JSON object with the status "DEAD_END" if the chunk
            is irrelevant.
            Example: '{"status": "DEAD_END", "result": "The page content is not relevant to the query and contains no useful links."}'
        The list is empty when the page could not be downloaded or has no text.
    """
    # Download the content of the page
    page_content = download_text_content(url)
    if not page_content:
        # download_text_content returns None on failure; the splitter cannot
        # handle that, and there is nothing to analyse anyway.
        print(f"No text could be retrieved from {url}; skipping analysis.")
        return []

    text_splitter = CharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=500,
        separator="."
    )

    chunks = text_splitter.split_text(page_content)
    print(f"Splitted text in deep_web_analysis in {len(chunks)} chunks.")

    analysis_responses = []
    for i, chunk in enumerate(chunks):
        print("Starting chunk: ", i)
        analysis_responses.append(inspect_text_with_relation_to_question(question, chunk))
        print("Finished chunk: ", i)

    return analysis_responses

In [None]:
# Example question used to exercise the web-search pipeline end to end.
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."

# Generate search queries for the question, run them and de-duplicate the hits.
search_results = web_search_question(question) 
search_results

# Analyse the candidate pages; the `break` stops after the first one.
for search_result in search_results: 
    page_analysis_result = synthesize_answer(question, search_result["url"])
    print(page_analysis_result)
    break
# Alternative kept for reference: analyse one known URL directly instead of going through the search.
# res = synthesize_answer(question, "https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6")


### Image analysis with Llava 

LLaVA does not usually give accurate results for specific image queries (e.g., counting objects or reading the text shown in an image). It can, however, describe in very general terms what an image shows.

In [None]:
# Image Analyzer Tool with LLaVA Integration
from PIL import Image
import base64
import io
from typing import Literal

@tool
def image_analyzer_llava(image_path: str, task: str = "describe") -> str:
    """
    Analyze images using local LLaVA instance - describe content, analyze chess positions, read text, etc.

    Args:
        image_path: path of the image file to analyze
        task: "describe" for a detailed description, "text" to read visible
            text, or any other free-form instruction that is passed on to the model

    Returns:
        Basic facts about the image followed by the LLaVA answer, or an error
        message if the file is missing or cannot be processed.
    """
    try:
        # Check if file exists
        if not os.path.exists(image_path):
            return f"Image file not found: {image_path}"

        file_size = os.path.getsize(image_path)
        file_name = os.path.basename(image_path)

        # The context manager releases the file handle once the pixels have
        # been re-encoded.
        with Image.open(image_path) as image:
            # Get basic image information
            width, height = image.size
            mode = image.mode
            format_type = image.format

            # Convert image to base64 for API transmission
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Create task-specific prompts
        if task == "describe":
            prompt = "Describe what you see in this image in detail."
        elif task == "text":
            prompt = "Extract and read any text visible in this image."
        else:
            prompt = f"Analyze this image for the following task: {task}"

        llava_response = send_to_llava(img_base64, prompt)

        basic_info = f"Image: {file_name}\nSize: {width}x{height}\nMode: {mode}\nFormat: {format_type}\nFile size: {file_size} bytes"

        return f"{basic_info}\n\nLLaVA Analysis:\n{llava_response}"

    except Exception as e:
        return f"Error analyzing image '{image_path}': {e}"

def send_to_llava(
    image_base64: str,
    prompt: str,
    model: str = "llava",
    endpoint: str = "http://localhost:11434/api/generate",
    timeout: int = 60,
) -> str:
    """
    Send an image and a prompt to a local LLaVA server and return its answer.

    A single non-streaming POST request is issued to `endpoint`.

    Args:
        image_base64 (str): the image, base64-encoded
        prompt (str): instruction for the model
        model (str): model name placed in the request payload
        endpoint (str): URL of the generate endpoint
        timeout (int): request timeout handed to `requests.post`

    Returns:
        str: the "response" field of the JSON reply, or a message describing
        the HTTP status or communication error. Errors are never raised.
    """
    try:
        # Imported inside the try so that a missing package is reported the
        # same way as any other communication problem.
        import requests

        payload = {
            "model": model,
            "prompt": prompt,
            "images": [image_base64],
            "stream": False
        }

        response = requests.post(endpoint, json=payload, timeout=timeout)

        if response.status_code != 200:
            return f"LLaVA API error: {response.status_code}"
        return response.json().get("response", "No response from LLaVA")

    except Exception as e:
        return f"Error communicating with LLaVA: {e}"

### Structured (tabular) data processing

In [None]:
# File Processor Tool
import pandas as pd
import csv

def _summarize_dataframe(df) -> str:
    """Build the shape / columns / numeric totals / first-rows summary shared by all tabular formats."""
    shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
    columns_info = f"Columns: {list(df.columns)}"

    # Calculate totals for numeric columns
    numeric_cols = df.select_dtypes(include=['number']).columns
    totals_info = ""
    if len(numeric_cols) > 0:
        totals_info = f"Column totals: {df[numeric_cols].sum().to_dict()}"

    sample_data = f"First 5 rows:\n{df.head().to_string()}"

    return f"{shape_info}\n{columns_info}\n{totals_info}\n\n{sample_data}"


def _detect_csv_delimiter(file_path: str) -> str:
    """Guess the delimiter of a CSV file from its first 1024 characters."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return csv.Sniffer().sniff(f.read(1024)).delimiter


@tool
def file_processor(file_path: str, file_type: str = "auto") -> str:
    """
    Process various file types - Excel files, CSV files, Parquet files, text files, etc.

    The format is chosen from the file extension, ignoring case. Tabular files
    are summarised (shape, columns, totals of numeric columns, first rows); any
    other file is read as text and reported with line/word/character counts
    and a 500-character preview.

    Args:
        file_path: path of the file to process
        file_type: accepted for interface compatibility; currently ignored

    Returns:
        A textual report about the file, or an error message if the file is
        missing or cannot be processed.
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"

        file_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        # Extensions are compared case-insensitively so that e.g. "DATA.CSV"
        # is still handled as a CSV file.
        lowered_path = file_path.lower()

        if lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
            return f"Excel file: {file_name}\nFile size: {file_size} bytes\n{_summarize_dataframe(df)}"

        if lowered_path.endswith('.csv'):
            try:
                delimiter = _detect_csv_delimiter(file_path)
                df = pd.read_csv(file_path, delimiter=delimiter)
                delimiter_info = f"Detected delimiter: '{delimiter}'"

                return f"CSV file: {file_name}\nFile size: {file_size} bytes\n{delimiter_info}\n{_summarize_dataframe(df)}"

            except Exception as csv_error:
                # Fallback to text processing if CSV parsing fails
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()

                line_count = len(content.split('\n'))
                preview = content[:500] + "..." if len(content) > 500 else content

                return f"CSV file (read as text due to parsing error): {file_name}\nFile size: {file_size} bytes\nLines: {line_count}\nError: {csv_error}\n\nContent preview:\n{preview}"

        if lowered_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
            return f"Parquet file: {file_name}\nFile size: {file_size} bytes\n{_summarize_dataframe(df)}"

        # Read as text file for all other formats (including Python files)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            # Try with different encoding if UTF-8 fails
            with open(file_path, 'r', encoding='latin-1') as f:
                content = f.read()

        line_count = len(content.split('\n'))
        word_count = len(content.split())
        char_count = len(content)

        preview = content[:500] + "..." if len(content) > 500 else content

        file_type_desc = "Python file" if lowered_path.endswith('.py') else "Text file"

        return f"{file_type_desc}: {file_name}\nFile size: {file_size} bytes\nLines: {line_count}\nWords: {word_count}\nCharacters: {char_count}\n\nContent preview:\n{preview}"

    except Exception as e:
        return f"Error processing file '{file_path}': {e}"

print("File processor tool implemented successfully")