In [None]:
# Imports and Setup
import os
import re
import json
import requests
import pandas as pd
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
from enum import Enum
import textwrap

# LangGraph and LangChain imports
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain_text_splitters import CharacterTextSplitter

from langchain.tools import tool

# LLM imports
from llama_index.llms.ollama import Ollama

# Additional tools
import yt_dlp
import whisper
from PIL import Image
import subprocess

# RAG system imports
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
import uuid

In [20]:
# Configuration and LLM Setup
REQUEST_TIMEOUT = 180
MODEL_NAME = "qwen2:7b"
WEB_SOURCE_URL_KEY = "href"

# Initialize Ollama LLM
llm = Ollama(
    model=MODEL_NAME, 
    request_timeout=REQUEST_TIMEOUT
)

# Test LLM connection
test_response = llm.complete("Hello")
print(f"LLM initialized: {test_response.text[:50]}...")

# Configuration
MAX_RETRIES = 3
TEMP_DIR = "./temp_files"
os.makedirs(TEMP_DIR, exist_ok=True)

# Initialize RAG system components
print("Initializing RAG system...")

# Initialize sentence transformer for embeddings
embedding_model = SentenceTransformer('allenai/longformer-base-4096')
embedding_dimension = 768  # Dimension the embedding model 

# Initialize Qdrant in-memory client
qdrant_client = QdrantClient(":memory:")
collection_name = "web_content"
try:
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
    )
    print(f"Created Qdrant collection: {collection_name}")
except Exception as e:
    print(f"Failed to create Qdrant collection: {e}")

print("RAG system initialized successfully!")

LLM initialized: Hello! How can I assist you today?...
Initializing RAG system...


No sentence-transformers model found with name allenai/longformer-base-4096. Creating a new one with mean pooling.


Created Qdrant collection: web_content
RAG system initialized successfully!


In [38]:
# Scroll through the collection
scroll_result, next_page = qdrant_client.scroll(
    collection_name=collection_name,
    limit=10,              # how many points to fetch at once
    with_payload=True,     # include stored payload
    with_vectors=False     # set True if you also want full vectors
)
len(scroll_result[0].payload["text"])


2000

In [40]:
emb_ve = embedding_model.encode("studio albums")
res = qdrant_client.search(collection_name=collection_name, query_vector=emb_ve)
res

  res = qdrant_client.search(collection_name=collection_name, query_vector=emb_ve)


[ScoredPoint(id='e4a40781-6850-4dcf-8533-fd1d8cdb9d0b', version=0, score=0.9218921661376953, payload={'url': 'https://en.wikipedia.org/wiki/Mercedes_Sosa', 'chunk_index': 10, 'text': 's Sosa sang on YouTube . Retrieved 3 March 2010. ^ a b c d e f g h "Latin artist Mercedes Sosa dies" . BBC . 4 October 2009 . Retrieved 5 October 2009 . ^ Karush, Matthew (2017). Musicians in Transit: Argentina and the Globalization of Popular Music . Duke. p.\xa0168. ISBN \xa0 978-0-8223-7377-3 . ^ a b Associated Press [ dead link ] ^ a b "Biografía" . Fundación Mercedes Sosa (in Spanish) . Retrieved 8 March 2025 . ^ "El folclore argentino llora la muerte de Daniel Toro - Notas - Viva la Radio" . Cadena 3 Argentina (in Spanish) . Retrieved 14 March 2025 . ^ Nilsson, Leopoldo Torre (7 April 1971), Güemes - la tierra en armas (Drama, History), Alfredo Alcón, Norma Aleandro, Gabriela Gili, Producciones Cinematográficas Cerrillos , retrieved 8 March 2025 ^ Rodrigo (10 September 2020). "Patricio Manns: Cuando

### Utility functions

In [22]:
def pprint(text):
    wrapped_lines = textwrap.wrap(text, width=130)
    for line in wrapped_lines:
        print(line)

## Generalist Nodes

The following code will be simple base functions that will describe capabilities of the generalist agent.

### Query classification

In [23]:
# Question Classification System
@dataclass
class QuestionAnalysis:
    question_analysis: str
    deep_web_search: bool  
    video_processing: bool 
    audio_prcessing: bool 
    image_processing: bool 
    structured_data_processing: bool    
    unstructered_data_processing: bool  
    code_math_writing: bool 

def classify_question(question: str, attachments: List[str] = None) -> QuestionAnalysis:
    """
    Use LLM to analyze a question and determine what capabilities/steps are needed.
    
    Args:
     question (str): text of the question 
     attachments (str): list of files that are related to the question 

    Returns: 
        QuestionAnalysis: dataclass that describes what answering this question requires 
    """
    attachment_info = ""
    if attachments:
        attachment_info = f"\n\nAttachments mentioned: {', '.join(attachments)}"
    
    classification_prompt = f"""
You are a highly intelligent routing agent. Your primary function is to analyze a user's question and determine the precise capabilities required to answer it accurately and efficiently.

Question: {question}
Attachments: {attachment_info}

**Your Task:**
Analyze the provided question and determine the most logical and efficient plan to answer it using the capabilities listed below. Your analysis must be detailed in a step-by-step plan and then summarized in a series of boolean flags.

**Capabilities:**
- `deep_web_search`: Executing iterative search queries to find websources that answer the query the best, i.e., find and evaluate web content. This steps also often includes SIMPLE synthesize/understanding information from web pages. Use this for questions requiring up-to-date or niche knowledge.
- `video_processing`: Processing a video file to analyze its content, typically by extracting frames for image analysis or audio for transcription/analysis.
- `audio_processing`: Processing an audio file to transcribe speech, identify sounds, or analyze acoustic properties.
- `image_processing`: Visually analyzing an image to identify objects, read text, or understand its content.
- `structured_data_processing`: Analyzing, querying, or visualizing data from structured files like Parquet, CSV, JSON, or databases.
- `unstructured_data_processing`: Performing detailed analysis on a provided block of raw text or multiple (retrieved) documents (e.g., summarization, sentiment analysis, entity extraction, processing multiple pieces of text). This is for analyzing *provided* text.
- `code_math_writing`: Generating or executing code, solving mathematical problems, or performing complex computations.

**Instructions:**
1.  **Analyze and Plan:** First, create a clear, direct and concise description of AI capabilities needed to answer this question, put the answer in the `question_analysis` field.
2.  **Select Minimum Capabilities:** Based on your description/plan, set the corresponding boolean flags to `true`. Only activate the capabilities that are *absolutely necessary* for your plan. For example, a simple fact-lookup might not require `unstructured_data_processing` on top of `deep_web_search`.
3.  **Ensure Consistency:** The capabilities mentioned in your `question_analysis` text MUST perfectly match the boolean flags set to `true`.
4.  **Respond in JSON:** Your entire output must be in the exact JSON format specified below.

**Better Examples:**

Question: "What is the boiling point of water at sea level?"
Analysis:
{{
    "question_analysis": "This is a direct factual query. It requires a single deep web search to look up a well-known scientific constant.",
    "deep_web_search": "true",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "false"
}}

Question: "Summarize the attached meeting notes for me." (with a .txt file attached)
Analysis:
{{
    "question_analysis": "The user has provided a text document and wants a summary. This requires unstructured data processing to read the attached text and generate a concise summary of its key points.",
    "deep_web_search": "false",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "true",
    "code_math_writing": "false"
}}

Question: "Count the number of cars in this video and plot their positions on a heatmap." (with a video file attached)
Analysis:
{{
    "question_analysis": "This is a multi-step task. First, it requires video processing to extract frames from the attached video file. Second, it needs image processing to be run on those frames to detect and count objects identified as 'cars' and log their coordinates. Finally, it requires code and mathematical computations to aggregate these coordinates and generate a heatmap visualization.",
    "deep_web_search": "false",
    "video_processing": "true",
    "audio_prcessing": "false",
    "image_processing": "true",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "true"
}}

---
**Begin Analysis**

Question: "{question}"
Attachments: "{attachment_info}"

Respond in this exact JSON format:
{{
    "question_analysis": "Breakdown of what this question requires, including all necessary capabilities and processes",
    "deep_web_search": "true"/"false",
    "video_processing": "true"/"false",
    "audio_prcessing": "true"/"false",
    "image_processing": "true"/"false",
    "structured_data_processing": "true"/"false",
    "unstructered_data_processing": "true"/"false",
    "code_math_writing": "true"/"false"
}}
"""
    response = llm.complete(classification_prompt)

    response_text = response.text.strip()
    result = json.loads(response_text)
    return QuestionAnalysis(
        question_analysis=result["question_analysis"],
        deep_web_search=result["deep_web_search"],
        video_processing=result["video_processing"], 
        audio_prcessing=result["audio_prcessing"], 
        image_processing=result["image_processing"], 
        structured_data_processing=result["structured_data_processing"],    
        unstructered_data_processing=result["unstructered_data_processing"],  
        code_math_writing=result["code_math_writing"], 
    )

### Web Search 

In [24]:
from ddgs import DDGS
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse


def parse_into_list(text:str, separator: str = "|") -> list[str]:
    """
    """
    return text.strip().strip("[]").split(separator)


def question_to_query(question: str) -> list[str]:
    """ 
    """
    prompt = f"""
    Create a set of general search engine queries for the following question: "{question}". 
    
    Make sure that:
    - Your output is a list separated by "|" sing and nothing else
    - Give maximum of two options where each query should be uniquely phrased 
    - Never use double or single quates anywhere in the answer
    - Do not mention any specific website where the information should be searched
    """

    print("question_to_query prompt: ", prompt)
    # Send this to an llm 
    query_responses = llm.complete(prompt)
    
    # Parse the response 
    return parse_into_list(query_responses.text)

# @tool
def duckduckgo_search(query: str, max_results: int = 5) -> str:
    """
    Search the web for information using DuckDuckGo search.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))
            
            if not results:
                return f"No search results found for '{query}'"

            formatted_results = []
            for i, result in enumerate(results, 1):
                title = result.get('title', 'No title')
                body = result.get('body', 'No description')
                href = result.get(WEB_SOURCE_URL_KEY, 'No URL')
                
                formatted_results.append({"search_order": i, "web_page_title": title, "web_page_summary": body, "url": href})
            return formatted_results
    
    except Exception as e:
        return f"Error performing web search: {e}"

def drop_non_unique_dicts(lst:list, unique_key:str=WEB_SOURCE_URL_KEY):
    seen_hrefs = set()
    result_list = []
    
    for item in lst:
        href = item.get(unique_key)
        
        if href not in seen_hrefs:
            result_list.append(item)
            seen_hrefs.add(href)
            
    return result_list

def web_search_question(question: str, web_search_links_per_query: int=3):
    """ 
    """
    # Disambiguate the question into a query
    candidate_queries = question_to_query(question)
    
    print("Candidate queries for the question are: ", candidate_queries)

    # Search for relevant sources 
    sources = list()
    for query in candidate_queries:
        sources_query = duckduckgo_search(query, web_search_links_per_query)
        sources.extend(sources_query)
    
    print("Found resources are: ", sources)
    return drop_non_unique_dicts(sources)


def extract_text_with_links(raw_html: str, base_url: str) -> str:
    """
    Extract clean text from raw HTML, converting hyperlinks to Markdown format.
    
    Args:
        raw_html (str): Raw HTML content
        base_url (str): Base URL to resolve relative links (e.g., Wikipedia base)
        
    Returns:
        str: Extracted text with hyperlinks in Markdown format: [text](url)
    """
    try:
        soup = BeautifulSoup(raw_html, 'html.parser')
        
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()
        
        # Convert all <a> tags to Markdown-style links
        for a_tag in soup.find_all('a', href=True):
            link_text = a_tag.get_text(separator=" ", strip=True)
            if not link_text:  # Skip if link has no text
                a_tag.replace_with("")
                continue
                
            href = a_tag['href']
            
            # Resolve relative URLs
            if href.startswith('/'):
                full_url = urllib.parse.urljoin(base_url, href)
            else:
                full_url = href

            # Replace the <a> tag with Markdown link
            markdown_link = f"[{link_text}]({full_url})"
            a_tag.replace_with(markdown_link)
        
        # Extract all text (now with Markdown links)
        text = soup.get_text(separator=" ")

        # Clean up whitespace: remove extra spaces and newlines
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
        
        return cleaned_text
    
    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""

from bs4 import BeautifulSoup

def extract_clean_text(raw_html: str) -> str:
    """
    Extract clean text from raw HTML, removing hyperlinks and unwanted elements.
    
    Args:
        raw_html (str): Raw HTML content
        
    Returns:
        str: Extracted plain text with no hyperlinks
    """
    try:
        soup = BeautifulSoup(raw_html, 'html.parser')
        
        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        # Extract all visible text
        text = soup.get_text(separator=" ")

        # Clean up whitespace: remove extra spaces and newlines
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)
        
        return cleaned_text
    
    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""


def get_base_url(url: str) -> str:
    """
    Extract base URL (scheme + netloc) from a full URL.
    Example:
        Input:  https://en.wikipedia.org/wiki/Prussia
        Output: https://en.wikipedia.org
    """                                                                                         
    parsed = urlparse(url)
    return f"{parsed.scheme}://{parsed.netloc}"

def download_text_content(url: str):
    """
    Download and process content from approved sources.
    """
    text_content = None
    
    try:
        print(f"⬇️ Starting to download: {url}")
        with urllib.request.urlopen(url, timeout=30) as response:
            html_content = response.read()
            text_content = extract_clean_text(html_content) #extract_text_with_links(html_content, base_url=get_base_url(url))         
    # ToDo: create custom errors    
    except urllib.error.URLError as e:
        print(f"❌ URL Error for {url}: {e}")
    except Exception as e:
        print(f"❌ Unexpected error for {url}: {e}")
    
    return text_content

In [25]:
def preprocess_text_from_wiki(raw_text: str) -> str:
    import mwparserfromhell
    wikicode = mwparserfromhell.parse(raw_text)
    return str(wikicode)


def preprocess_text_w_llm(raw_text: str) -> str:
    """
    Preprocess text to be in Markdown format by making a call to Ollama model and asking it to clean up the format.
    
    Args:
        raw_text (str): Raw text content to be preprocessed
        
    Returns:
        str: Clean text formatted in Markdown with proper structure
    """
    preprocessing_prompt = f"""
You are an expert text formatter and technical writer. Your task is to convert the provided raw text into clean, well-structured Markdown format.

**Instructions:**
1. **General Formatting**: Convert the text to proper Markdown format with appropriate headers, lists, emphasis, and structure
2. **Tables**: If you find any tabular data or information that looks like a table:
    - Convert it to JSON format in a code block
    - Add a descriptive annotation before the JSON explaining what the table contains
    - Use this format:
    ```
    **Table Description: [Brief description of what this data represents]**
    ```json
    {{
        "data": [your JSON structure here]
    }}
    ```
3. **Equations**: If you find mathematical equations or formulas:
    - Format them properly using Markdown/LaTeX syntax when possible
    - Add annotations explaining what each equation represents
    - Use this format:
    ```
    **Equation: [Brief description of what this equation represents]**
    $$equation here$$
    ```
4. **Lists**: Convert any list-like content to proper Markdown lists
5. **Headers**: Create appropriate heading hierarchy using # ## ### etc.
6. **Emphasis**: Use **bold** and *italic* appropriately for important terms
7. **Code**: Wrap any code snippets in appropriate code blocks with language specification
8. **Links**: Preserve and properly format any URLs or references

**Quality Standards:**
- Maintain all original information and data
- Ensure the output is readable and well-organized
- Use consistent formatting throughout
- Remove any formatting artifacts or noise from the original text
- Preserve the logical structure and flow of information

**Raw Text to Process:**
{raw_text}

**Output the cleaned Markdown version:**
"""
        
    print("Processing text with Ollama for Markdown formatting...")
    response = llm.complete(preprocessing_prompt)
    cleaned_text = response.text.strip()
    
    return cleaned_text
        

def embed_and_store_document(text: str, url: str, metadata: dict = None) -> str:
    """
    Embed document text and store it in Qdrant vector store.
    
    Args:
        text (str): The text content to embed and store
        url (str): Source URL of the document
        metadata (dict): Additional metadata to store with the document
        
    Returns:
        str: Document ID that was stored
    """
    try:
        # Split text into chunks for better retrieval
        text_splitter = CharacterTextSplitter(
            chunk_size=2000,        
            chunk_overlap=500,     
            separator=""            
        )
        chunks = text_splitter.split_text(text)
        
        for i, chunk in enumerate(chunks):
            # Preprocess chunk
            cleaned_chunk = preprocess_text_from_wiki(chunk) #preprocess_text_w_llm(chunk)
            
            # Generate embedding for the chunk
            embedding = embedding_model.encode(cleaned_chunk).tolist()
            
            # Create document ID
            doc_id = str(uuid.uuid4())
            
            # Prepare metadata
            chunk_metadata = {
                "url": url,
                "chunk_index": i,
                "text": chunk,
                "total_chunks": len(chunks)
            }
            if metadata:
                chunk_metadata.update(metadata)
            
            # Store in Qdrant
            point = PointStruct(
                id=doc_id,
                vector=embedding,
                payload=chunk_metadata
            )
            
            qdrant_client.upsert(
                collection_name=collection_name,
                points=[point]
            )
        
        print(f"Stored {len(chunks)} chunks from {url}")
    except Exception as e:
        print(f"Error storing document: {e}")

def search_vector_store(query: str, limit: int = 5) -> List[Dict]:
    """
    Search the vector store for relevant documents.
    
    Args:
        query (str): Search query
        limit (int): Maximum number of results to return
        
    Returns:
        List[Dict]: List of relevant documents with metadata
    """
    try:
        # Embed the query
        query_embedding = embedding_model.encode(query).tolist()
        
        # Search in Qdrant
        search_results = qdrant_client.search(
            collection_name=collection_name,
            query_vector=query_embedding,
            limit=limit,
            with_payload=True
        )
        
        # Format results
        results = []
        for result in search_results:
            results.append({
                "score": result.score,
                "text": result.payload.get("text", ""),
                "url": result.payload.get("url", ""),
                "chunk_index": result.payload.get("chunk_index", 0),
                "metadata": result.payload
            })
        
        return results
        
    except Exception as e:
        print(f"Error searching vector store: {e}")
        return []

def answer_from_rag(question: str, context_limit: int = 3) -> str:
    """
    Answer a question using RAG approach - search vector store and generate answer.
    
    Args:
        question (str): The question to answer
        context_limit (int): Maximum number of context chunks to use
        
    Returns:
        str: Generated answer based on retrieved context
    """
    try:
        # Search for relevant documents
        relevant_docs = search_vector_store(question, limit=context_limit)
        
        if not relevant_docs:
            return "No relevant information found in the vector store."
        
        # Combine context from relevant documents
        context_pieces = []
        urls_used = set()
        
        for doc in relevant_docs:
            context_pieces.append(f"Source: {doc['url']}\nContent: {doc['text']}")
            urls_used.add(doc['url'])
        
        combined_context = "\n\n---\n\n".join(context_pieces)
        
        # Generate answer using LLM with retrieved context
        prompt = f"""
        Based on the following retrieved information, answer the user's question. 
        Use only the information provided in the context. If the answer cannot be found 
        in the context, say so explicitly.

        QUESTION: {question}

        RETRIEVED CONTEXT:
        {combined_context}

        ANSWER:"""
        
        response = llm.complete(prompt)
        
        # Add source information
        sources_info = f"\n\nSources used: {', '.join(urls_used)}"
        
        return response.text + sources_info
        
    except Exception as e:
        return f"Error generating RAG answer: {e}"

print("RAG system functions implemented successfully!")

RAG system functions implemented successfully!


In [26]:
def synthesize_answer_rag(question: str, url: str) -> str:
    """
    RAG-based web page analysis that stores content in vector store and queries it.
    
    Args:
        question (str): The original question to answer
        url (str): The URL of the webpage to download and analyze
        
    Returns:
        str: Answer generated using RAG approach
    """
    try:
        # Download the content of the page 
        page_content = download_text_content(url)
        
        if not page_content:
            return f"Could not retrieve content from {url}"
        
        # Store the content in vector store
        embed_and_store_document(page_content, url)
        print(f"Seemed like successfully stored content from {url} in vector store")
        
        # Now use RAG to answer the question
        rag_answer = answer_from_rag(question)
        
        return rag_answer
        
    except Exception as e:
        return f"Error in RAG-based synthesis for {url}: {e}"

def web_search_with_rag(question: str, web_search_links_per_query: int = 3) -> str:
    """
    Enhanced web search that uses RAG system to store and query multiple web pages.
    
    Args:
        question (str): The question to answer
        
    Returns:
        str: Final answer based on RAG analysis of multiple sources
    """
    try:
        # Get search results
        search_results = web_search_question(question, web_search_links_per_query)
        
        if not search_results:
            return "No search results found for the question."
        
        print(f"Found search results:\n{search_results} ")
        
        # Process each search result and store in vector store
        for i, search_result in enumerate(search_results[:3]):  # Limit to top 3 results
            print(f"Processing result {i+1}: {search_result['url']}")
            try:
                page_content = download_text_content(search_result["url"])
                if page_content:
                    # Add search result metadata
                    metadata = {
                        "title": search_result.get("web_page_title", ""),
                        "summary": search_result.get("web_page_summary", ""),
                        "search_order": search_result.get("search_order", i+1)
                    }
                    embed_and_store_document(page_content, search_result["url"], metadata)
                else:
                    print(f"Could not retrieve content from {search_result['url']}")
            except Exception as e:
                print(f"Error processing {search_result['url']}: {e}")
        
        # Generate final answer using RAG
        final_answer = answer_from_rag(question, context_limit=5)
        
        return final_answer
        
    except Exception as e:
        return f"Error in RAG-based web search: {e}"

print("RAG-enhanced web search functions implemented successfully!")

RAG-enhanced web search functions implemented successfully!


In [27]:
# question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."
# search_results = web_search_question(question, web_search_links_per_query=3)

# if not search_results:
#     print("!!!No search results found for the question!!!")

# print(f"Found search results:\n{search_results}")

# # Process each search result and store in vector store
# for i, search_result in enumerate(search_results[:3]):  # Limit to top 3 results
#     print(f"Processing result {i+1}: {search_result['url']}")
#     try:
#         page_content = download_text_content(search_result["url"])
#         if page_content:
#             # Add search result metadata
#             metadata = {
#                 "title": search_result.get("web_page_title", ""),
#                 "summary": search_result.get("web_page_summary", ""),
#                 "search_order": search_result.get("search_order", i+1)
#             }
#             embed_and_store_document(page_content, search_result["url"], metadata)
#         else:
#             print(f"Could not retrieve content from {search_result['url']}")
#     except Exception as e:
#         print(f"Error processing {search_result['url']}: {e}")

# search_vec = embedding_model.encode("studio albums")

# search_results = qdrant_client.search(
#     collection_name=collection_name,
#     query_vector=search_vec,
#     with_payload=True
# )
# search_results

In [28]:
# Test the RAG-based web search system
question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."

print("Testing RAG-based web search...")
print(f"Question: {question}")
print("=" * 80)

# Use the new RAG-based web search
rag_answer = web_search_with_rag(question)
print("\nRAG-based Answer:")
print(rag_answer)

Testing RAG-based web search...
Question: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.
question_to_query prompt:  
    Create a set of general search engine queries for the following question: "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.". 

    Make sure that:
    - Your output is a list separated by "|" sing and nothing else
    - Give maximum of two options where each query should be uniquely phrased 
    - Never use double or single quates anywhere in the answer
    - Do not mention any specific website where the information should be searched
    
Candidate queries for the question are:  ['"number of studio albums released by Mercedes Sosa between 2000 and 2009" ', ' "Mercedes Sosa\'s total studio album count from 2000 to 2009"']
Found resources are:  [{'search_order': 1, 'web_pa

  search_results = qdrant_client.search(



RAG-based Answer:
The retrieved information does not explicitly state the number of studio albums published by Mercedes Sosa between 2000 and 2009. Therefore, I cannot provide an explicit answer based on the context given. The focus seems to be more on her discography as a whole rather than within that specific time frame.

Sources used: https://en.wikipedia.org/wiki/Mercedes_Sosa


In [29]:
# Test RAG-based analysis of a specific URL
print("\nTesting RAG-based analysis of specific URL...")
specific_url = "https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6"
rag_result = synthesize_answer_rag(question, specific_url)
print(f"\nRAG Analysis Result for {specific_url}:")
print(rag_result)


Testing RAG-based analysis of specific URL...
⬇️ Starting to download: https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6
Stored 4 chunks from https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6
Seemed like successfully stored content from https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6 in vector store


  search_results = qdrant_client.search(



RAG Analysis Result for https://en.wikipedia.org/w/index.php?title=Mercedes_Sosa&action=edit&section=6:
The context provided does not contain information about the exact number of studio albums published by Mercedes Sosa between 2000 and 2009.

Sources used: https://en.wikipedia.org/wiki/Mercedes_Sosa


### Image analysis with Llava 

Llava does not usually provide accurate and correct results for specific queries related to images (e.g., tasks like counting object or describing what text is on the image). It can however describe in very very general terms what is on the image.

In [30]:
# Image Analyzer Tool with LLaVA Integration
from PIL import Image
import base64
import io
from typing import Literal

@tool
def image_analyzer_llava(image_path: str, task: str = Literal["describe", "text"]) -> str:
    """
    Analyze images using local LLaVA instance - describe content, analyze chess positions, read text, etc.
    """
    try:
        # Check if file exists
        if not os.path.exists(image_path):
            return f"Image file not found: {image_path}"
        
        # Load and process image
        image = Image.open(image_path)
        
        # Get basic image information
        width, height = image.size
        mode = image.mode
        format_type = image.format
        file_size = os.path.getsize(image_path)
        file_name = os.path.basename(image_path)
        
        # Convert image to base64 for API transmission
        buffered = io.BytesIO()
        image.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()
        
        # Create task-specific prompts
        if task == "describe":
            prompt = "Describe what you see in this image in detail."
        elif task == "text":
            prompt = "Extract and read any text visible in this image."
        else:
            prompt = f"Analyze this image for the following task: {task}"
        
        # Placeholder for LLaVA API call
        llava_response = send_to_llava(img_base64, prompt)
        
        basic_info = f"Image: {file_name}\nSize: {width}x{height}\nMode: {mode}\nFormat: {format_type}\nFile size: {file_size} bytes"
        
        return f"{basic_info}\n\nLLaVA Analysis:\n{llava_response}"
    
    except Exception as e:
        return f"Error analyzing image '{image_path}': {e}"

def send_to_llava(image_base64: str, prompt: str) -> str:
    """
    Placeholder function to send image to local LLaVA instance.
    Replace this with actual API call to your LLaVA server.
    """
    try:
        # Placeholder for actual LLaVA API integration
        # This would typically be a POST request to localhost:11434 or similar
        
        # Example of what the actual implementation might look like:
        import requests
        
        payload = {
            "model": "llava",
            "prompt": prompt,
            "images": [image_base64],
            "stream": False
        }
        
        response = requests.post("http://localhost:11434/api/generate", 
                               json=payload, 
                               timeout=60)
        
        if response.status_code == 200:
            return response.json().get("response", "No response from LLaVA")
        else:
            return f"LLaVA API error: {response.status_code}"
    
    except Exception as e:
        return f"Error communicating with LLaVA: {e}"

### Structured (tabular) data processing

In [31]:
# File Processor Tool
import pandas as pd
import csv

@tool
def file_processor(file_path: str, file_type: str = "auto") -> str:
    """
    Process various file types - Excel files, CSV files, Parquet files, text files, etc.
    """
    try:
        # Check if file exists
        if not os.path.exists(file_path):
            return f"File not found: {file_path}"
        
        file_name = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        
        if file_path.endswith(('.xlsx', '.xls')):
            # Process Excel files
            df = pd.read_excel(file_path)
            
            # Basic analysis
            shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
            columns_info = f"Columns: {list(df.columns)}"
            
            # Calculate totals for numeric columns
            numeric_cols = df.select_dtypes(include=['number']).columns
            totals_info = ""
            if len(numeric_cols) > 0:
                totals = df[numeric_cols].sum()
                totals_info = f"Column totals: {totals.to_dict()}"
            
            sample_data = f"First 5 rows:\n{df.head().to_string()}"
            
            return f"Excel file: {file_name}\nFile size: {file_size} bytes\n{shape_info}\n{columns_info}\n{totals_info}\n\n{sample_data}"
        
        elif file_path.endswith('.csv'):
            # Process CSV files with automatic delimiter detection
            def detect_delimiter(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    sample = f.read(1024)
                    sniffer = csv.Sniffer()
                    delimiter = sniffer.sniff(sample).delimiter
                    return delimiter
            
            try:
                delimiter = detect_delimiter(file_path)
                df = pd.read_csv(file_path, delimiter=delimiter)
                
                # Basic analysis
                shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
                columns_info = f"Columns: {list(df.columns)}"
                delimiter_info = f"Detected delimiter: '{delimiter}'"
                
                # Calculate totals for numeric columns
                numeric_cols = df.select_dtypes(include=['number']).columns
                totals_info = ""
                if len(numeric_cols) > 0:
                    totals = df[numeric_cols].sum()
                    totals_info = f"Column totals: {totals.to_dict()}"
                
                sample_data = f"First 5 rows:\n{df.head().to_string()}"
                
                return f"CSV file: {file_name}\nFile size: {file_size} bytes\n{delimiter_info}\n{shape_info}\n{columns_info}\n{totals_info}\n\n{sample_data}"
            
            except Exception as csv_error:
                # Fallback to text processing if CSV parsing fails
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                
                line_count = len(content.split('\n'))
                preview = content[:500] + "..." if len(content) > 500 else content
                
                return f"CSV file (read as text due to parsing error): {file_name}\nFile size: {file_size} bytes\nLines: {line_count}\nError: {csv_error}\n\nContent preview:\n{preview}"
        
        elif file_path.endswith('.parquet'):
            # Process Parquet files
            df = pd.read_parquet(file_path)
            
            # Basic analysis
            shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
            columns_info = f"Columns: {list(df.columns)}"
            
            # Calculate totals for numeric columns
            numeric_cols = df.select_dtypes(include=['number']).columns
            totals_info = ""
            if len(numeric_cols) > 0:
                totals = df[numeric_cols].sum()
                totals_info = f"Column totals: {totals.to_dict()}"
            
            sample_data = f"First 5 rows:\n{df.head().to_string()}"
            
            return f"Parquet file: {file_name}\nFile size: {file_size} bytes\n{shape_info}\n{columns_info}\n{totals_info}\n\n{sample_data}"
        
        else:
            # Read as text file for all other formats (including Python files)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Try with different encoding if UTF-8 fails
                with open(file_path, 'r', encoding='latin-1') as f:
                    content = f.read()
            
            line_count = len(content.split('\n'))
            word_count = len(content.split())
            char_count = len(content)
            
            preview = content[:500] + "..." if len(content) > 500 else content
            
            file_type_desc = "Python file" if file_path.endswith('.py') else "Text file"
            
            return f"{file_type_desc}: {file_name}\nFile size: {file_size} bytes\nLines: {line_count}\nWords: {word_count}\nCharacters: {char_count}\n\nContent preview:\n{preview}"
    
    except Exception as e:
        return f"Error processing file '{file_path}': {e}"

print("File processor tool implemented successfully")

File processor tool implemented successfully
