In [None]:
# Imports and Setup
import os
import re
import json
import requests
import pandas as pd
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass
from enum import Enum
import textwrap

# LangGraph and LangChain imports
from langgraph.graph import StateGraph, END
from langgraph.graph.message import add_messages
from typing import Annotated
from typing_extensions import TypedDict
from langchain_core.messages import HumanMessage, AIMessage
from langchain_text_splitters import CharacterTextSplitter

from langchain.tools import tool

# LLM imports
from llama_index.llms.ollama import Ollama

# Additional tools
import yt_dlp
import whisper
from PIL import Image
import subprocess

# RAG system imports
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PointStruct
from sentence_transformers import SentenceTransformer
import uuid

In [None]:
# Configuration and LLM Setup
REQUEST_TIMEOUT = 180  # passed to the Ollama client as `request_timeout`
# Candidate local models (start one with Ollama before selecting it below):
# ollama run qwen2:7b
# ollama run phi4:14b
# ollama run deepseek-r1:14b
# ollama run qwen2.5:14b
#  try Falcon3-10B-Instruct ?
MODEL_NAME = "qwen2.5:14b"
# Key under which a search result / resource dict stores its URL.
WEB_SOURCE_URL_KEY = "href"

CHUNK_SIZE_TEXT = 10000  # chunk size used when feeding raw text to the LLM
CHUNK_SIZE_RAG = 4000    # chunk size used when embedding text into the vector store

# Initialize Ollama LLM
llm = Ollama(
    model=MODEL_NAME,
    request_timeout=REQUEST_TIMEOUT
)

# Test LLM connection (fails fast here if the model is not reachable)
test_response = llm.complete("Hello")
print(f"LLM initialized: {test_response.text[:50]}...")

# Configuration
MAX_RETRIES = 3
TEMP_DIR = "./temp_files"
os.makedirs(TEMP_DIR, exist_ok=True)

# Initialize RAG system components
print("Initializing RAG system...")

# Initialize sentence transformer for embeddings
embedding_model = SentenceTransformer('allenai/longformer-base-4096')
# Vector size of the Qdrant collection; must match what `embedding_model` produces.
embedding_dimension = 768

# Initialize Qdrant in-memory client
qdrant_client = QdrantClient(":memory:")
collection_name = "web_content"
try:
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=embedding_dimension, distance=Distance.COSINE)
    )
except Exception as e:
    print(f"Failed to create Qdrant collection: {e}")
else:
    # Only report success when the collection really exists; previously the
    # success message was printed even after a failed creation.
    print(f"Created Qdrant collection: {collection_name}")
    print("RAG system initialized successfully!")

### Utility functions

In [None]:
def pprint(text):
    """Print `text` re-wrapped to lines of at most 130 characters.

    Nothing is printed when wrapping yields no lines (e.g. empty input).
    """
    wrapped = textwrap.wrap(text, width=130)
    if wrapped:
        print("\n".join(wrapped))

## Generalist Nodes

The following cells define simple base functions that implement the capabilities of the generalist agent.

### Defining the task
Involves:
1. Defining what is asked  
2. Defining the steps to answer the question: planning stage

In [None]:
def create_plan(task: str) -> str:
    """
    Ask the LLM for a step-by-step action plan that accomplishes `task`.

    The prompt requests a JSON object with two keys:
     1. "objective": the goal restated as one clear sentence.
     2. "plan": an ordered list of short step descriptions.

    Args:
        task (str): the user's goal / question in free text.

    Returns:
        str: the raw LLM response text. It is expected, but not guaranteed,
        to be JSON; parsing is left to the caller.
    """

    # NOTE: the format description at the end of the prompt must itself be
    # well-formed (balanced quotes), otherwise the model tends to copy the
    # malformed pattern into its answer and the JSON cannot be parsed.
    prompt = f"""
You are an expert project planner. Your task is to create a concise, step-by-step action plan to accomplish the user's goal.

User's Goal:
---
{task}
---

Instructions:
1. Clarify the Core Objective: Start by rephrasing the user's goal as a single, clear, and specific objective.
2. Develop a Chronological Action Plan: Break down the objective into a logical sequence of high-level steps.

Guiding Principles for the Plan:
- Tool-Agnostic: Focus on the action required, not the specific tool to perform it (e.g., use "Gather data on market trends" instead of "Search Google for market trends").
- Information First: The initial step should almost always be to gather and analyze the necessary information before taking further action.
- S.M.A.R. Steps: Each step must be Specific, Measurable, Achievable, and Relevant. The focus is on the logical sequence, not specific deadlines.
- Concise: Include only the critical steps needed to reach the objective.

Example Output Format (ALWAYS **JSON**):
{{
  "objective": "Plan and execute a one-day offsite event for a team of 10 people focused on team building and strategic planning.",
  "plan": [
    "Gather requirements including budget, potential dates, and key goals for the offsite from team leadership",
    "Research and shortlist suitable venues and activity options that fit the budget and goals",
    "Create a detailed agenda and budget proposal for approval",
    "Book the selected venue, catering, and activities upon approval",
    "Send out official invitations and manage attendee confirmations and dietary requirements",
    "Finalize all logistical details and communicate the full itinerary to the team"
  ]
}}
where
  "objective" 's value in the json is a clear, one-sentence summary of the end goal,
  "plan" 's value in the json is a list of strings **ALWAYS SEPARATED BY PYTHON NEWLINE CHARACTER** like
  [
    "A short explanation of the first logical step",
    "A short explanation of the next step that follows from the first",
    "And so on..."
  ]
"""
    task_response = llm.complete(prompt)

    return task_response.text

@dataclass
class Task:
    """A user question together with the LLM-derived objective and plan."""

    # The original question/task text as supplied by the user.
    question: str
    # One-sentence restatement of the goal.
    objective: str
    # Ordered step descriptions that lead to the objective.
    plan: list[str]


def define_task(task: str) -> "Task":
    """
    Build a `Task` (question, objective, plan) for the given user task.

    Calls `create_plan` and parses its response as JSON.

    Args:
        task (str): the user's question / goal.

    Returns:
        Task: the question with the parsed "objective" and "plan".

    Raises:
        json.JSONDecodeError: if no valid JSON object can be parsed from the response.
        KeyError: if the parsed object lacks "objective" or "plan".
    """
    task_plan_response = create_plan(task)

    # LLMs often wrap JSON in Markdown fences or add a sentence before/after it.
    # Keep only the outermost {...} span so those decorations don't break parsing.
    start = task_plan_response.find("{")
    end = task_plan_response.rfind("}")
    if start != -1 and end > start:
        candidate = task_plan_response[start:end + 1]
    else:
        candidate = task_plan_response
    result = json.loads(candidate)

    plan = result["plan"]
    if isinstance(plan, str):
        # The prompt mentions newline separation, so a model may return one
        # newline-separated string instead of a list; normalise it to a list.
        plan = [step.strip() for step in plan.splitlines() if step.strip()]

    return Task(
        question=task,
        objective=result["objective"],
        plan=plan,
    )

### Identify tools

In [None]:
@dataclass
class Capabilities:
    """Capabilities an LLM analysis deems necessary to execute a task.

    Field names (including their spelling) are the JSON keys requested from
    the model in `get_capabilities_required`, so they must stay in sync.
    """

    # Free-text explanation of what the task requires.
    analysis: str
    # Finding and downloading web sources.
    deep_web_search: bool
    # Working on a video file.
    video_processing: bool
    # Working on an audio file.
    audio_prcessing: bool
    # Visually analysing an image.
    image_processing: bool
    # Analysing structured files (CSV, JSON, Parquet, databases).
    structured_data_processing: bool
    # Analysing provided raw text / documents.
    unstructered_data_processing: bool
    # Writing/executing code or doing computations.
    code_math_writing: bool

def _capability_flag_to_bool(value) -> bool:
    """Convert an LLM-provided flag ("true"/"false" string or real bool) to bool."""
    if isinstance(value, str):
        # The prompt asks for the strings "true"/"false"; a non-empty string is
        # always truthy in Python, so it has to be interpreted explicitly.
        return value.strip().lower() in ("true", "yes", "1")
    return bool(value)


def get_capabilities_required(task: str, attachments: Optional[List[str]] = None) -> "Capabilities":
    """
    Ask the LLM which capabilities are needed to execute `task`.

    Args:
     task (str): description of the task
     attachments (list[str] | None): names of files that are related to the task

    Returns:
        Capabilities: dataclass describing what executing this task requires;
        all flags are real booleans.

    Raises:
        json.JSONDecodeError: if no JSON object can be parsed from the response.
        KeyError: if an expected key is missing from the parsed object.
    """
    attachment_info = ""
    if attachments:
        attachment_info = f"\n\nAttachments mentioned: {', '.join(attachments)}"

    classification_prompt = f"""
You are a highly intelligent routing agent. Your primary function is to analyze a user's task and determine the precise capabilities required to execute it accurately and efficiently.

Task: {task}
Attachments: {attachment_info}

**Your Task:**
Analyze the provided question and determine the most logical and efficient plan to answer it using the capabilities listed below. Your analysis must be detailed in a step-by-step plan and then summarized in a series of boolean flags.

**Capabilities:**
- `deep_web_search`: Executing iterative search queries to find websources that answer the query the best, i.e., find, evaluate and download web content. This steps DOES NOT INCLUDE synthesize/understanding information from the web resources. Use this for questions requiring up-to-date or niche knowledge.
- `video_processing`: Processing a video file to analyze its content, typically by extracting frames for image analysis or audio for transcription/analysis.
- `audio_processing`: Processing an audio file to transcribe speech, identify sounds, or analyze acoustic properties.
- `image_processing`: Visually analyzing an image to identify objects, read text, or understand its content.
- `structured_data_processing`: Analyzing, querying, or visualizing data from structured files like Parquet, CSV, JSON, or databases.
- `unstructured_data_processing`: Performing detailed analysis on a provided block of raw text or multiple (retrieved) documents (e.g., summarization, sentiment analysis, entity extraction, processing multiple pieces of text). This is for analyzing *provided* text.
- `code_math_writing`: Generating or executing code, solving mathematical problems, or performing complex computations.

**Instructions:**
1.  **Analyze and Plan:** First, create a clear, direct and concise description of AI capabilities needed to answer this question, put the answer in the `analysis` field.
2.  **Select Minimum Capabilities:** Based on your description/plan, set the corresponding boolean flags to `true`. Only activate the capabilities that are *absolutely necessary* for your plan. For example, a simple fact-lookup might not require `unstructured_data_processing` on top of `deep_web_search`.
3.  **Ensure Consistency:** The capabilities mentioned in your `analysis` text MUST perfectly match the boolean flags set to `true`.
4.  **Respond in JSON:** Your entire output must be in the exact JSON format specified below.

**Better Examples:**

Task: "What is the boiling point of water at sea level?"
Analysis:
{{
    "analysis": "This is a direct factual query. It requires a single deep web search to look up a well-known scientific constant.",
    "deep_web_search": "true",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "false"
}}

Question: "Summarize the attached meeting notes for me." (with a .txt file attached)
Analysis:
{{
    "analysis": "The user has provided a text document and wants a summary. This requires unstructured data processing to read the attached text and generate a concise summary of its key points.",
    "deep_web_search": "false",
    "video_processing": "false",
    "audio_prcessing": "false",
    "image_processing": "false",
    "structured_data_processing": "false",
    "unstructered_data_processing": "true",
    "code_math_writing": "false"
}}

Question: "Count the number of cars in this video and plot their positions on a heatmap." (with a video file attached)
Analysis:
{{
    "analysis": "This is a multi-step task. First, it requires video processing to extract frames from the attached video file. Second, it needs image processing to be run on those frames to detect and count objects identified as 'cars' and log their coordinates. Finally, it requires code and mathematical computations to aggregate these coordinates and generate a heatmap visualization.",
    "deep_web_search": "false",
    "video_processing": "true",
    "audio_prcessing": "false",
    "image_processing": "true",
    "structured_data_processing": "false",
    "unstructered_data_processing": "false",
    "code_math_writing": "true"
}}

------------------
**Begin Analysis**

Question: "{task}"
Attachments: "{attachment_info}"

Respond in this exact JSON format:
{{
    "analysis": "Breakdown of what this question requires, including all necessary capabilities and processes",
    "deep_web_search": "true"/"false",
    "video_processing": "true"/"false",
    "audio_prcessing": "true"/"false",
    "image_processing": "true"/"false",
    "structured_data_processing": "true"/"false",
    "unstructered_data_processing": "true"/"false",
    "code_math_writing": "true"/"false"
}}
"""
    response = llm.complete(classification_prompt)

    response_text = response.text.strip()
    # Tolerate Markdown fences / extra prose around the JSON object.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start != -1 and end > start:
        response_text = response_text[start:end + 1]
    result = json.loads(response_text)

    def flag(*keys: str) -> bool:
        # The capability list in the prompt spells two names differently from
        # the JSON keys, so accept either spelling; the first key is the
        # canonical one and is the one reported when nothing matches.
        for key in keys:
            if key in result:
                return _capability_flag_to_bool(result[key])
        raise KeyError(keys[0])

    return Capabilities(
        analysis=result["analysis"],
        deep_web_search=flag("deep_web_search"),
        video_processing=flag("video_processing"),
        audio_prcessing=flag("audio_prcessing", "audio_processing"),
        image_processing=flag("image_processing"),
        structured_data_processing=flag("structured_data_processing"),
        unstructered_data_processing=flag("unstructered_data_processing", "unstructured_data_processing"),
        code_math_writing=flag("code_math_writing"),
    )

### Web Search 

Web search is defined here as an information retrieval task.

It deals with finding relevant resources on the web, downloading them and processing them into a usable format. 

In [None]:
from ddgs import DDGS
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import urllib.parse
from urllib.parse import urlparse
from bs4 import BeautifulSoup


def parse_into_list(text: str, separator: str = "|") -> list[str]:
    """
    Split a delimiter-separated string into a list of clean items.

    Surrounding whitespace and enclosing square brackets are removed from the
    whole string, then each item is stripped and empty items are dropped.

    Args:
        text (str): e.g. "first query | second query" or "[a|b]".
        separator (str): the delimiter between items.

    Returns:
        list[str]: the non-empty, whitespace-stripped items (possibly empty).
    """
    raw_items = text.strip().strip("[]").split(separator)
    # Without per-item stripping, "a | b" yields "a " and " b"; without the
    # filter, empty input yields [""], which callers would treat as a query.
    return [item.strip() for item in raw_items if item.strip()]


def question_to_query(question: str) -> list[str]:
    """
    Turn a natural-language question into short search-engine queries.

    The LLM is asked for at most two queries separated by "|"; the response is
    split with `parse_into_list`.

    Args:
        question (str): the question to search for.

    Returns:
        list[str]: non-empty queries without surrounding whitespace.
    """
    prompt = f"""
    Create a list of general search engine queries for the following question: {question}.

    Make sure that:
    - Your output is a list separated by "|" sign and nothing else
    - Give MAXIMUM of TWO (2) search engine queries
    - Each query should be SHORT and precise

    Example Output:
    Large urban population areas in Europe | Biggest cities in Europe
    Short History of Prussia | Origins and the story of East Germany
    """

    print("question_to_query prompt: ", prompt)
    # Send this to an llm
    query_responses = llm.complete(prompt)

    # Parse the response; strip here as well so padded or empty fragments are
    # never handed to the search engine as queries.
    candidates = parse_into_list(query_responses.text)
    return [query.strip() for query in candidates if query.strip()]

def duckduckgo_search(query: str, max_results: int = 2) -> Union[List[Dict[str, Any]], str]:
    """
    Search the web for information using DuckDuckGo search.

    Args:
        query (str): the search-engine query.
        max_results (int): maximum number of hits requested from DuckDuckGo.

    Returns:
        list[dict] on success, one dict per hit in the form
            {"search_order": i, "web_page_title": title, "web_page_summary": summary, WEB_SOURCE_URL_KEY: url}
        str when nothing was found or the search failed (a diagnostic message).
        Callers must check the type before iterating.
    """
    try:
        with DDGS() as ddgs:
            results = list(ddgs.text(query, max_results=max_results))

            if not results:
                return f"No search results found for '{query}'"

            # Normalise each raw hit into the resource-metadata format above;
            # missing fields fall back to placeholder strings.
            return [
                {
                    "search_order": i,
                    "web_page_title": result.get('title', 'No title'),
                    "web_page_summary": result.get('body', 'No description'),
                    WEB_SOURCE_URL_KEY: result.get(WEB_SOURCE_URL_KEY, 'No URL'),
                }
                for i, result in enumerate(results)
            ]

    except Exception as e:
        # Best effort: report the failure as a message instead of raising.
        return f"Error performing web search: {e}"

def drop_non_unique_dicts(lst:list, unique_key:str=WEB_SOURCE_URL_KEY):
    """Return the dicts of `lst` with duplicates (by `unique_key`) removed.

    The first dict seen for each key value is kept and input order is
    preserved. Dicts lacking `unique_key` all share the value None, so only
    the first of them survives.
    """
    already_seen = set()
    unique_items = []

    for entry in lst:
        key_value = entry.get(unique_key)
        if key_value in already_seen:
            continue
        unique_items.append(entry)
        already_seen.add(key_value)

    return unique_items

def web_search_question(question: str, web_search_links_per_query: int=1) -> list[dict]:
    """
    Find candidate web sources for a question.

    The question is converted into search queries, each query is sent to
    DuckDuckGo, and the combined hits are de-duplicated by URL.

    Args:
        question (str): the question to research.
        web_search_links_per_query (int): maximum hits requested per query.

    Returns:
        list[dict]: unique search-result dicts (possibly empty).
    """
    # Disambiguate the question into a query
    candidate_queries = question_to_query(question)

    print("Candidate queries for the question are: ", candidate_queries)

    # Search for relevant sources
    sources = list()
    for query in candidate_queries:
        sources_query = duckduckgo_search(query, web_search_links_per_query)
        if not isinstance(sources_query, list):
            # duckduckgo_search returns a message string when it finds nothing
            # or fails; extending with a string would add single characters
            # and crash later in drop_non_unique_dicts.
            print(f"Skipping query '{query}': {sources_query}")
            continue
        sources.extend(sources_query)

    return drop_non_unique_dicts(sources)


def extract_text_with_links(raw_html: str, base_url: str) -> str:
    """
    Extract clean text from raw HTML, converting hyperlinks to Markdown format.

    Args:
        raw_html (str): Raw HTML content
        base_url (str): Base URL to resolve relative links (e.g., Wikipedia base)

    Returns:
        str: Extracted text with hyperlinks in Markdown format: [text](url).
        An empty string is returned if extraction fails.
    """
    try:
        soup = BeautifulSoup(raw_html, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
            element.decompose()

        # Convert all <a> tags to Markdown-style links
        for a_tag in soup.find_all('a', href=True):
            link_text = a_tag.get_text(separator=" ", strip=True)
            if not link_text:  # Skip if link has no text
                a_tag.replace_with("")
                continue

            # urljoin leaves absolute URLs untouched and resolves every kind
            # of relative reference ("/x", "x.html", "../x", "//host/x",
            # "#frag"), not only those that start with "/".
            full_url = urllib.parse.urljoin(base_url, a_tag['href'])

            # Replace the <a> tag with Markdown link
            a_tag.replace_with(f"[{link_text}]({full_url})")

        # Extract all text (now with Markdown links)
        text = soup.get_text(separator=" ")

        # Clean up whitespace: remove extra spaces and newlines
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        cleaned_text = ' '.join(chunk for chunk in chunks if chunk)

        return cleaned_text

    except Exception as e:
        # Best effort: a page that cannot be parsed yields no text.
        print(f"Error extracting text from HTML: {e}")
        return ""

def extract_clean_text(raw_html: str) -> str:
    """
    Reduce raw HTML to a single line of plain text.

    Script, style and page-chrome elements are dropped, the remaining text is
    extracted, and whitespace is normalised. Link targets are not kept.

    Args:
        raw_html (str): Raw HTML content

    Returns:
        str: Plain text, or "" if extraction fails.
    """
    try:
        document = BeautifulSoup(raw_html, 'html.parser')

        # Drop elements that carry no readable page content.
        for unwanted in document(["script", "style", "nav", "footer", "header", "aside"]):
            unwanted.decompose()

        visible_text = document.get_text(separator=" ")

        # Normalise whitespace: trim every line, break it on double spaces,
        # and keep the non-empty fragments.
        fragments = []
        for raw_line in visible_text.splitlines():
            for piece in raw_line.strip().split("  "):
                piece = piece.strip()
                if piece:
                    fragments.append(piece)

        return ' '.join(fragments)

    except Exception as e:
        print(f"Error extracting text from HTML: {e}")
        return ""

def get_base_url(url: str) -> str:
    """
    Return the "scheme://netloc" part of a URL.

    Example:
        Input:  https://en.wikipedia.org/wiki/Prussia
        Output: https://en.wikipedia.org
    """
    # Only the first two components of the parse result are needed.
    scheme, netloc, *_ = urlparse(url)
    return f"{scheme}://{netloc}"

def extract_text_from_wiki(raw_text: str) -> str:
    """
    Strip wiki/HTML markup from `raw_text` and return the remaining plain text.

    Args:
        raw_text (str): text that may contain wikitext or HTML markup.

    Returns:
        str: the text with templates, tags and link syntax removed.
    """
    # Imported lazily so the dependency is only needed when this is used.
    import mwparserfromhell
    wikicode = mwparserfromhell.parse(raw_text)
    # str(wikicode) reproduces the input unchanged (markup included);
    # strip_code() is the call that removes the markup.
    return wikicode.strip_code()

def download_text_content(url: str) -> Optional[str]:
    """
    Download a web page and return its plain-text content.

    Args:
        url (str): address of the page; only http and https are fetched.

    Returns:
        Optional[str]: extracted text, or None when the URL is not http(s) or
        the download/extraction failed (the reason is printed).
    """
    text_content = None

    # URLs come from search results and are untrusted: urlopen would also
    # accept e.g. file:// or ftp://, so restrict fetching to web schemes.
    if urlparse(url).scheme not in ("http", "https"):
        print(f"❌ Unsupported URL scheme for {url}")
        return text_content

    try:
        print(f"⬇️ Starting to download: {url}")
        with urllib.request.urlopen(url, timeout=30) as response:
            html_content = response.read()
            # TODO: determine how to better choose between these options:
            # - extract_text_from_wiki (LEAVES HTML TAGS)
            # - extract_clean_text(html_content)
            # - extract_text_with_links(html_content, base_url=get_base_url(url))
            text_content = extract_clean_text(html_content)
    # ToDo: create custom errors
    except urllib.error.URLError as e:
        print(f"❌ URL Error for {url}: {e}")
    except Exception as e:
        print(f"❌ Unexpected error for {url}: {e}")

    return text_content


def preprocess_text_w_llm(raw_text: str) -> str:
    """
    Ask the Ollama model to rewrite raw text as structured Markdown.

    The prompt instructs the model to add headers, lists and emphasis, to turn
    tables into annotated JSON blocks and to annotate equations.

    Args:
        raw_text (str): Raw text content to be preprocessed

    Returns:
        str: The model's response with surrounding whitespace removed
    """
    markdown_prompt = f"""
You are an expert text formatter and technical writer. Your task is to convert the provided raw text into clean, well-structured Markdown format.

**Instructions:**
1. **General Formatting**: Convert the text to proper Markdown format with appropriate headers, lists, emphasis, and structure
2. **Tables**: If you find any tabular data or information that looks like a table:
    - Convert it to JSON format in a code block
    - Add a descriptive annotation before the JSON explaining what the table contains
    - Use this format:
    ```
    **Table Description: [Brief description of what this data represents]**
    ```json
    {{
        "data": [your JSON structure here]
    }}
    ```
3. **Equations**: If you find mathematical equations or formulas:
    - Format them properly using Markdown/LaTeX syntax when possible
    - Add annotations explaining what each equation represents
    - Use this format:
    ```
    **Equation: [Brief description of what this equation represents]**
    $$equation here$$
    ```
4. **Lists**: Convert any list-like content to proper Markdown lists
5. **Headers**: Create appropriate heading hierarchy using # ## ### etc.
6. **Emphasis**: Use **bold** and *italic* appropriately for important terms
7. **Code**: Wrap any code snippets in appropriate code blocks with language specification
8. **Links**: Preserve and properly format any URLs or references

**Quality Standards:**
- Maintain all original information and data
- Ensure the output is readable and well-organized
- Use consistent formatting throughout
- Remove any formatting artifacts or noise from the original text
- Preserve the logical structure and flow of information

**Raw Text to Process:**
{raw_text}

**Output the cleaned Markdown version:**
"""

    print("Processing text with Ollama for Markdown formatting...")
    llm_output = llm.complete(markdown_prompt)
    return llm_output.text.strip()

In [None]:
# TODO: Specify the format for retrieving information from the web  = THE RESOURCE FORMAT as  
# {
#     "text": text string,
#     "url": web link
#     "metadata": some info
# }

In [None]:
def web_search(question: str, web_search_links_per_query: int = 1) -> Union[List[Dict[str, Any]], str]:
    """
    Search the web for a question and download the text of every hit.

    Args:
        question (str): the question to research.
        web_search_links_per_query (int): maximum hits requested per query.

    Returns:
        list[dict]: one resource per search result in the form
            {"text": page text or None, WEB_SOURCE_URL_KEY: url, "metadata": search result dict}
        str: a message when the search produced no results.
        "text" is None when the page could not be downloaded.
    """
    search_results = web_search_question(question, web_search_links_per_query)

    if not search_results:
        return "No search results found for the question."

    print(f"Found search results:\n{search_results} ")
    resources = list()
    for search_result in search_results:
        source_url = search_result[WEB_SOURCE_URL_KEY]
        page_content = download_text_content(source_url)

        resources.append({
            "text": page_content,
            WEB_SOURCE_URL_KEY: source_url,
            # Todo: clean up metadata
            "metadata": search_result,
        })

    return resources

### Unstructured data processing

In [None]:
def contruct_final_answer(task:str, context:str) -> str:
    """
    Ask the LLM to derive a final answer for `task` from collected answers.

    Args:
        task (str): the task/question to answer.
        context (str): the intermediate answers the final answer must be based on.

    Returns:
        str: the raw LLM response; the prompt requests a JSON object with the
        keys "answer" and "clarification", but the text is not parsed here.
    """
    # NOTE: the JSON template must be valid JSON apart from its placeholders
    # (no trailing comma), otherwise the model tends to reproduce the error.
    prompt = f"""

    You are presented with a list of expert answers from different sources that you need to summarise.

    LIST:
    {context}

    Based **ONLY** on that list and without any additional assumptions from your side, perform the task specified.

    TASK:
    {task}

    Your answer should be in json format like so:
    {{
        "answer": <a single number, word or a phrase which is the answer to the question>,
        "clarification": <very short mention of what the answer is based on>
    }}

    Rules:
        - If the text contains the complete answer → put the exact answer in "answer".
        - If the text contains no relevant information → put "answer": "not found".
        - If the text contains some but not all information → put "answer": "not found".
        - The "clarification" must mention the relevant part of the text and explain briefly.

    Examples:
    Q: "Who won the 2022 FIFA World Cup?"
    {{
    "answer": "not found",
    "clarification": "The text mentions the location of the tournament but not the winner."
    }}
    Q: "How many colours there is in the rainbow"
    {{
    "answer": "12",
    "clarification": "Red,Orange,Yellow,Chartreuse green,Green,Blue-green,Cyan,Azure,Violet,Purple,Magenta,Red"
    }}
    Q:"What's the name of Russian Santa?"
    {{
    "answer": "Ded Moroz",
    "clarification": "East Slavic Father Frost"
    }}
    """
    answer = llm.complete(prompt)

    return answer.text

def task_with_text_llm(task: str, text:str) -> str:
    """Run `task` against `text` with the LLM and return the response text.

    The prompt restricts the model to the supplied context and asks for
    "NOT FOUND" when the context lacks the specific information.
    """
    context_prompt = f"""
    Perform the instruction/task in the user's question. 
    Use only the information provided in the context. 
    
    TASK 
    {task}

    CONTEXT
    {text}

    **IMPORTANT** If the text does not include the SPECIFIC information about the task, output "NOT FOUND"
    Output
    """

    return llm.complete(context_prompt).text

def text_process_llm(task: str, text:str):
    """Run `task` on every chunk of `text` and collect the LLM answers.

    The text is cut into pieces of CHUNK_SIZE_TEXT characters with an overlap
    of 500; the returned list holds one response per piece, in order.
    """
    splitter = CharacterTextSplitter(
        chunk_size=CHUNK_SIZE_TEXT,
        chunk_overlap=500,
        separator=""
    )
    return [task_with_text_llm(task, piece) for piece in splitter.split_text(text)]

In [None]:
# question = "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia."

# task = define_task(question)
# pprint(task.__str__())
# web_resources = web_search(question)
# llm_response = text_process_llm(task.plan[1] + task.plan[2], web_resources[0]["text"])
# contruct_final_answer(task.plan[-1], context=llm_response[1])

In [None]:
# question = "Who did the actor who played Ray in the Polish-language version of Everybody Loves Raymond play in Magda M."

# task = define_task(question)
# pprint(task.__str__())


In [None]:
# question = '.rewsna eht sa "tfel" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI'
# task = define_task(question)
# pprint(task.__str__())

In [None]:
# question = "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?"
# task = define_task(question)
# pprint(task.__str__())

In [None]:
# question = "How many at bats did the Yankee with the most walks in the 1977 regular season have that same season?"
# task = define_task(question)
# pprint(task.__str__())

In [None]:
# question = "What is the surname of the equine veterinarian mentioned in 1.E Exercises from the chemistry materials licensed by Marisa Alviar-Agnew & Henry Agnew under the CK-12 license in LibreText's Introductory Chemistry materials as compiled 08/21/2023?"
# task = define_task(question)
# pprint(task.__str__())

In [None]:
# Try the planning step on a sample question that references a video.
question = "In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?"
task = define_task(question)
pprint(str(task))

#### Create RAG 

In [None]:
def embed_and_store_document(text: str, url: str, metadata: Optional[dict] = None) -> None:
    """
    Embed document text and store it in Qdrant vector store.

    The text is split into chunks of CHUNK_SIZE_RAG characters (overlap 500);
    each chunk is embedded together with the URL and stored as one point
    whose payload carries the original chunk text and its position.

    Args:
        text (str): The text content to embed and store
        url (str): Source URL of the document
        metadata (dict | None): Additional metadata merged into every chunk payload

    Returns:
        None. Each stored point gets a random UUID; no document-level ID exists.
    """
    # Split text into chunks for better retrieval
    text_splitter = CharacterTextSplitter(
        chunk_size=CHUNK_SIZE_RAG,
        chunk_overlap=500,
        separator=""
    )
    chunks = text_splitter.split_text(text)

    points = []
    for i, chunk in enumerate(chunks):
        # Preprocess chunk
        cleaned_chunk = extract_text_from_wiki(chunk) #preprocess_text_w_llm(chunk)

        # Generate embedding for the chunk
        # TODO: trying out to embed also meta info
        to_embed = cleaned_chunk + url
        embedding = embedding_model.encode(to_embed).tolist()

        # Prepare metadata
        chunk_metadata = {
            WEB_SOURCE_URL_KEY: url,
            "chunk_index": i,
            "text": chunk,
            "total_chunks": len(chunks)
        }
        if metadata:
            chunk_metadata.update(metadata)

        points.append(
            PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload=chunk_metadata
            )
        )

    # Store all chunks with a single upsert instead of one call per chunk;
    # skip the call entirely when the text produced no chunks.
    if points:
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
    print(f"Stored {len(chunks)} chunks from {url}")


def search_vector_store(query: str, limit: int = 2) -> List[Dict]:
    """
    Search the vector store for relevant documents.

    Every returned resource is also appended to the debug file
    "text-dump-rag.txt" in the current working directory.

    Args:
        query (str): Search query
        limit (int): Maximum number of results to return

    Returns:
        List[Dict]: resources in the form
            {"text": chunk text, WEB_SOURCE_URL_KEY: url, "metadata": full payload}
    """
    # Embed the query
    query_embedding = embedding_model.encode(query).tolist()

    # Search in Qdrant
    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=limit,
        with_payload=True
    )

    # Format results using the `resource` format
    results = [
        {
            "text": result.payload.get("text", ""),
            WEB_SOURCE_URL_KEY: result.payload.get(WEB_SOURCE_URL_KEY, ""),
            "metadata": result.payload
        }
        for result in search_results
    ]

    if results:
        # Open the dump once (not once per hit) and force UTF-8, since web
        # text regularly contains characters the platform default cannot encode.
        with open("text-dump-rag.txt", "a", encoding="utf-8") as file:
            for resource in results:
                file.write(f"{resource}\n")

    return results

def answer_from_rag(question: str, context_limit: int = 3) -> str:
    """
    Answer a question using RAG approach - search vector store and generate answer.

    Args:
        question (str): The question to answer
        context_limit (int): Maximum number of context chunks to use

    Returns:
        str: The LLM answers for the retrieved context (one per processed text
        chunk, separated by blank lines), or a message when nothing relevant
        was found in the vector store.
    """
    # Search for relevant documents
    relevant_docs = search_vector_store(question, limit=context_limit)

    if not relevant_docs:
        return "No relevant information found in the vector store."

    # Combine context from relevant documents
    combined_context = "\n\n---\n\n".join(
        f"Source: {doc[WEB_SOURCE_URL_KEY]}\nContent: {doc['text']}"
        for doc in relevant_docs
    )

    # Call to LLM to perform "process unstructured data" call.
    # text_process_llm returns a list of response strings (one per chunk), not
    # an object with a `.text` attribute, so join them into the final answer.
    responses = text_process_llm(question, combined_context)

    return "\n\n".join(responses)
        
def create_rag_system_from_web_resources(web_resources: list[dict], web_search_links_per_query: int = 1):
    """
    Embed the text of every downloaded web resource into the vector store.

    Args:
        web_resources (list[dict]): resources with the keys "text",
            WEB_SOURCE_URL_KEY and "metadata".
        web_search_links_per_query (int): not used by this function.

    Returns:
        None. Resources without text are reported and skipped.
    """
    for position, resource in enumerate(web_resources, start=1):
        source_url = resource[WEB_SOURCE_URL_KEY]
        print(f"Processing result {position}: {source_url}")

        page_text = resource["text"]
        if not page_text:
            print(f"Could not retrieve any content from {resource}")
            continue

        embed_and_store_document(page_text, source_url, resource["metadata"])

### Image analysis with Llava 

LLaVA usually does not give accurate results for specific image queries (e.g., counting objects or reading the text in an image). It can, however, describe in very general terms what an image shows.

**IT MAKES A LOT OF MISTAKES**

In [None]:
# Image Analyzer Tool with LLaVA Integration
from PIL import Image
import base64
import io
from typing import Literal

def image_analyzer_llava(image_path: str, task: str = "describe") -> str:
    """
    Analyze an image with a local LLaVA instance and return a text report.

    Args:
        image_path (str): Path of the image file to analyze.
        task (str): "describe" asks for a detailed description, "text" asks
            to extract visible text. Any other value is forwarded to the
            model as a free-form task description. Defaults to "describe".

    Returns:
        str: Basic image information (name, size, mode, format, file size)
            followed by the LLaVA answer, or an error message when the file
            does not exist or anything fails during the analysis.
    """
    try:
        # Check if file exists
        if not os.path.exists(image_path):
            return f"Image file not found: {image_path}"

        file_size = os.path.getsize(image_path)
        file_name = os.path.basename(image_path)

        # Open the image in a context manager so the file handle is released
        # as soon as the metadata and the PNG copy have been produced.
        with Image.open(image_path) as image:
            width, height = image.size
            mode = image.mode
            format_type = image.format

            # Convert image to base64 for API transmission
            buffered = io.BytesIO()
            image.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()

        # Create task-specific prompts
        if task == "describe":
            prompt = "Describe what you see in this image in detail."
        elif task == "text":
            prompt = "Extract and read any text visible in this image."
        else:
            prompt = f"Analyze this image for the following task: {task}"

        llava_response = send_to_llava(img_base64, prompt)

        basic_info = f"Image: {file_name}\nSize: {width}x{height}\nMode: {mode}\nFormat: {format_type}\nFile size: {file_size} bytes"

        return f"{basic_info}\n\nLLaVA Analysis:\n{llava_response}"

    except Exception as e:
        # Tool boundary: report any failure as text instead of raising.
        return f"Error analyzing image '{image_path}': {e}"

def send_to_llava(
    image_base64: str,
    prompt: str,
    model: str = "llava",
    endpoint: str = "http://localhost:11434/api/generate",
    timeout: float = 60,
) -> str:
    """
    Send a base64-encoded image and a prompt to a LLaVA HTTP endpoint.

    Args:
        image_base64 (str): Image content encoded as a base64 string.
        prompt (str): Instruction for the model.
        model (str): Model name placed in the request payload.
        endpoint (str): URL that receives the POST request. The default is a
            local server on port 11434 (presumably Ollama's generate API -
            confirm against your deployment).
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        str: The "response" field of the JSON reply, a "LLaVA API error"
            message for non-200 status codes, or an "Error communicating
            with LLaVA" message if the request itself fails.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "images": [image_base64],
        "stream": False,
    }

    try:
        # `requests` is imported at the top of the file.
        response = requests.post(endpoint, json=payload, timeout=timeout)

        if response.status_code != 200:
            return f"LLaVA API error: {response.status_code}"

        return response.json().get("response", "No response from LLaVA")

    except Exception as e:
        # Tool boundary: surface the failure as text rather than raising.
        return f"Error communicating with LLaVA: {e}"
    
# image_analyzer_llava(image_path="/Users/maksim.rostov/Desktop/vllama2.png", task="describe") 

### Structured (tabular) data processing

In [None]:
# File Processor Tool for tables
import pandas as pd
import csv

def _detect_csv_delimiter(file_path: str, default: str = ",") -> str:
    """Guess a CSV delimiter from the first 1024 characters, falling back to `default`."""
    with open(file_path, 'r', encoding='utf-8') as f:
        sample = f.read(1024)
    try:
        return csv.Sniffer().sniff(sample).delimiter
    except csv.Error:
        # The sniffer cannot decide for e.g. single-column files; let pandas
        # try with the default delimiter instead of giving up on the file.
        return default


def peak_into_table(file_path: str) -> str:
    """
    Load a tabular file, options:
     - Excel files (.xlsx, .xls)
     - CSV files (delimiter is detected automatically)
     - Parquet files
    into memory and describe its contents as text.

    Args:
        file_path (str): Path of the file to inspect. The type is chosen
            from the file extension (case-insensitive).

    Returns:
        str: Shape, column names, first five rows and `df.info()` output of
            the loaded table; or a "File not found" / "Failed to look into
            data" message when the file is missing, has an unknown
            extension, or cannot be loaded.
    """
    from io import StringIO

    df = None  # stays None unless a loader succeeds
    err = None
    try:

        if not os.path.exists(file_path):
            return f"File not found: {file_path}"

        lowered_path = file_path.lower()
        if lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)

        elif lowered_path.endswith('.csv'):
            delimiter = _detect_csv_delimiter(file_path)
            df = pd.read_csv(file_path, delimiter=delimiter)

        elif lowered_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            err = "ERROR: unrecognised file extension."
            print(err)

    except Exception as e:
        err = f"ERROR: tried but could not load the file: {e}"
        print(err)

    if df is None:
        return f"Failed to look into data: {err}"

    # A table that loaded but has zero rows is still described (its columns
    # are useful context) instead of being reported as a failure.
    shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
    columns_info = f"Columns: {list(df.columns)}"
    sample_data = f"First 5 rows:\n{df.head().to_string()}"

    # df.info() prints to a stream; capture it in a string buffer.
    str_buffer = StringIO()
    df.info(buf=str_buffer)
    info_df = "df.info = " + str_buffer.getvalue()

    return shape_info+"\n"+columns_info+"\n"+sample_data+"\n"+info_df

# print(peak_into_table("7bd855d8-463d-4ed5-93ca-5fe35145f733.xls"))

In [None]:
def generate_python_for_table_analysis(task:str, context: str) -> str:
    """
    Generate python code with the llm to analyse tabular data.

    The prompt asks for a script that takes a `--filepath` command-line
    argument pointing at a CSV file, loads it in `main(filepath)` and prints
    the result of `main`.

    Args:
        task (str): Description of the analysis to perform.
        context (str): Information about the data (e.g. shape, columns,
            sample rows) that is inserted into the prompt.

    Returns:
        str: The raw text of the model reply. The prompt requests the code
            inside a ```python fence, so callers are expected to strip the
            fence before executing it.
    """
    # NOTE: the examples inside the prompt are what the model imitates, so
    # they must be valid, runnable code (correct imports and pandas API).
    prompt = f"""
    Perform the following task: {task}  
    
    Based on the following context
    CONTEXT 
    {context}

    You should output python code that would process and analyse data that is presented in the context to accomplish the task.
    The code will have a single main function that would only have one argument which is the filename of a csv that that needs to be loaded.  
    Output your results in the following format: 
    ```python
    < imports necesary >
    import pandas as pd 
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--filepath")

    args = parser.parse_args()

    def main(filepath):
        df = pd.read_csv(filepath)
        < code here >
        return result 

    print(main(args.filepath))
    ```
    and should include all the necessary imports and finally the call to the function. 
    
    Examples: 

    ```python
        import base64
        import matplotlib.pyplot as plt
        from io import BytesIO
        import pandas as pd 
        
        import argparse

        parser = argparse.ArgumentParser()

        parser.add_argument("--filepath")

        args = parser.parse_args()

        def main(filepath):
            '''Plots a line plot of the "apples" column and returns the image as base64-encoded PNG''' 
            df = pd.read_csv(filepath) 
            fig, ax = plt.subplots()
            column = "apples"
            ax = df[column].plot(ax=ax)

            buffer = BytesIO()
            plt.savefig(buffer, format="png")
            buffer.seek(0)
            img_png = buffer.getvalue()
            img_png_encoded = base64.b64encode(img_png)

            return img_png_encoded

        print(main(args.filepath))
    ```

    ```python
        import pandas as pd 
        import argparse

        parser = argparse.ArgumentParser()

        parser.add_argument("--filepath")

        args = parser.parse_args()        
        def main(filepath):
            '''Get average growth difference between April 2014 and 2013''' 
            df = pd.read_csv(filepath) 
            df_agg = df.groupby(["year", "month"]).mean(numeric_only=True)
            diff_yield = df_agg.loc[(2014, "April")] - df_agg.loc[(2013, "April")]
            return diff_yield 

        print(main(args.filepath))
    ```
    """

    code_response =  llm.complete(prompt)

    return code_response.text

def execute_code_on_dataframe(df: pd.DataFrame, code: str) -> tuple[str, str]:
    """
    Run generated python code against a dataframe in a separate process.

    The dataframe is saved as a CSV file and the code is saved as a script;
    the script is then executed as `<python> script --filepath <csv>`.
    Both files live in a temporary directory that is removed afterwards.

    Args:
        df (pd.DataFrame): Table made available to the script as a CSV file.
        code (str): Python source that accepts a `--filepath` argument.

    Returns:
        tuple[str, str]: Captured stdout and stderr of the script.
    """
    import os
    import subprocess
    import sys
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        data_path = os.path.join(work_dir, "table_data.csv")
        script_path = os.path.join(work_dir, "generated_script.py")

        # An auto-generated RangeIndex is not part of the data; writing it
        # would add an unnamed numeric column that the generated code (which
        # was written from the column list) does not expect.
        keep_index = not isinstance(df.index, pd.RangeIndex)
        df.to_csv(data_path, index=keep_index)

        # Write and close the script before running it: a still-open
        # NamedTemporaryFile cannot be reopened by another process on Windows.
        with open(script_path, "wt", encoding="utf-8") as f:
            f.write(code)

        # Use the current interpreter so the script sees the same installed
        # packages (e.g. pandas) as this process.
        call_result = subprocess.run(
            [sys.executable or "python3", script_path, "--filepath", data_path],
            capture_output=True,
            text=True,
        )

    return call_result.stdout, call_result.stderr


############
# EXAMPLE
############

# # The attached Excel file contains the sales of menu items for a local fast-food chain.
# # What were the total sales that the chain made from food (not including drinks)? Express your answer in USD with two decimal places.

# df = tabular_file_to_pandas("./7bd855d8-463d-4ed5-93ca-5fe35145f733.xls")

# # TODO: maybe makes sense to put these into preprocessing step so that we get some info about the data right away 
# shape_info = f"Shape: {df.shape[0]} rows, {df.shape[1]} columns"
# columns_info = f"Columns: {list(df.columns)}"
# sample_data = f"First 5 rows:\n{df.head().to_string()}"
# print(df.info())

# from io import StringIO

# str_buffer = StringIO()
# df.info(buf=str_buffer)
# info_df = str_buffer.getvalue()

# code = generate_python_for_table_analysis(
#     "Sum all the columns that are food (ONLY EXCLUDE THE DRINKS, DESERTS IS FOOD)", 
#     context=shape_info+"\n"+columns_info+"\n"+info_df
# )
# print(code)

# code_cleaned = code.strip("`").strip("'").strip("python")
# exec_stdout, exec_stderr = execute_code_on_dataframe(df, code_cleaned)
# print(exec_stdout)
# print(exec_stderr)

# df[["Burgers","Hot Dogs","Salads","Fries","Ice Cream"]].sum().sum()

In [None]:
def _strip_markdown_fences(text: str) -> str:
    """Return the body of the first ``` fenced block in `text`, or `text` unchanged if there is none."""
    import re

    match = re.search(r"```[A-Za-z0-9_+-]*[ \t]*\r?\n(.*?)```", text, flags=re.DOTALL)
    return match.group(1) if match else text


def write_python_code(task:str, context: str) -> str:
    """
    Generate python code for an arbitrary task and use context for the information needed to accomplish it.
    Usually the context is the details where to take inputs/data for the task.

    Args:
        task (str): Description of what the generated script should do.
        context (str): Extra information inserted into the prompt, such as
            file names and a preview of the data.

    Returns:
        str: The generated python source. If the model wrapped its answer in
            a markdown code fence despite the instructions, only the fenced
            code is returned so the result can be executed directly.
    """

    # NOTE: the examples inside the prompt are what the model imitates, so
    # they must be valid, runnable code.
    prompt = f"""
    Perform the following task: {task}  
    
    Based on the following context:
    {context}

    You should output python code that would perform the task while using any neccessary information specified in the context.
    Often context will include names of files/resources/name that you should use.

    The code will have a single main function and will call it at the end of the script. Take the names of any files that need to be loaded from the context.  
    Output the code and nothing else. You should include all the necessary imports and the code should only perform steps needed to achieve the task. 
    
    Example 1 (produce and save a line plot for 'apples' column of the table in 'table_data.csv'):
    Output: 
    import matplotlib.pyplot as plt
    import pandas as pd 
    def main(inputfilepath, outputfile):
        '''Plots a line plot of the "apples" column and saves the image''' 
        df = pd.read_csv(inputfilepath) 
        fig, ax = plt.subplots()
        column = "apples"
        ax = df[column].plot(ax=ax)
        plt.savefig(outputfile, format="png")
    inputfilepath = 'table_data.csv'
    outputfile = 'line_plot_apples.png'
    main(inputfilepath, outputfile)

    Example 2 (calculate the difference in yield btw april's of 2014 and 2013 data in 'table_data.csv'):
    Output: 
    import pandas as pd 
    def main(inputfilepath, outputfile):
        '''Get average growth difference between April 2014 and 2013''' 
        df = pd.read_csv(inputfilepath) 
        df_agg = df.groupby(["year", "month"]).mean(numeric_only=True)
        diff_yield = df_agg.loc[(2014, "April")] - df_agg.loc[(2013, "April")]
        with open(outputfile, 'wt') as f:
             f.write(diff_yield.to_string()) 
    inputfilepath = 'table_data.csv'
    outputfile = 'yield_diff.txt'
    main(inputfilepath, outputfile)

    ------------------------------------------------
    **IMPORTANT**: Output code and nothing else, no addition formatting needed, just code that could be executed by running `python3` command on it. 
    """

    code_reps = llm.complete(prompt)

    # Models often add ```python fences anyway; remove them so the text is
    # valid python source.
    return _strip_markdown_fences(code_reps.text)

def execute_python_code(code: str) -> tuple[str, str]:
    """
    Execute arbitrary python code in a separate process and return its stdout and stderr.

    The code is written to a script inside a temporary directory (removed
    afterwards) and run without command-line arguments.

    NOTE(review): the code is executed as-is with the privileges of this
    process; only pass code you are willing to run.

    Args:
        code (str): Complete python source of the script to run.

    Returns:
        tuple[str, str]: Captured stdout and stderr of the script.
    """
    import os
    import subprocess
    import sys
    import tempfile

    with tempfile.TemporaryDirectory() as work_dir:
        script_path = os.path.join(work_dir, "generated_script.py")

        # Write and close the script before running it: a still-open
        # NamedTemporaryFile cannot be reopened by another process on Windows.
        with open(script_path, "wt", encoding="utf-8") as f:
            f.write(code)

        # Use the current interpreter so the script sees the same installed
        # packages as this process.
        call_result = subprocess.run(
            [sys.executable or "python3", script_path],
            capture_output=True,
            text=True,
        )

    return call_result.stdout, call_result.stderr

In [None]:
# Demo: answer a question about an Excel file by generating and running code.
table_file = "7bd855d8-463d-4ed5-93ca-5fe35145f733.xls"

# Adjacent string literals are joined with no separator, so every sentence
# carries its own trailing space.
task = (
    "The attached Excel file contains the sales of menu items for a local fast-food chain. "
    "What were the total sales that the chain made from food (not including drinks)? "
    "Express your answer in USD with two decimal places."
)
context = f"The table data is saved inside `{table_file}` that you can access from this folder."
peak_result = peak_into_table(table_file)
# Start the data preview on its own line instead of gluing it to the sentence above.
context += "\nData in the table looks like this:\n"+peak_result

code = write_python_code(task, context)
execute_python_code(code)