In [1]:
!pip install pydantic -q

In [1]:
import os
from typing import List, Optional, Dict, Any
from urllib.parse import urlencode

import requests
from pydantic import BaseModel

# Equivalent to `getSearxngApiEndpoint()` in TypeScript
def get_searxng_api_endpoint() -> str:
    """Return the base URL of the SearXNG instance, without a trailing slash.

    The address is read from the ``SEARXNG_API_URL`` environment variable so
    the notebook can target a Docker container or remote host without editing
    code. When the variable is unset or empty, the local default
    ``http://localhost:8080`` is used.

    Returns:
        The base URL, e.g. ``"http://localhost:8080"``.
    """
    endpoint = os.environ.get("SEARXNG_API_URL") or "http://localhost:8080"
    # Callers append "/search"; drop a trailing slash to avoid "//search".
    return endpoint.rstrip("/")

class SearxngSearchOptions(BaseModel):
    """Optional search parameters that ``search_searxng`` forwards as query parameters.

    Fields left as ``None`` are omitted from the request. List values are
    joined with commas and all other values are converted with ``str()``
    before being sent.
    """
    categories: Optional[List[str]] = None  # sent as a comma-separated `categories` parameter
    engines: Optional[List[str]] = None  # sent as a comma-separated `engines` parameter
    language: Optional[str] = None  # sent as the `language` parameter
    pageno: Optional[int] = None  # sent as the `pageno` parameter; presumably a 1-based page index — confirm against SearXNG docs

class SearxngSearchResult(BaseModel):
    """One entry of the ``results`` list in a SearXNG JSON response.

    ``title`` and ``url`` are required; validation fails if either is missing.
    Every other field defaults to ``None`` when absent from the payload.
    """
    title: str
    url: str
    # The remaining fields are optional; their exact meaning depends on the
    # engine that produced the result (names suggest media/preview URLs,
    # a text snippet and an author — verify against actual responses).
    img_src: Optional[str] = None
    thumbnail_src: Optional[str] = None
    thumbnail: Optional[str] = None
    content: Optional[str] = None
    author: Optional[str] = None
    iframe_src: Optional[str] = None

class SearxngSearchResponse(BaseModel):
    """Parsed subset of a SearXNG JSON response, as returned by ``search_searxng``."""
    results: List[SearxngSearchResult]  # validated result entries
    suggestions: List[str]  # query suggestions; the caller passes [] when the key is absent

def search_searxng(
    query: str,
    opts: Optional[SearxngSearchOptions] = None,
    timeout: float = 30.0,
) -> SearxngSearchResponse:
    """Run a query against the SearXNG JSON API and return the parsed response.

    Args:
        query: The search string, sent as the ``q`` parameter.
        opts: Optional extra parameters. Fields set to ``None`` are skipped,
            lists are comma-joined and other values are stringified.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        A ``SearxngSearchResponse``; ``results`` and ``suggestions`` fall back
        to empty lists when the server omits them.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
    """
    search_url = f"{get_searxng_api_endpoint().rstrip('/')}/search"
    params: Dict[str, str] = {'q': query, 'format': 'json'}

    if opts is not None:
        for key, value in opts.model_dump(exclude_none=True).items():
            # SearXNG expects multi-valued options as one comma-separated string.
            params[key] = ','.join(value) if isinstance(value, list) else str(value)

    response = requests.get(search_url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    return SearxngSearchResponse(
        results=data.get('results', []),
        suggestions=data.get('suggestions', []),
    )


In [2]:
# NOTE(review): SearXNG engine identifiers are lowercase (e.g. "duckduckgo");
# a mixed-case name is likely not recognised and the engine silently dropped —
# confirm against the instance's settings.yml.
opts = SearxngSearchOptions(
    engines=["google", "bing", "duckduckgo"],
    language="en",
    pageno=1
)
result = search_searxng("What is the definition of the best pizza?", opts)

In [3]:
# URL of the first result returned by the search above.
result.results[0].url

'https://www.reddit.com/r/Pizza/comments/1dfj86w/what_makes_a_perfect_pizza_this_gets_close_for_me/'

In [24]:
# Number of results returned for the requested page.
len(result.results)

10

In [26]:
import requests

def scrape_jina_ai(url: str, timeout: float = 60.0) -> str:
    """Fetch the text rendering of a web page through the Jina reader proxy.

    The target address is appended to ``https://r.jina.ai/`` and the response
    body is returned as-is.

    Args:
        url: Absolute URL of the page to scrape.
        timeout: Seconds to wait for the proxy before giving up.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: The proxy answered with a 4xx/5xx status, so an
            error page is never mistaken for page content.
        requests.RequestException: Connection failure or timeout.
    """
    response = requests.get("https://r.jina.ai/" + url, timeout=timeout)
    # Same error-handling policy as search_searxng: fail loudly on HTTP errors.
    response.raise_for_status()
    return response.text

In [27]:
class Document(BaseModel):
    """A scraped page: its text plus free-form metadata describing where it came from."""
    page_content:str  # text returned by the scraper for this page
    metadata:Dict[str, Any]  # arbitrary key/value info; this notebook stores "source" and "title"

documents: List[Document] = []

# Scrape every search hit. A single unreachable or failing page must not abort
# the whole batch, so request errors are reported and that hit is skipped.
for r in result.results:
    try:
        text = scrape_jina_ai(url=r.url)
    except requests.RequestException as exc:
        print(f"Skipping {r.url}: {exc}")
        continue
    documents.append(Document(page_content=text, metadata={"source": r.url, "title": r.title}))

In [28]:
# Number of pages collected by the scraping loop.
len(documents)

10

In [31]:
# Show the source URL and the scraped text of one collected document.
idx = 1
print(
    documents[idx].metadata['source'],
    documents[idx].page_content,
    sep='\n',
)

https://www.pizzauniversity.org/how-to-identify-good-pizza-in-8-easy-steps
Title: How to Identify Good Pizza in 8 Easy Steps – Pizza University & Culinary Arts Center

URL Source: https://www.pizzauniversity.org/how-to-identify-good-pizza-in-8-easy-steps

Markdown Content:
How to Identify Good Pizza in 8 Easy Steps – Pizza University & Culinary Arts Center

[![Image 5](https://www.pizzauniversity.org/wp/wp-content/uploads/2023/09/Pizza-University-logo-horizontal-2.png)](https://www.pizzauniversity.org/)

*   [Intensive Courses](https://www.pizzauniversity.org/pizzauu-courses/)
*   [Online Courses](https://www.pizzauniversity.org/online-courses/)
*   [Gift Cards](https://www.pizzauniversity.org/gift-cards/)
*   [Recreational Classes](https://www.pizzauniversity.org/recreational-classes/)
    *   [Pizza-Making Workshops](https://www.pizzauniversity.org/pizza-making-workshops/)
    *   [Pizza Wars](https://www.pizzauniversity.org/pizza-wars/)
*   [Gallery](https://www.pizzauniversity.org/