In [1]:
from dotenv import load_dotenv
load_dotenv("../app/.env")
import sys
sys.path.append("../app")

In [2]:
import evars
evars.AI_CHAT_MODEL_PROVIDER

'OPENAI'

In [3]:
from services.ai.aimodels import chat_model, embedding_model

print(embedding_model.embed("ping").shape)

(3072,)


In [6]:
from services.googlesearch.google import google

search_links = google.search(["Adolf Hitler"], 5)
search_links

ModuleNotFoundError: No module named 'services.websearch.googlesearch'

In [3]:
from services.websearch.webscraper.scraper import webscraper

page = webscraper.scrape("https://de.wikipedia.org/wiki/Adolf_Hitler")
page

ModuleNotFoundError: No module named 'services.websearch.webscraper'

In [None]:
with open("../dev/hitler-wiki--2", "w") as f:
    f.write(page.content)

## Pool

In [4]:
import multiprocessing

num_cpus = multiprocessing.cpu_count()
num_cpus

8

In [10]:
import multiprocessing

from concurrent.futures import ThreadPoolExecutor
from services.webscraper.scraper import webscraper
from services.googlesearch.google import google
from services.ai.aimodels import embedding_model
from services.ai import aiutils
from services.imindexsearch.indexdb import InMemoryIndexDBFactory


index = InMemoryIndexDBFactory.new()


def prepare_page_results(page_results: list) -> tuple[list, list]:
    prep_results = []
    pages_text_chunks = []
    err_urls = []
    for page in page_results:
        if page.content is not None:
            prep_results.append(page)
            page_chunks = aiutils.chunk_text_with_overlap(page.content)
            pages_text_chunks.append(page_chunks)
        else:
            err_urls.append(page.url)

    return prep_results, pages_text_chunks, err_urls


ggl_queries = [
    "Adolf Hitler"
    ,
    "Eva Braun"
]

vector_queries = [
    "Wie viele Bücher hatte hitler?",
    "Wann starb eva braun"
]
vector_queries_args = [(query, 3, True) for query in vector_queries]
def vector_search_wrapper(args: tuple[str, int, bool]):
    query, k, as_dict = args
    return index.search(query, k=k, as_dict=as_dict)

search_links = google.search(ggl_queries, 5)
processes = multiprocessing.cpu_count()
with ThreadPoolExecutor(max_workers=processes) as pool:
    print("START")

    scraped_pages = list(pool.map(webscraper.scrape, search_links))
    pages_results, pages_text_chunks, err_urls = prepare_page_results(scraped_pages)

    print("Embeddings", len(pages_text_chunks))
    pages_embeddings = list(pool.map(embedding_model.embed_batch, pages_text_chunks))

    for page, chunks, embeddings in zip(pages_results, pages_text_chunks, pages_embeddings):
        print("add_batch", page.url, len(chunks))
        index.add_batch(reference=page.url, texts=chunks, embeddings=embeddings)

    vector_results = list(pool.map(vector_search_wrapper, vector_queries_args))


# print("write results")
# for i, results in enumerate(vector_results, 1):
#     with open(f"../dev/answer-{i}.txt", "w") as f:
#         text = "\n\n===========================\n\n".join([r["text"] for r in results])
#         f.write(text)

print("DOC-COUNT:", len(index._documents))

[19.05.2025 08:33:23,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.google.com:443
[19.05.2025 08:33:23,f] [DEBUG] connectionpool.py https://www.google.com:443 "GET /search?q=Adolf+Hitler&num=7&hl=en&start=0&safe=active HTTP/1.1" 200 None
[19.05.2025 08:33:23,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.google.com:443
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.google.com:443 "GET /search?q=Eva+Braun&num=8&hl=en&start=0&safe=active HTTP/1.1" 200 None
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): en.wikipedia.org:443
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.dhm.de:443
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.britannica.com:443
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): de.wikipedia.org:443
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py Starting new 

START


[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.dhm.de:443 "GET /lemo/biografie/eva-braun HTTP/1.1" 200 6905
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.nsdoku.de:443 "GET /lexikon/artikel/braun-eva-105 HTTP/1.1" 200 9423
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.britannica.com:443 "GET /biography/Eva-Braun HTTP/1.1" 200 None
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.dhm.de:443 "GET /lemo/biografie/adolf-hitler HTTP/1.1" 200 12505
[19.05.2025 08:33:24,f] [DEBUG] connectionpool.py https://www.britannica.com:443 "GET /biography/Adolf-Hitler HTTP/1.1" 200 None
[19.05.2025 08:33:24,f] [DEBUG] _base_client.py Request options: {'method': 'post', 'url': '/embeddings', 'files': None, 'post_parser': <function Embeddings.create.<locals>.parser at 0x156872a20>, 'json_data': {'input': ['Adolf Hitler | History, Biography, Actions, & Facts | Britannica\n\n\nSearch Britannica\n\n\nClick here to search\n\n\nSearch Britannica\n\n\n

Embeddings 7


[19.05.2025 08:33:25,f] [DEBUG] _trace.py receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Mon, 19 May 2025 06:33:25 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-model', b'text-embedding-3-large'), (b'openai-organization', b'emtec-e-v'), (b'openai-processing-ms', b'105'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=31536000; includeSubDomains; preload'), (b'via', b'envoy-router-7d5ccf99bc-9gsvd'), (b'x-envoy-upstream-service-time', b'124'), (b'x-ratelimit-limit-requests', b'5000'), (b'x-ratelimit-limit-tokens', b'5000000'), (b'x-ratelimit-remaining-requests', b'4999'), (b'x-ratelimit-remaining-tokens', b'4999025'), (b'x-ratelimit-reset-requests', b'12ms'), (b'x-ratelimit-reset-tokens', b'11ms'), (b'x-request-id', b'req_c9c403cb8b5332b62442aa4e88d8

add_batch https://en.wikipedia.org/wiki/Adolf_Hitler 63
add_batch https://www.britannica.com/biography/Adolf-Hitler 5
add_batch https://www.dhm.de/lemo/biografie/adolf-hitler 5
add_batch https://www.britannica.com/biography/Eva-Braun 3
add_batch https://de.wikipedia.org/wiki/Eva_Braun 7
add_batch https://www.dhm.de/lemo/biografie/eva-braun 1
add_batch https://www.nsdoku.de/lexikon/artikel/braun-eva-105 2


[19.05.2025 08:33:28,f] [DEBUG] _trace.py receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Mon, 19 May 2025 06:33:28 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-model', b'text-embedding-3-large'), (b'openai-organization', b'emtec-e-v'), (b'openai-processing-ms', b'95'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=31536000; includeSubDomains; preload'), (b'via', b'envoy-router-54f9f45cdb-x2ps4'), (b'x-envoy-upstream-service-time', b'98'), (b'x-ratelimit-limit-requests', b'5000'), (b'x-ratelimit-limit-tokens', b'5000000'), (b'x-ratelimit-remaining-requests', b'4999'), (b'x-ratelimit-remaining-tokens', b'4999993'), (b'x-ratelimit-reset-requests', b'12ms'), (b'x-ratelimit-reset-tokens', b'0s'), (b'x-request-id', b'req_1a28538b5576d7f208cae525ee03cd2f

DOC-COUNT: 86
