In [1]:
# Notebook bootstrap: imports first, then environment and import-path setup.
import sys
from dotenv import load_dotenv

# Load environment variables from the app's .env file.
load_dotenv("../app/.env")
# Make ../app importable (presumably so the project modules imported in
# later cells resolve — confirm against the repo layout).
sys.path.append("../app")

In [3]:
# Display the AI_CHAT_MODEL_PROVIDER setting exposed by the project's `evars`
# module (presumably populated from the loaded .env — TODO confirm).
import evars
evars.AI_CHAT_MODEL_PROVIDER

'OPENAI'

In [9]:
# NOTE(review): `chat_model` is imported here but not used in this cell.
from services.ai.aimodels import chat_model, embedding_model

# Smoke test: embed a short string and print the shape of the returned vector.
print(embedding_model.embed("ping").shape)

(3072,)


In [2]:
from services.websearch.googlesearch.google import google

# Run one search query and display the returned links.
# The second argument (5) is presumably a result limit — TODO confirm in google.search.
search_links = google.search(["Adolf Hitler"], 5)
search_links

['https://de.wikipedia.org/wiki/Adolf_Hitler',
 'https://www.britannica.com/biography/Adolf-Hitler',
 'https://www.dhm.de/lemo/biografie/adolf-hitler']

In [None]:
from services.websearch.webscraper.scraper import webscraper

# Scrape a single page and display the result object; its `content`
# attribute is what the next cell writes to disk.
page = webscraper.scrape("https://de.wikipedia.org/wiki/Adolf_Hitler")
page

[16.05.2025 19:47:59,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): de.wikipedia.org:443
[16.05.2025 19:47:59,f] [DEBUG] connectionpool.py https://de.wikipedia.org:443 "GET /wiki/Adolf_Hitler HTTP/1.1" 200 197821


WebPageResult(url='https://de.wikipedia.org/wiki/Adolf_Hitler', content="Adolf Hitler  Wikipedia\n\n\nZum Inhalt springen\n\n\nSuche\n\n\nSuchen\n\n\nAdolf Hitler\n\n\n241 Sprachen\n\n\nAfrikaansAlemannischAragonsngliscAsturianuAzrbaycancaBasa BaliBoarischemaitkaBikol Central ()BetawiBanjarBrezhonegBosanskiBatak MandailingCatalChavacano de Zamboanga / Mng-dng-ngCebuanoCorsuQrmtatarcaetinaKaszbscziCymraegDanskThujZazakiEmilin e rumagnlEnglishEsperantoEspaolEestiEuskaraEstremeuFulfuldeSuomiVroFroysktFranaisNordfriiskFurlanFryskGaeilgeKriyl gwiyannenGidhligGalegoAvae'  / Gychi KonknniGungbeGaelgHausa / Hak-k-ngHawaiiFiji HindiHrvatskiHornjoserbsceKreyl ayisyenMagyarInterlinguaBahasa IndonesiaInterlingueIgboIlokanoIdoslenskaItaliano / inuktitutPatoisLa .lojban.JawaQaraqalpaqshaTaqbaylitKabyYerwa Kanuri / RipoarischKurdKernowekLatinaLadinoLtzebuergeschLingua Franca NovaLugandaLimburgsLadinLombardLietuviLatgauLatvieuMadhurBasa BanyumasanMalagasy MoriMinangkabauBahasa MelayuMaltiMirandsNhuatl

In [None]:
# Dump the scraped text for manual inspection. The page text contains
# non-ASCII characters, so pin the encoding instead of relying on the
# platform default (which can raise UnicodeEncodeError on some systems).
with open("../dev/hitler-wiki--2", "w", encoding="utf-8") as f:
    f.write(page.content)

## Pool: parallel scrape, embed, and vector search with `multiprocessing.Pool`

In [None]:
from multiprocessing import Pool
from services.websearch.webscraper.scraper import webscraper
from services.websearch.googlesearch.google import google
from services.ai.aimodels import embedding_model
from services.ai import aiutils
from services.imindexsearch.indexdb import InMemoryIndexDBFactory


# New index instance; it is filled via add_batch() and queried via search()
# in the pipeline further down this cell.
index = InMemoryIndexDBFactory.new()


def prepare_page_results(page_results: list) -> tuple[list, list, list]:
    """Split scrape results into usable pages, their text chunks, and failed URLs.

    Args:
        page_results: Scrape results; each item must expose ``url`` and
            ``content`` attributes.

    Returns:
        A 3-tuple ``(prep_results, pages_text_chunks, err_urls)``:

        - ``prep_results``: the pages whose ``content`` is not ``None``,
          in input order.
        - ``pages_text_chunks``: for each kept page (same order), the result
          of ``aiutils.chunk_text_with_overlap(page.content)``.
        - ``err_urls``: the ``url`` of every page whose ``content`` is ``None``.
    """
    prep_results = []
    pages_text_chunks = []
    err_urls = []
    for page in page_results:
        if page.content is None:
            # No content means the page cannot be chunked; remember its URL.
            err_urls.append(page.url)
            continue
        prep_results.append(page)
        pages_text_chunks.append(aiutils.chunk_text_with_overlap(page.content))

    return prep_results, pages_text_chunks, err_urls


# Search queries sent to google.search; the returned links are scraped below.
ggl_queries = [
    "Adolf Hitler",
    "Eva Braun",
]

# Questions run against the index once it has been populated.
vector_queries = [
    "Wie viele Bücher hatte hitler?",
    "Wann starb eva braun",
]

search_links = google.search(ggl_queries, 5)
# Cap at 8 workers, but keep at least one: Pool(processes=0) raises
# ValueError, which would happen whenever the search returns no links.
processes = max(1, min(8, len(search_links)))
with Pool(processes=processes) as pool:
    print("START")
    # Scrape all links in parallel, then drop pages that came back without content.
    pages_results, pages_text_chunks, err_urls = prepare_page_results(
        pool.map(webscraper.scrape, search_links)
    )
    if err_urls:
        # Report skipped pages instead of discarding them silently.
        print("scrape failed:", err_urls)
    print("Embeddings", len(pages_text_chunks))
    # One embed_batch call per page, distributed over the workers.
    pages_embeddings = pool.map(embedding_model.embed_batch, pages_text_chunks)

    # Indexing runs in the parent process, so this `index` object is the one filled.
    for page, chunks, embeddings in zip(pages_results, pages_text_chunks, pages_embeddings):
        print("add_batch", page.url, len(chunks))
        index.add_batch(reference=page.url, texts=chunks, embeddings=embeddings)

    # pool.map pickles the callable for the workers, so this relies on
    # `index` (via its bound `search` method) being picklable.
    vector_results = pool.map(index.search, vector_queries)

print("write results")
for i, results in enumerate(vector_results, 1):
    # The indexed text is not ASCII-only; pin UTF-8 rather than using the
    # platform default encoding.
    with open(f"../dev/answer-{i}.txt", "w", encoding="utf-8") as f:
        text = "\n\n===========================\n\n".join(r.text for r in results)
        f.write(text)

# NOTE(review): reads a private attribute of the index; acceptable for a dev
# notebook, but prefer a public size accessor if one exists.
print("DOC-COUNT:", len(index._documents))

[16.05.2025 21:25:11,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.google.com:443
[16.05.2025 21:25:11,f] [DEBUG] connectionpool.py https://www.google.com:443 "GET /search?q=Adolf+Hitler&num=7&hl=en&start=0&safe=active HTTP/1.1" 200 None
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.google.com:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py https://www.google.com:443 "GET /search?q=Eva+Braun&num=8&hl=en&start=0&safe=active HTTP/1.1" 200 None


START


[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): de.wikipedia.org:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.dhm.de:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): de.wikipedia.org:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.britannica.com:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.nsdoku.de:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.dhm.de:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py Starting new HTTPS connection (1): www.britannica.com:443
[16.05.2025 21:25:12,f] [DEBUG] connectionpool.py https://www.nsdoku.de:443 "GET /lexikon/artikel/braun-eva-105 HTTP/1.1" 200 9423
[16.05.2025 21:25:13,f] [DEBUG] connectionpool.py https://de.wikipedia.org:443 "GET /wiki/Eva_Braun HTTP/1.1" 200 31116
[16.05.2025 21:25:

Embeddings 7


[16.05.2025 21:25:14,f] [DEBUG] _base_client.py Request options: {'method': 'post', 'url': '/embeddings', 'files': None, 'post_parser': <function Embeddings.create.<locals>.parser at 0x10a7647c0>, 'json_data': {'input': ["Eva Braun | Facts, Biography, Picture, & Death | Britannica\n\n\nSearch Britannica\n\n\nClick here to search\n\n\nSearch Britannica\n\n\nClick here to search\n\n\n   SUBSCRIBE\n\n\n   SUBSCRIBE\n\n\nLogin\n\nhttps://premium.britannica.com/premium-membership/?utm_source=premium&utm_medium=nav-login-box&utm_campaign=evergreen\n\n\n  SUBSCRIBE\n\n\nHome\nHistory & Society\nScience & Tech\nBiographies\nAnimals & Nature\nGeography & Travel\nArts & Culture\nProCon\nMoney\n\n\nGames & Quizzes\nVideos\nOn This Day\nOne Good Fact\nDictionary\nNew Articles\n\nHistory & Society\n\nLifestyles & Social Issues\nPhilosophy & Religion\nPolitics, Law & Government\nWorld History\n\nScience & Tech\n\nHealth & Medicine\nScience\nTechnology\n\nBiographies\n\nBrowse Biographies\n\nAnimals 

add_batch https://de.wikipedia.org/wiki/Adolf_Hitler 118
add_batch https://www.britannica.com/biography/Adolf-Hitler 5
add_batch https://www.dhm.de/lemo/biografie/adolf-hitler 5
add_batch https://de.wikipedia.org/wiki/Eva_Braun 7
add_batch https://www.britannica.com/biography/Eva-Braun 3
add_batch https://www.dhm.de/lemo/biografie/eva-braun 1
add_batch https://www.nsdoku.de/lexikon/artikel/braun-eva-105 2


[16.05.2025 21:25:17,f] [DEBUG] _base_client.py Request options: {'method': 'post', 'url': '/embeddings', 'files': None, 'post_parser': <function Embeddings.create.<locals>.parser at 0x1162ac7c0>, 'json_data': {'input': 'Wann starb eva braun', 'model': 'text-embedding-3-large', 'encoding_format': 'base64'}}
[16.05.2025 21:25:17,f] [DEBUG] _base_client.py Sending HTTP Request: POST https://api.openai.com/v1/embeddings
[16.05.2025 21:25:17,f] [DEBUG] _trace.py send_request_headers.started request=<Request [b'POST']>
[16.05.2025 21:25:17,f] [DEBUG] _trace.py send_request_headers.complete
[16.05.2025 21:25:17,f] [DEBUG] _trace.py send_request_body.started request=<Request [b'POST']>
[16.05.2025 21:25:17,f] [DEBUG] _trace.py send_request_body.complete
[16.05.2025 21:25:17,f] [DEBUG] _trace.py receive_response_headers.started request=<Request [b'POST']>
[16.05.2025 21:25:17,f] [DEBUG] _base_client.py Request options: {'method': 'post', 'url': '/embeddings', 'files': None, 'post_parser': <fun

write results
DOC-COUNT: 141


[16.05.2025 21:25:18,f] [DEBUG] _trace.py receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 16 May 2025 19:25:18 GMT'), (b'Content-Type', b'application/json'), (b'Transfer-Encoding', b'chunked'), (b'Connection', b'keep-alive'), (b'access-control-allow-origin', b'*'), (b'access-control-expose-headers', b'X-Request-ID'), (b'openai-model', b'text-embedding-3-large'), (b'openai-organization', b'emtec-e-v'), (b'openai-processing-ms', b'70'), (b'openai-version', b'2020-10-01'), (b'strict-transport-security', b'max-age=31536000; includeSubDomains; preload'), (b'via', b'envoy-router-6d87b587bb-4mzgj'), (b'x-envoy-upstream-service-time', b'72'), (b'x-ratelimit-limit-requests', b'5000'), (b'x-ratelimit-limit-tokens', b'5000000'), (b'x-ratelimit-remaining-requests', b'4999'), (b'x-ratelimit-remaining-tokens', b'4999993'), (b'x-ratelimit-reset-requests', b'12ms'), (b'x-ratelimit-reset-tokens', b'0s'), (b'x-request-id', b'req_a55fbf176b387e94411c31a59a704c06