In [18]:
!pip install pymupdf==1.26.4 requests==2.32.4 beautifulsoup4==4.13.5 google-api-python-client==2.179.0 readability-lxml==0.8.4.1



In [None]:
import os

# Credentials for the Google Custom Search calls made further down; both are
# looked up in the process environment and are None when the variable is unset.
API_KEY = os.environ.get('google_api_secret')
SEARCH_ENGINE_ID = os.environ.get('google_search_id')


In [7]:
'''
Ohne KI Version
'''
import time
import requests
import pandas as pd
import pymupdf
from bs4 import BeautifulSoup
from googleapiclient.discovery import build
from readability import Document


# Google Custom Search credentials, read from the environment (None when unset).
API_KEY = os.getenv('google_api_secret')
SEARCH_ENGINE_ID = os.getenv('google_search_id')

# Load the KWP table. 'Gemeindeschlüssel' is read as text so each key keeps
# exactly the characters stored in the file: numeric parsing would drop leading
# zeros and would turn every key into a float ("123.0") as soon as one value in
# the column is missing.
kwp_df = pd.read_csv('kwp_subset_clean.csv', sep=',', dtype={'Gemeindeschlüssel': str})

# Collect every Gemeinde whose 'Stand in der KWP' is 'Stand unbekannt'.
# Mapping: cleaned name (text before the first ' (') -> Gemeindeschlüssel.
unknown_status_df = kwp_df[kwp_df['Stand in der KWP'] == 'Stand unbekannt']
gemeinde_dict = {}
for raw_name, raw_key in zip(unknown_status_df['Gemeindename'],
                             unknown_status_df['Gemeindeschlüssel']):
    if not isinstance(raw_name, str):
        # Missing name (NaN): there is nothing to search for.
        continue
    gemeinde_dict[raw_name.split(' (')[0]] = str(raw_key)

# Names to search for. The dict keys already have the parenthesised suffix removed.
gemeinde_list = list(gemeinde_dict)


# === CONFIGURATION ===


# --- Search Configuration ---
MAX_RESULTS_PER_QUERY = 5  # Number of Google results to check per query

# === FUNCTIONS ===

def GoogleSearch(api_key, cx, query, num_results):
    """Run a Google Custom Search query and return the result URLs.

    Args:
        api_key: Developer key handed to the ``customsearch`` service.
        cx: Identifier of the search engine to query.
        query: Search string.
        num_results: Desired number of results. The API accepts 1-10 results
            per request, so larger values are capped at 10. Values below 1
            return an empty list without calling the API.

    Returns:
        List with the ``link`` of every returned item. The list is empty when
        the search has no hits or when the API call fails (the error is printed
        instead of raised).
    """
    # Upper bound the Custom Search JSON API accepts for `num`; anything larger
    # makes the whole request fail instead of returning fewer results.
    max_results_per_request = 10
    try:
        if num_results is not None:
            if num_results < 1:
                return []
            num_results = min(num_results, max_results_per_request)

        service = build("customsearch", "v1", developerKey=api_key)
        res = service.cse().list(q=query, cx=cx, num=num_results).execute()

        if 'items' not in res:
            print(f"No results found for query: {query}")
            return []

        return [item['link'] for item in res['items']]
    except Exception as e:
        # Best effort: one failed query must not abort the whole search run.
        print(f"❌ An error occurred with the Google Search API: {e}")
        return []

from bs4 import BeautifulSoup
from bs4.builder import ParserRejectedMarkup

def extract_text_from_html_url(url):
    """Fetch a web page and return its main readable content as plain text.

    The page is downloaded with ``requests``, reduced to its main content with
    readability's ``Document.summary()`` and flattened to a single
    space-joined string with BeautifulSoup.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text, or ``None`` when the download or the parsing fails
        (a message is printed in that case).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Without a declared charset, requests decodes text/* bodies as
        # ISO-8859-1. That garbles UTF-8 umlauts ("Wärmeplanung" turns into
        # "WÃ¤rmeplanung"), so keyword checks on the returned text would miss.
        # Fall back to the encoding detected from the body instead.
        content_type = response.headers.get('Content-Type', '')
        if 'charset' not in content_type.lower():
            detected_encoding = response.apparent_encoding
            if detected_encoding:
                response.encoding = detected_encoding

        doc = Document(response.text)
        html = doc.summary()  # Extracted main content HTML

        soup = BeautifulSoup(html, 'html.parser')

        # Remove script/style tags just in case
        for tag in soup(['script', 'style']):
            tag.decompose()

        # stripped_strings already yields whitespace-trimmed, non-empty strings.
        return ' '.join(soup.stripped_strings)

    except requests.RequestException as e:
        print(f"❌ Failed to fetch: {e}")
        return None

    except (ValueError, ParserRejectedMarkup) as e:
        # ValueError covers readability's parse failures; ParserRejectedMarkup
        # is raised by BeautifulSoup when the parser refuses the markup.
        print(f"❌ Failed to parse: {e}")
        return None


def extract_text_from_pdf_url(url, tmp_file="temp_download.pdf"):
    """Download a PDF and return the text of its first pages.

    The response body is written to ``tmp_file``, opened with PyMuPDF, and the
    text of at most the first 10 pages is joined with newlines. The document
    handle is closed and ``tmp_file`` is removed before returning, on success
    as well as on failure.

    Args:
        url: Address of the PDF to download.
        tmp_file: Path used for the temporary copy of the download.

    Returns:
        The extracted text, or ``None`` when downloading or reading the PDF
        fails (a message is printed in that case).
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()

        with open(tmp_file, 'wb') as f:
            f.write(response.content)

        doc = pymupdf.open(tmp_file)
        try:
            pages = min(len(doc), 10)  # Limit to first 10 pages
            return "\n".join(doc[i].get_text() for i in range(pages))
        finally:
            # Release the file handle before the temp file is deleted; an open
            # document would otherwise leak and can block the removal.
            doc.close()
    except Exception as e:
        print(f"Failed to process PDF from {url}: {e}")
        return None
    finally:
        # Single cleanup point for both the success and the failure path.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)

# === MAIN LOGIC ===
# For each Gemeinde: run one Google query per search term, extract the text of
# every hit, and record the URL when the text contains both the Gemeinde name
# and "wärmeplanung" (case-insensitive substring checks).
results = []
processed_urls = set()  # every URL whose text has been fetched in this run
# Extracted text per URL, shared by all Gemeinden. A URL that is returned for
# several Gemeinden is downloaded once but still checked against each of them
# (a run-wide "skip if seen" would only test it for the first Gemeinde).
text_by_url = {}

# NOTE: only the first 10 Gemeinden are processed.
for gemeinde in gemeinde_list[:10]:
    print(f"\n{'='*20}\n🔎 Searching for: {gemeinde}\n{'='*20}")
    gemeinde_lower = gemeinde.lower()
    checked_urls = set()  # URLs already evaluated for this Gemeinde

    for term in ["Wärmeplan", "Wärmeplanung"]:
        query = f'"{gemeinde}" AND {term}'

        print(f"Executing Google search for query: '{query}'")
        urls = GoogleSearch(API_KEY, SEARCH_ENGINE_ID, query, num_results=MAX_RESULTS_PER_QUERY)
        time.sleep(1)  # pause between consecutive API requests

        for url in urls:
            # Both search terms often return the same page; evaluate it once.
            if url in checked_urls:
                continue
            checked_urls.add(url)

            print(f"\nProcessing URL: {url}")
            if url not in text_by_url:
                # Ignore fragment and query string when looking at the file
                # extension, so "file.pdf?download=1" is still treated as a PDF.
                url_path = url.lower().split('#', 1)[0].split('?', 1)[0]
                if url_path.endswith('.pdf'):
                    text_by_url[url] = extract_text_from_pdf_url(url)
                else:
                    text_by_url[url] = extract_text_from_html_url(url)
                processed_urls.add(url)
            text = text_by_url[url]

            if not (text and text.strip()):
                # Extraction failed or produced nothing; the extractor already
                # printed the reason.
                continue

            text_lower = text.lower()
            # Match when the Gemeinde name AND "wärmeplanung" appear in the text.
            if gemeinde_lower in text_lower and "wärmeplanung" in text_lower:
                print(f"✅ Match found for {gemeinde} in {url}")
                results.append({
                    "Gemeinde": gemeinde,
                    'Gemeindeschlüssel': gemeinde_dict[gemeinde],
                    "Source_url": url
                })
            else:
                print("No relevant text, skipping.")

# --- Save results to Excel ---
if results:
    results_df = pd.DataFrame(results)
    output_filename = "Voll_GoogleSearch_Waermeplan_Treffer_Github.xlsx"
    results_df.to_excel(output_filename, index=False)
    print(f"\n\n🎉 Process complete! Results saved to '{output_filename}'.")
else:
    print("\n\nProcess complete, but no matches were found; no file was written.")




  kwp_df = pd.read_csv('/content/Laufende KWPs(ListeKWPs) (1).csv', sep=';')



🔎 Searching for: Albersdorf
Executing Google search for query: '"Albersdorf" AND Wärmeplan'

Processing URL: https://www.dithmarschen.de/fileadmin/download/themen/klimaschutz/20250326_krdithmarschen_waermekataster_plus_info-veranstaltung.pdf
No relevant text, skipping.

Processing URL: https://www.shgt.de/fileadmin/download/die_gemeinde/zeitschrift_2022/Die_Gemeinde_06_2022.pdf
No relevant text, skipping.

Processing URL: https://www.dithmarschen.de/fileadmin/download/themen/klimaschutz/20250611_kreisdithmarschen-waermewende.pdf
No relevant text, skipping.

Processing URL: https://www.co2online.de/modernisieren-und-bauen/heizung/fernwaerme/kommunale-waermeplanung/
No relevant text, skipping.

Processing URL: https://kahla.de/cs/Aktuelles.php
✅ Match found for Albersdorf in https://kahla.de/cs/Aktuelles.php
Executing Google search for query: '"Albersdorf" AND Wärmeplanung'

Processing URL: https://www.albersdorf.de/fileadmin/Dateien/Gemeinde_Albersdorf/Buergerservice_Politik/Ortsrecht/

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>