In [15]:
%%capture
%pip install pymongo selenium dateparser

In [16]:
#%%cmd
#playwright install

In [1]:
from IPython.core.magic import register_cell_magic
from IPython.display import display, Markdown

@register_cell_magic
def deactivate(line, cell):
    """
    Cell magic that skips execution of the cell it is applied to.

    Neither ``line`` nor ``cell`` is evaluated; a Markdown notice is
    displayed in place of the cell's output.
    """
    display(Markdown("⚠️ **Diese Zelle ist deaktiviert und wurde nicht ausgeführt.**"))


In [2]:
# config.py

import socket

# chromedriver location: the host named "raspberrypi" uses the system binary,
# every other host uses the relative path "chromedriver.exe".
CHROMEDRIVER_PATH = "chromedriver.exe"
if socket.gethostname() == "raspberrypi":
    CHROMEDRIVER_PATH = "/usr/bin/chromedriver"

# Base URL of the site; a date path is appended to it when the browser is set up.
FRONTPAGE_URL = "https://www.derstandard.at/"

# User-Agent string handed to Chrome via --user-agent.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"

In [3]:
# setup_mongodb.py

from pymongo import MongoClient
import os

# Credentials are read from the environment so they never appear in the notebook.
username = os.getenv("MONGODB_USER")
password = os.getenv("MONGODB_PWD")

# Pass the credentials as keyword arguments instead of formatting them into
# the URI. Inside a URI, reserved characters such as '@', ':' or '/' would
# have to be percent-escaped, and an unset variable would be sent as the
# literal text "None".
client = MongoClient(
    host="BlackWidow",
    port=27017,
    username=username,
    password=password,
)

# Database and collection handles used by the scraper cells.
db = client['newspapers']
derStandard_collection = db['derStandard']

In [4]:
# logger_setup.py

import logging

def setup_logger(name=__name__, log_file='scraper.log', level=logging.DEBUG):
    """
    Return a logger that writes to the console (INFO and above) and to a
    log file (DEBUG and above).

    The function is safe to call more than once for the same logger, for
    example when a notebook cell is re-run: a handler is only attached if an
    equivalent one is not attached yet, so messages are never duplicated.

    Args:
        name: Name of the logger to configure.
        log_file: Path of the log file.
        level: Level set on the logger itself.

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: only needed to normalise the log file path

    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Shared format for both handlers
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Console handler. FileHandler is a subclass of StreamHandler, so the
    # exact type is compared to recognise a plain console handler.
    has_console = any(type(handler) is logging.StreamHandler for handler in logger.handlers)
    if not has_console:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # File handler. FileHandler stores the absolute path in baseFilename.
    target_path = os.path.abspath(log_file)
    has_file = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target_path
        for handler in logger.handlers
    )
    if not has_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


In [5]:
# utils.py

from selenium.webdriver.remote.webelement import WebElement

def expand_shadow_element(driver, element: WebElement):
    """Return the ``shadowRoot`` of ``element`` as reported by the browser."""
    return driver.execute_script('return arguments[0].shadowRoot', element)


In [6]:
# driver.py

import time
from selenium import webdriver as wd
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import datetime

def configure_driver(headless=True):
    """
    Start a Chrome WebDriver and click away the popup shown on the front page.

    Args:
        headless: When true, Chrome is started with ``--headless``.

    Returns:
        The WebDriver, after the button inside the popup iframe has been
        clicked and the focus switched back to the parent frame.

    Raises:
        Exception: "Popup nicht gefunden" when the iframe or its button
            cannot be located.
        Any other error raised during this set-up (for example a wait
        timeout) is re-raised unchanged. In every failure case the browser
        is closed first so no Chrome process is left behind.
    """
    chrome_options = wd.ChromeOptions()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument(f"--user-agent={USER_AGENT}")
    chrome_options.add_argument("--blink-settings=imagesEnabled=false")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")

    # Images are switched off through the content-settings preferences as well.
    chrome_prefs = {
        "profile.default_content_settings.images": 2,
        "profile.managed_default_content_settings.images": 2
    }
    chrome_options.experimental_options["prefs"] = chrome_prefs
    # driver.get() does not wait for the full page load; waiting is done
    # explicitly below.
    chrome_options.page_load_strategy = 'none'

    service = ChromeService(executable_path=CHROMEDRIVER_PATH)
    driver = wd.Chrome(service=service, options=chrome_options)

    # Click away the popup. Everything after the browser has been started is
    # guarded so that a failure does not leak the browser process.
    try:
        driver.get(FRONTPAGE_URL + datetime.date.today().strftime("%Y/%m/%d"))
        time.sleep(5)
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == 'complete'
        )
        driver.switch_to.frame(driver.find_element(By.XPATH, "/html/body/div/iframe"))
        driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/div[3]/div[1]/button").click()
        driver.switch_to.parent_frame()
    except NoSuchElementException as exc:
        driver.quit()
        raise Exception("Popup nicht gefunden") from exc
    except BaseException:
        # Includes KeyboardInterrupt raised while a notebook cell is interrupted.
        driver.quit()
        raise

    return driver


In [7]:
# parsers.py

import re
import datetime
import dateparser
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import inspect

def parse_posting(posting_element, logger):
    """
    Parse a single <dst-posting> element and extract its data.

    Args:
        posting_element: Selenium element of the posting; it is queried with
            ``find_element``/``find_elements``/``get_attribute``.
        logger: Logger used for warnings and errors. Messages are prefixed
            with the name of the *calling* function (taken from the caller's
            stack frame), not with ``parse_posting``.

    Returns:
        A dict with the keys ``commentID``, ``author``, ``user_followers``,
        ``datetime``, ``content``, ``upvotes``, ``downvotes``,
        ``reply_on_comment`` and an empty ``replies`` list, or ``None`` when
        an unexpected exception occurs (the error is logged).
    """
    try:
        # Author: falls back to the placeholder when the user menu is missing
        author = "Unbekannter Benutzer"
        try:
            usermenu = posting_element.find_element(By.CSS_SELECTOR, "dst-posting--user button")
            spans = usermenu.find_elements(By.CSS_SELECTOR, "span > span")
            if spans:
                author = spans[0].text.strip()
        except NoSuchElementException:
            pass

        # Follower count: first run of digits in the element's title attribute
        user_followers = 0
        try:
            followers_div = posting_element.find_element(By.CSS_SELECTOR, "dst-posting--user button div[title]")
            followers_text = followers_div.get_attribute("title")
            followers_match = re.search(r'\d+', followers_text)
            if followers_match:
                user_followers = int(followers_match.group())
        except NoSuchElementException:
            pass
        except ValueError:
            logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Follower-Zahl: '{followers_text}'.")

        # Date and time: parsed from the data-date attribute of the <time> tag
        datetime_obj = None
        try:
            time_tag = posting_element.find_element(By.CSS_SELECTOR, "time[data-date]")
            datetime_str = time_tag.get_attribute("data-date")
            datetime_obj = dateparser.parse(datetime_str, languages=['de'])
        except NoSuchElementException:
            pass

        # Posting content: all <h1> texts followed by all <p> texts
        content = ""
        try:
            content_div = posting_element.find_element(By.CSS_SELECTOR, "div.posting--content")
            headers = content_div.find_elements(By.TAG_NAME, "h1")
            paragraphs = content_div.find_elements(By.TAG_NAME, "p")
            header_text = "\n".join([h.text for h in headers]) if headers else ""
            paragraph_text = "\n".join([p.text for p in paragraphs]) if paragraphs else ""
            content = "\n".join([header_text, paragraph_text]).strip()
        except NoSuchElementException:
            pass

        # Upvotes and downvotes: read from attributes of the rating log element;
        # anything that is not purely digits counts as 0
        upvotes = 0
        downvotes = 0
        try:
            ratinglog = posting_element.find_element(By.CSS_SELECTOR, "dst-posting--ratinglog")
            positiveratings = ratinglog.get_attribute("positiveratings")
            negativeratings = ratinglog.get_attribute("negativeratings")
            upvotes = int(positiveratings) if positiveratings and positiveratings.isdigit() else 0
            downvotes = int(negativeratings) if negativeratings and negativeratings.isdigit() else 0
        except NoSuchElementException:
            pass
        except ValueError:
            logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Upvote/Downvote-Zahlen gefunden.")

        # Parent comment ID (set when the posting is a reply), otherwise None
        parent_id = posting_element.get_attribute("data-parentpostingid")
        reply_on_comment = int(parent_id) if parent_id and parent_id.isdigit() else None

        # Comment ID, or None when the attribute is missing or not numeric
        commentID = posting_element.get_attribute("data-postingid")
        commentID = int(commentID) if commentID and commentID.isdigit() else None

        # Build the comment dictionary; 'replies' starts empty here
        comment = {
            'commentID': commentID,
            'author': author,
            'user_followers': user_followers,
            'datetime': datetime_obj,
            'content': content,
            'upvotes': upvotes,
            'downvotes': downvotes,
            'reply_on_comment': reply_on_comment,
            'replies': []
        }

        return comment

    except Exception as e:
        # Any error not handled above drops the whole posting
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Parsen eines Postings: {e}", exc_info=True)
        return None

def parse_comment_datetime(datetime_str):
    """Parse a comment timestamp string with dateparser, restricted to German."""
    german_only = ['de']
    return dateparser.parse(datetime_str, languages=german_only)

def get_article_byline(soup, logger):
    """
    Read the article byline from the parsed page.

    Returns a dict that may contain ``storylabels`` and ``article_origins``
    (the latter taken from ``div.article-origins`` or, failing that, from
    ``span.simple``). Returns ``None`` and logs a debug message when the page
    has no ``div.article-byline``.
    """
    byline_tag = soup.find('div', class_='article-byline')
    if not byline_tag:
        logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Keine Artikel-Byline gefunden.")
        return None

    byline = {}

    # Story labels
    labels_tag = byline_tag.find('div', class_='storylabels')
    if labels_tag:
        byline['storylabels'] = labels_tag.get_text(strip=True)

    # Article origins, with the plain author text as fallback
    origins_tag = byline_tag.find('div', class_='article-origins')
    if origins_tag:
        byline['article_origins'] = origins_tag.get_text(strip=True)
        return byline

    simple_author_tag = byline_tag.find('span', class_='simple')
    if simple_author_tag:
        byline['article_origins'] = simple_author_tag.get_text(strip=True)
    return byline

def get_article_datetime(soup, logger):
    """
    Return the publication datetime from ``time.article-pubdate``.

    The ``datetime`` attribute (or, if it is absent, the tag text) is parsed
    as ISO format first; when that fails the tag text is handed to dateparser
    with German as the language. Returns ``None`` and logs a debug message
    when the tag is missing.
    """
    time_tag = soup.find('time', class_='article-pubdate')
    if not time_tag:
        logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Kein Datum gefunden.")
        return None

    if time_tag.has_attr('datetime'):
        # The attribute value may carry surrounding whitespace and line breaks
        candidate = time_tag['datetime'].strip().replace('\n', '').strip()
    else:
        candidate = time_tag.get_text(strip=True)

    try:
        return datetime.datetime.fromisoformat(candidate)
    except ValueError:
        return dateparser.parse(time_tag.get_text(strip=True), languages=['de'])

def get_posting_count(soup, full_url, logger):
    """
    Return the number of forum postings of an article, or ``None``.

    Two sources are tried in order:

    1. the first child of ``span.js-forum-postingcount``;
    2. the heading inside ``section#story-community``, matched against
       ``Forum: <n> Postings``.

    Any failure of the first source (missing span, empty span, nested markup
    instead of text, non-numeric text) falls through to the second one. A
    warning is logged only when the second source fails as well.

    Args:
        soup: Parsed page, queried with ``find``.
        full_url: Article URL, used in the warning message only.
        logger: Logger for the warning.
    """
    # Source 1: dedicated counter span
    try:
        posting_count_tag = soup.find('span', class_='js-forum-postingcount')
        if posting_count_tag:
            return int(posting_count_tag.contents[0].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # IndexError: span without children; TypeError: first child is not
        # plain text. Both are treated like a missing counter.
        pass

    # Source 2: forum heading; a missing element or a non-matching heading
    # surfaces as AttributeError on the next access
    try:
        community_section = soup.find('section', id='story-community')
        header_div = community_section.find('div', class_='story-community-header')
        h1_tag = header_div.find('h1')
        h1_text = h1_tag.get_text(strip=True)
        match = re.search(r'Forum:\s*(\d+)\s*Postings', h1_text)
        return int(match.group(1))
    except (AttributeError, ValueError):
        logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Posting-Anzahl in {full_url}")
    return None

def get_paragraph_texts(soup, full_url, logger):
    """
    Extract the article text as a list of paragraph strings.

    Before the ``<p>`` texts are collected, ad containers, figures and
    elements marked ``data-section-type="supplemental"`` are removed from
    ``div.article-body`` and every link's ``href`` is blanked. These changes
    are made on ``soup`` itself.

    Args:
        soup: Parsed page; modified in place as described above.
        full_url: Article URL, used in log messages only.
        logger: Logger for debug and error messages.

    Returns:
        List with the text of each ``<p>`` in the article body, or ``None``
        when there is no article body or an error occurred (which is logged).
    """
    paragraph_texts = None
    try:
        article_body = soup.find('div', class_='article-body')
        if article_body:
            # Remove unwanted elements
            for ad in article_body.find_all(['ad-container', 'ad-slot', 'ad', 'native-ad']):
                ad.decompose()
            for ad in article_body.find_all("div", class_="native-ad"):
                ad.decompose()
            for figure in article_body.find_all('figure'):
                figure.decompose()
            for unwanted in article_body.find_all(['aside', 'nav', 'div'], attrs={'data-section-type': 'supplemental'}):
                unwanted.decompose()
            # Each step is logged only after it has actually run
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Unerwünschte Elemente aus Artikeltext entfernt.")

            # Blank all 'href' attributes
            for a_tag in article_body.find_all('a'):
                a_tag['href'] = ''
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Alle 'href'-Attribute aus Artikeltext entfernt.")

            # Collect the paragraph texts
            paragraphs = article_body.find_all('p')
            paragraph_texts = [p.get_text() for p in paragraphs]
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Extrahierte Paragraphen: {len(paragraph_texts)} in {full_url}")
        else:
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Kein Artikelinhalt gefunden in {full_url}")
    except Exception as e:
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Extrahieren des Artikelinhalts: {e}", exc_info=True)
    return paragraph_texts


In [8]:
# scraper.py

import logging
import time
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import datetime
import multiprocessing

# Publisher identifier. NOTE(review): not referenced by any code visible in
# this part of the notebook - confirm whether it is still needed.
PUBLISHER = "derStandard"

def extract_reactions(driver, logger):
    """
    Read the reaction counters from the shadow DOM of the current page.

    Returns a tuple ``(reactions, warning_flag)``. On success ``reactions``
    maps each reaction name to its count and the flag is ``False``. On any
    failure the result is ``(None, True)`` and the problem is logged.
    """
    try:
        host = driver.find_element(By.CSS_SELECTOR, "dst-community-reactions")
        root = expand_shadow_element(driver, host)
        counts_by_name = {}
        for btn in root.find_elements(By.CSS_SELECTOR, "aside.reactions div.reactions--buttons button"):
            # Count: text of the <strong> child, 0 when missing or not a number
            try:
                amount = int(btn.find_element(By.TAG_NAME, "strong").text.strip())
            except (NoSuchElementException, ValueError):
                amount = 0
            # Name: screen-reader label, otherwise the button text minus the count
            try:
                label = btn.find_element(By.CSS_SELECTOR, "span.sr-only").text.strip()
            except NoSuchElementException:
                label = btn.text.replace(str(amount), '').strip()
            counts_by_name[label] = amount
    except NoSuchElementException:
        logger.warning("Reaktionen konnten nicht extrahiert werden.")
        return None, True  # raise the warning flag
    except Exception as e:
        logger.error(f"Fehler beim Extrahieren der Reaktionen: {e}", exc_info=True)
        return None, True  # raise the warning flag
    return counts_by_name, False  # no warning needed
    
    

def extract_forum_comments_normal(driver, logger, max_comments=70):
    """
    Extract user comments from the shadow DOM of the current page and attach
    replies to the top-level comment that precedes their thread section.

    At most ``max_comments`` postings (top-level comments and replies
    together) are parsed.

    Returns a tuple ``(comments_list, warning_flag)``; on failure the result
    is ``([], True)`` and the problem is logged.
    """
    top_level = []
    parsed_total = 0
    try:
        forum_root = expand_shadow_element(driver, driver.find_element(By.CSS_SELECTOR, "dst-forum"))
        forum_main = forum_root.find_element(By.CSS_SELECTOR, "main.forum--main")
        latest_top_level = None

        for node in forum_main.find_elements(By.CSS_SELECTOR, ":scope > *"):
            if parsed_total >= max_comments:
                break
            node_tag = node.tag_name.lower()

            # A direct <dst-posting> child is a top-level comment
            if node_tag == "dst-posting":
                parsed = parse_posting(node, logger)
                if parsed:
                    top_level.append(parsed)
                    latest_top_level = parsed
                    parsed_total += 1
                continue

            # Only <section> elements whose class contains "thread" hold replies
            if node_tag != "section":
                continue
            css_classes = node.get_attribute("class")
            if not css_classes or "thread" not in css_classes:
                continue
            if not latest_top_level:
                logger.warning("Thread-Sektion gefunden, aber kein aktueller Parent.")
                continue

            for reply_node in node.find_elements(By.CSS_SELECTOR, "dst-posting"):
                if parsed_total >= max_comments:
                    break
                parsed_reply = parse_posting(reply_node, logger)
                if parsed_reply:
                    latest_top_level['replies'].append(parsed_reply)
                    parsed_total += 1
        return top_level, False  # no warning needed
    except NoSuchElementException:
        logger.warning("Forum-Elemente nicht gefunden.")
        return [], True  # raise the warning flag
    except Exception as e:
        logger.error(f"Fehler beim Extrahieren der Forenkommentare: {e}", exc_info=True)
        return [], True  # raise the warning flag
    

def extract_forum_comments_alternative(driver, logger, max_comments=70):
    """
    Extract user comments from the current page's HTML with BeautifulSoup and
    nest replies under their parent comment.

    Postings are the ``div.posting`` elements that carry a ``data-postingid``
    attribute; only the first ``max_comments`` of them are processed.

    Args:
        driver: WebDriver whose ``page_source`` is parsed.
        logger: Logger for warnings and per-comment errors.
        max_comments: Maximum number of posting elements to process.

    Returns:
        Tuple ``(comments, warning_flag)``. ``comments`` holds the comments
        whose parent is unknown or absent; every comment dict carries its
        replies in ``replies``. The flag is ``True`` only when no posting
        element was found.
    """

    def _count_from(tag):
        """Return the integer shown in ``tag``; 0 if it is missing or not numeric."""
        if not tag:
            return 0
        # Strip first: counters surrounded by whitespace must not read as 0.
        text = tag.text.strip()
        return int(text) if text.isdecimal() else 0

    comments_data = []
    comment_map = {}
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    postings = soup.find_all('div', class_='posting', attrs={'data-postingid': True})

    if not postings:
        logger.warning("Forum-Elemente nicht gefunden.")
        return comments_data, True  # raise the warning flag

    for posting in postings[:max_comments]:
        try:
            commentID = posting.get('data-postingid')
            if not commentID or not commentID.isdigit():
                continue
            commentID = int(commentID)

            username = posting.get('data-communityname') or 'gelöschtes Profil'

            reply_on_comment = posting.get('data-parentpostingid')
            reply_on_comment = int(reply_on_comment) if reply_on_comment and reply_on_comment.isdigit() else None

            # Date and time of the comment
            datetime_tag = posting.find('span', class_='js-timestamp')
            if datetime_tag and datetime_tag.text:
                datetime_obj = parse_comment_datetime(datetime_tag.text.strip())
            else:
                datetime_obj = None

            # Comment headline
            comment_header_tag = posting.find('h4', class_='upost-title')
            comment_header = comment_header_tag.text.strip() if comment_header_tag else ""

            # Comment text
            comment_body = posting.find('div', class_='upost-text')
            comment_text = comment_body.get_text(separator=' ', strip=True) if comment_body else ""

            # Votes and the author's follower count
            upvotes = _count_from(posting.find('span', class_='js-ratings-positive-count'))
            downvotes = _count_from(posting.find('span', class_='js-ratings-negative-count'))
            user_followers = _count_from(posting.find('span', class_='upost-follower'))

            comment_data = {
                'commentID': commentID,
                'author': username,
                'user_followers': user_followers,
                'datetime': datetime_obj,
                'content': f"{comment_header}\n{comment_text}".strip(),
                'upvotes': upvotes,
                'downvotes': downvotes,
                'reply_on_comment': reply_on_comment,
                'replies': []
            }

            # Register by ID so replies can find their parent below
            comment_map[commentID] = comment_data

        except Exception as e:
            logger.error(f"Fehler beim Verarbeiten eines Kommentars: {e}", exc_info=True)
            continue

    # Build the nested structure: attach each reply to its parent when the
    # parent was parsed, otherwise keep the comment at the top level
    for comment in comment_map.values():
        parent_id = comment['reply_on_comment']
        if parent_id and parent_id in comment_map:
            comment_map[parent_id]['replies'].append(comment)
        else:
            comments_data.append(comment)

    return comments_data, False


def scraping_fail(url, exception_message, logger):
    """
    Mark an article as failed in the database and log why it was skipped.

    Sets ``scraping_info.status`` to ``'failed'`` and
    ``scraping_info.download_datetime`` to the current time on the document
    whose ``scraping_info.url`` equals ``url``.

    Args:
        url: URL of the article that could not be scraped.
        exception_message: Reason for the failure; included in the log
            message. When empty, the generic text "fehlende Daten" is used.
        logger: Logger that receives the warning.
    """
    derStandard_collection.update_one(
        {'scraping_info.url': url},
        {
            '$set': {
                'scraping_info.status': 'failed',
                'scraping_info.download_datetime': datetime.datetime.now()
            }
        }
    )
    # Report the real reason instead of always claiming missing data.
    reason = exception_message or "fehlende Daten"
    logger.warning(f"Artikel übersprungen ({reason}): {url}")



def scrape_articles(logger):
    """
    Scrape every pending article and store the result in MongoDB.

    Pending articles are the documents of ``derStandard_collection`` whose
    ``scraping_info.status`` is the empty string. For each of them the page
    is loaded in the browser, article fields, reactions and forum comments
    are extracted, and the document is updated with the data and a status of
    ``'success'`` or ``'warning'`` (the latter when reactions or comments
    could not be extracted). Articles that time out, lack a title or
    publication date, or raise an error are marked via ``scraping_fail``.

    The browser is always closed at the end. Errors outside the per-article
    handling are logged as critical and not re-raised.

    Args:
        logger: Logger passed on to all helper functions.
    """
    logger.info("Starte den Artikel-Scraping-Prozess.")
    driver = configure_driver(headless=True)

    try:
        # Fetch all documents of the 'derStandard' collection that have not been scraped yet
        urls_to_scrape = list(derStandard_collection.find({'scraping_info.status': ''}, {'scraping_info.url': 1}))
        logger.debug(f"Anzahl der zu scrapenden URLs: {len(urls_to_scrape)}")

        # Build the front-page URL with today's date
        frontpage_url = FRONTPAGE_URL + datetime.date.today().strftime("%Y/%m/%d")
        logger.info(f"Navigiere zur Frontpage: {frontpage_url}")

        # Navigate to the front page. NOTE(review): no popup is clicked here;
        # the click happens inside configure_driver(), which already loaded
        # the same URL - confirm whether this second visit is still needed.
        driver.get(frontpage_url)
        logger.debug("Frontpage geladen. Warte auf Pop-up.")
        time.sleep(5)  # short pause so the page has time to load

        # Process every URL in the list
        for url_dict in urls_to_scrape:
            full_url = url_dict['scraping_info']['url']
            logger.info(f"Verarbeite URL: {full_url}")

            # Skip specific URLs if necessary (hard-coded prefix of one story)
            if full_url.startswith("https://www.derstandard.at/story/3000000240211/w"):
                continue

            # Load the page with a timeout of 10 seconds
            driver.set_page_load_timeout(10)
            try:
                driver.get(full_url)
            except TimeoutException:
                scraping_fail(url=full_url, exception_message='Timeout nach 10 Sekunden', logger=logger)
                continue

            wait = WebDriverWait(driver, 10)
            time.sleep(5)

            try:
                # Wait until a <body> element is present
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                logger.debug(f"Seite {full_url} vollständig geladen.")

                # BeautifulSoup for parsing (elements outside the shadow DOM)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                logger.debug(f"HTML-Inhalt von {full_url} mit BeautifulSoup geparst.")

                # Some pages are structured differently; this element marks them
                old_design = soup.find("div", class_="forum use-unobtrusive-ajax visible")

                # Section / kicker
                kicker_tag = soup.find('h2', class_='article-kicker')
                kicker = kicker_tag.get_text(strip=True) if kicker_tag else None

                # Title
                title_tag = soup.find('h1', class_='article-title')
                title = title_tag.get_text(strip=True) if title_tag else None

                # Subtitle
                subtitle_tag = soup.find('p', class_='article-subtitle')
                subtitle = subtitle_tag.get_text(strip=True) if subtitle_tag else None

                # Article byline (may be nested)
                article_byline = get_article_byline(soup, logger)

                # Extract date and time and convert them to a datetime
                article_datetime = get_article_datetime(soup, logger)

                # Title and publication date are mandatory
                if article_datetime is None or title is None:
                    scraping_fail(url=full_url, exception_message='Fehlendes Datum oder Titel', logger=logger)
                    continue

                # Number of postings
                posting_count = get_posting_count(soup, full_url, logger)

                # Reactions
                reactions, reactions_warning = extract_reactions(driver, logger)

                # Article content
                paragraph_texts = get_paragraph_texts(soup, full_url, logger)

                # Comments: the extractor depends on the page structure found above
                if old_design:
                    forum_comments, comments_warning = extract_forum_comments_alternative(driver, logger)
                else:
                    forum_comments, comments_warning = extract_forum_comments_normal(driver, logger)

                # Determine the status
                if reactions_warning or comments_warning:
                    status = 'warning'
                else:
                    status = 'success'

                # Prepare the data; dotted keys address nested document fields
                article_data = {
                    'article.title': title,
                    'article.subtitle': subtitle,
                    'article.kicker': kicker,
                    'article.text': paragraph_texts,
                    'article.author': article_byline,
                    'article.pubdate': article_datetime,
                    'article.comments': forum_comments,
                    'features.posting_count': posting_count,
                    'features.reactions': reactions,
                    'scraping_info.status': status,
                    'scraping_info.download_datetime': datetime.datetime.now()
                }

                # Write the data to the matching document of the 'derStandard' collection
                derStandard_collection.update_one(
                    {'scraping_info.url': full_url},
                    {'$set': article_data}
                )

                logger.info(f"Erfolgreich gescraped mit Status '{status}': {full_url} am {article_datetime}")

            except TimeoutException:
                scraping_fail(url=full_url, exception_message='Timeout nach 10 Sekunden', logger=logger)
                continue
            except Exception as e:
                # Any other error only fails this article; the loop goes on
                exception_message = str(e)
                scraping_fail(url=full_url, exception_message=exception_message, logger=logger)
                logger.error(f"Fehler beim Verarbeiten von {full_url}: {e}", exc_info=True)
                continue

    except Exception as e:
        logger.critical(f"Unerwarteter Fehler im Scraping-Prozess: {e}", exc_info=True)

    finally:
        # Always close the browser, including after errors
        driver.quit()
        logger.info("Browser erfolgreich geschlossen.")



In [None]:
# main.py

def main():
    """Entry point: create the logger and run the article scraper with it."""
    scrape_articles(setup_logger())


if __name__ == "__main__":
    main()


2024-11-11 22:22:49,198 - __main__ - INFO - Starte den Artikel-Scraping-Prozess.
2024-11-11 22:23:05,467 - __main__ - INFO - Navigiere zur Frontpage: https://www.derstandard.at/2024/11/11
2024-11-11 22:23:10,479 - __main__ - INFO - Verarbeite URL: https://www.derstandard.at/story/3000000240211/wie-gerecht-ist-unser-zugang-zur-gesundheitsversorgung-wirklich
2024-11-11 22:23:10,484 - __main__ - INFO - Verarbeite URL: https://www.derstandard.at/story/2000142048886/unternehmer-reinhold-wuerth-ist-kaeufer-von-beckmann-rekordgemaelde
2024-11-11 22:23:22,780 - __main__ - INFO - Erfolgreich gescraped mit Status 'success': https://www.derstandard.at/story/2000142048886/unternehmer-reinhold-wuerth-ist-kaeufer-von-beckmann-rekordgemaelde am 2022-12-22 17:23:00
2024-11-11 22:23:22,783 - __main__ - INFO - Verarbeite URL: https://www.derstandard.at/story/2000142048685/eugh-entscheidet-gegen-schadenersatz-bei-krankheit-wegen-luftverschmutzung
2024-11-11 22:23:30,356 - __main__ - INFO - Erfolgreich ge

In [None]:
%%deactivate

import logging
import time
import re
from selenium import webdriver as wd
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import (
    TimeoutException,
    NoSuchElementException,
    ElementClickInterceptedException,
    WebDriverException
)
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import datetime
import dateparser
import re
from dateparser import parse as dateparser_parse
from selenium.webdriver.common.by import By
import pandas as pd



# Global settings for a local run: path of the chromedriver binary and the
# base URL that configure_driver() below extends with today's date.
CHROMEDRIVER_PATH = r"chromedriver.exe"
FRONTPAGE_URL = "https://www.derstandard.at/frontpage/"


def configure_driver(headless=True):
    """
    Start a Chrome WebDriver and click away the popup shown on the front page.

    Args:
        headless: When true, Chrome is started with ``--headless``.

    Returns:
        The WebDriver, after the button inside the popup iframe has been
        clicked and the focus switched back to the parent frame.

    Raises:
        Exception: "popup nicht gefunden" when the iframe or its button
            cannot be located.
        Any other error raised during this set-up is re-raised unchanged.
        In every failure case the browser is closed first so no Chrome
        process is left behind.
    """
    chrome_options = wd.ChromeOptions()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36")
    chrome_options.add_argument("--blink-settings=imagesEnabled=false")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_prefs = {
        "profile.default_content_settings.images": 2,  # disable images
        "profile.managed_default_content_settings.images": 2
    }
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_options.page_load_strategy = 'none'  # page is not loaded completely; waiting is handled explicitly below

    service = ChromeService(executable_path=CHROMEDRIVER_PATH)
    driver = wd.Chrome(service=service, options=chrome_options)

    # Click away the popup. Everything after the browser has been started is
    # guarded so that a failure does not leak the browser process.
    try:
        driver.get(FRONTPAGE_URL + datetime.date.today().strftime("%Y/%m/%d"))
        time.sleep(5)
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == 'complete'
        )
        driver.switch_to.frame(driver.find_element(By.XPATH, "/html/body/div/iframe"))
        driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/div[3]/div[1]/button").click()
        driver.switch_to.parent_frame()
    except NoSuchElementException as exc:
        driver.quit()
        raise Exception("popup nicht gefunden") from exc
    except BaseException:
        # Includes KeyboardInterrupt raised while a notebook cell is interrupted.
        driver.quit()
        raise

    return driver



# Configure logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)  # minimum level handled by the logger

# Log format shared by both handlers
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Attach the handlers only once: re-running this cell would otherwise add
# another pair and every message would be written several times.
if not logger.handlers:
    # Console handler: shows INFO and above
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # File handler: stores every level
    file_handler = logging.FileHandler('scraper.log')
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)



def extract_reactions(driver):
    """
    Read the reaction counters from the shadow DOM of the current page.

    Returns a dict mapping each reaction name to its count, or ``None`` when
    the extraction fails (the problem is logged).
    """
    try:
        host = driver.find_element(By.CSS_SELECTOR, "dst-community-reactions")
        root = expand_shadow_element(driver, host)
        tally = {}
        for btn in root.find_elements(By.CSS_SELECTOR, "aside.reactions div.reactions--buttons button"):
            # Count: text of the <strong> child, 0 when missing or not a number
            try:
                amount = int(btn.find_element(By.TAG_NAME, "strong").text.strip())
            except (NoSuchElementException, ValueError):
                amount = 0
            # Name: screen-reader label, otherwise the button text minus the count
            try:
                label = btn.find_element(By.CSS_SELECTOR, "span.sr-only").text.strip()
            except NoSuchElementException:
                label = btn.text.replace(str(amount), '').strip()
            tally[label] = amount
        return tally
    except NoSuchElementException:
        logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Reaktionen konnten nicht extrahiert werden.")
    except Exception as e:
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Extrahieren der Reaktionen: {e}", exc_info=True)
    return None

def parse_posting(posting_element):
    """Parse one dst-posting element into a comment dictionary.

    Every field is read on its own; when the sub-element for a field is
    missing, that field keeps its default and parsing continues.

    Args:
        posting_element: Selenium element of a single posting.

    Returns:
        dict with the keys ``commentID``, ``author``, ``user_followers``,
        ``datetime``, ``content``, ``upvotes``, ``downvotes``,
        ``reply_on_comment`` and ``replies`` (a new empty list), or ``None``
        when an unexpected exception occurred (it is logged).
    """
    def digits_or_none(raw):
        # Attribute value made of digits -> int; anything else -> None.
        return int(raw) if raw and raw.isdigit() else None

    try:
        # Author name; the placeholder stays when the user button is absent.
        author = "Unbekannter Benutzer"
        try:
            user_button = posting_element.find_element(By.CSS_SELECTOR, "dst-posting--user button")
            name_spans = user_button.find_elements(By.CSS_SELECTOR, "span > span")
            if name_spans:
                author = name_spans[0].text.strip()
        except NoSuchElementException:
            pass

        # Follower count: first run of digits in the title attribute.
        user_followers = 0
        try:
            followers_text = posting_element.find_element(
                By.CSS_SELECTOR, "dst-posting--user button div[title]"
            ).get_attribute("title")
            first_number = re.search(r'\d+', followers_text)
            if first_number:
                user_followers = int(first_number.group())
        except NoSuchElementException:
            pass
        except ValueError:
            logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Follower-Zahl: '{followers_text}'.")

        # Timestamp from the data-date attribute of the <time> element.
        datetime_obj = None
        try:
            raw_date = posting_element.find_element(By.CSS_SELECTOR, "time[data-date]").get_attribute("data-date")
            datetime_obj = dateparser.parse(raw_date, languages=['de'])
        except NoSuchElementException:
            pass

        # Body text: all <h1> lines followed by all <p> lines.
        content = ""
        try:
            body = posting_element.find_element(By.CSS_SELECTOR, "div.posting--content")
            headings = body.find_elements(By.TAG_NAME, "h1")
            paragraphs = body.find_elements(By.TAG_NAME, "p")
            heading_text = "\n".join(h.text for h in headings) if headings else ""
            paragraph_text = "\n".join(p.text for p in paragraphs) if paragraphs else ""
            content = f"{heading_text}\n{paragraph_text}".strip()
        except NoSuchElementException:
            pass

        # Ratings are attributes of the rating-log element; non-numeric -> 0.
        upvotes = 0
        downvotes = 0
        try:
            rating_log = posting_element.find_element(By.CSS_SELECTOR, "dst-posting--ratinglog")
            raw_up = rating_log.get_attribute("positiveratings")
            raw_down = rating_log.get_attribute("negativeratings")
            if raw_up and raw_up.isdigit():
                upvotes = int(raw_up)
            if raw_down and raw_down.isdigit():
                downvotes = int(raw_down)
        except NoSuchElementException:
            pass
        except ValueError:
            logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Upvote/Downvote-Zahlen gefunden.")

        # Parent id (set when the posting is a reply) and the posting's own id.
        reply_on_comment = digits_or_none(posting_element.get_attribute("data-parentpostingid"))
        commentID = digits_or_none(posting_element.get_attribute("data-postingid"))

        return {
            'commentID': commentID,
            'author': author,
            'user_followers': user_followers,
            'datetime': datetime_obj,
            'content': content,
            'upvotes': upvotes,
            'downvotes': downvotes,
            'reply_on_comment': reply_on_comment,
            'replies': []
        }

    except Exception as e:
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Parsen eines Postings: {e}", exc_info=True)
        return None
    
def extract_forum_comments_normal(driver, max_comments=50):
    """Extract forum comments from the ``dst-forum`` shadow DOM as a nested list.

    The direct children of ``main.forum--main`` are walked in order. A
    ``dst-posting`` child becomes a top-level comment; a following
    ``section`` whose class contains "thread" supplies the replies to the most
    recent top-level posting.

    Args:
        driver: Selenium driver positioned on the article page.
        max_comments: upper bound on parsed postings, counting top-level
            comments and replies together.

    Returns:
        list of comment dicts as produced by ``parse_posting``; replies are
        stored in the parent's ``replies`` list. Whatever was collected so far
        is returned when the forum elements are missing or an unexpected error
        occurs (both are logged).
    """
    comments = []
    count = 0
    try:
        forum_host = driver.find_element(By.CSS_SELECTOR, "dst-forum")
        forum_shadow = expand_shadow_element(driver, forum_host)
        main_content = forum_shadow.find_element(By.CSS_SELECTOR, "main.forum--main")
        current_parent = None

        for child in main_content.find_elements(By.CSS_SELECTOR, ":scope > *"):
            if count >= max_comments:
                break
            tag_name = child.tag_name.lower()
            if tag_name == "dst-posting":
                comment = parse_posting(child)
                # Move the parent pointer even when parsing failed (None).
                # Otherwise the thread that follows the unparseable posting
                # would be attached to the previous, unrelated comment.
                current_parent = comment
                if comment:
                    comments.append(comment)
                    count += 1
            elif tag_name == "section":
                classes = child.get_attribute("class")
                if not (classes and "thread" in classes):
                    continue
                if not current_parent:
                    logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Thread-Sektion gefunden, aber kein aktueller Parent.")
                    continue
                for reply in child.find_elements(By.CSS_SELECTOR, "dst-posting"):
                    if count >= max_comments:
                        break
                    reply_comment = parse_posting(reply)
                    if reply_comment:
                        current_parent['replies'].append(reply_comment)
                        count += 1
        return comments
    except NoSuchElementException:
        logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Forum-Elemente nicht gefunden.")
        return comments
    except Exception as e:
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Extrahieren der Forenkommentare: {e}", exc_info=True)
        return comments



def extract_forum_comments_alternative(driver, max_comments=50):
    """Collect postings from the page source as a flat list (no nesting).

    NOTE(review): a second function with this same name is defined further
    down in this file; once both definitions have run, the name refers to
    that later one.

    Args:
        driver: object exposing ``page_source`` (the HTML to parse).
        max_comments: maximum number of postings taken from the page.

    Returns:
        list of dicts with the keys ``commentID``, ``author``,
        ``user_followers``, ``datetime``, ``content``, ``upvotes``,
        ``downvotes`` and ``replies``. In this variant ``replies`` holds the
        parent posting id (or ``None``), not a list. A posting that raises
        while being processed is logged and left out.
    """
    comments_data = []

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    postings = soup.find_all('div', class_='posting', attrs={'data-postingid': True})

    for posting in postings[:max_comments]:  # at most max_comments postings
        try:
            commentID = posting.get('data-postingid')
            username = posting.get('data-communityname') or 'gelöschtes Profil'
            reply_on_comment = posting.get('data-parentpostingid')
            reply_on_comment = int(reply_on_comment) if reply_on_comment else None

            # Comment timestamp
            datetime_tag = posting.find('span', class_='js-timestamp')
            if datetime_tag and datetime_tag.text:
                datetime_str = datetime_tag.text.strip()
                datetime_obj = dateparser.parse(datetime_str, languages=['de'])
            else:
                datetime_obj = None

            # Comment headline (optional)
            comment_header_tag = posting.find('h4', class_='upost-title')
            comment_header = comment_header_tag.text.strip() if comment_header_tag else None

            # Comment body (optional)
            comment_body = posting.find('div', class_='upost-text')
            comment_text = comment_body.get_text(separator=' ', strip=True) if comment_body else None

            # Upvotes
            upvotes_tag = posting.find('span', class_='js-ratings-positive-count')
            upvotes = int(upvotes_tag.text.strip()) if upvotes_tag and upvotes_tag.text else 0

            # Downvotes
            downvotes_tag = posting.find('span', class_='js-ratings-negative-count')
            downvotes = int(downvotes_tag.text.strip()) if downvotes_tag and downvotes_tag.text else 0

            # Follower count of the author
            user_followers_tag = posting.find('span', class_='upost-follower')
            user_followers = int(user_followers_tag.text.strip()) if user_followers_tag and user_followers_tag.text else 0

            # Headline or body may be missing (None). Treat a missing part as
            # empty text; ``None + str`` would raise TypeError and the whole
            # posting would be dropped.
            content = (comment_header or "") + (comment_text or "")

            comments_data.append({
                'commentID': int(commentID),
                'author': username,
                'user_followers': user_followers,
                'datetime': datetime_obj,
                'content': content,
                'upvotes': upvotes,
                'downvotes': downvotes,
                'replies': reply_on_comment
            })

        except Exception as e:
            logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Verarbeiten eines Kommentars in Artikel: {e}", exc_info=True)
            continue

    return comments_data



def parse_comment_datetime(datetime_str):
    """Parse a comment timestamp string with dateparser, limited to German.

    Returns whatever ``dateparser.parse`` returns for ``datetime_str``.
    """
    german_only = ['de']
    return dateparser.parse(datetime_str, languages=german_only)

def _count_from_tag(tag):
    """Return the integer shown in ``tag``; 0 when the tag is missing or not numeric."""
    if not tag:
        return 0
    # Strip BEFORE the digit check: the rendered text may carry surrounding
    # whitespace, which would otherwise turn a valid count into 0.
    text = tag.text.strip()
    return int(text) if text.isdigit() else 0


def extract_forum_comments_alternative(driver, max_comments=50):
    """Extract forum comments from the page source and nest replies.

    The page source is parsed with BeautifulSoup; every
    ``div.posting[data-postingid]`` among the first ``max_comments`` matches
    becomes a comment dict. Afterwards each comment whose parent id is among
    the parsed comments is appended to that parent's ``replies`` list; all
    others are returned as top-level comments.

    Args:
        driver: object exposing ``page_source`` (the HTML to parse).
        max_comments: maximum number of postings taken from the page.

    Returns:
        list of top-level comment dicts with the keys ``commentID``,
        ``author``, ``user_followers``, ``datetime``, ``content``,
        ``upvotes``, ``downvotes``, ``reply_on_comment`` and ``replies``.
        Postings without a numeric id are skipped; a posting that raises
        while being processed is logged and skipped.
    """
    comments_data = []
    comment_map = {}
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    postings = soup.find_all('div', class_='posting', attrs={'data-postingid': True})

    for posting in postings[:max_comments]:
        try:
            commentID = posting.get('data-postingid')
            if not commentID or not commentID.isdigit():
                continue
            commentID = int(commentID)

            username = posting.get('data-communityname') or 'gelöschtes Profil'

            reply_on_comment = posting.get('data-parentpostingid')
            reply_on_comment = int(reply_on_comment) if reply_on_comment and reply_on_comment.isdigit() else None

            # Comment timestamp
            datetime_tag = posting.find('span', class_='js-timestamp')
            if datetime_tag and datetime_tag.text:
                datetime_obj = parse_comment_datetime(datetime_tag.text.strip())
            else:
                datetime_obj = None

            # Comment headline (optional)
            comment_header_tag = posting.find('h4', class_='upost-title')
            comment_header = comment_header_tag.text.strip() if comment_header_tag else ""

            # Comment body (optional)
            comment_body = posting.find('div', class_='upost-text')
            comment_text = comment_body.get_text(separator=' ', strip=True) if comment_body else ""

            # Rating and follower counters
            upvotes = _count_from_tag(posting.find('span', class_='js-ratings-positive-count'))
            downvotes = _count_from_tag(posting.find('span', class_='js-ratings-negative-count'))
            user_followers = _count_from_tag(posting.find('span', class_='upost-follower'))

            # Index by id so replies can be attached in the second pass.
            comment_map[commentID] = {
                'commentID': commentID,
                'author': username,
                'user_followers': user_followers,
                'datetime': datetime_obj,
                'content': f"{comment_header}\n{comment_text}".strip(),
                'upvotes': upvotes,
                'downvotes': downvotes,
                'reply_on_comment': reply_on_comment,
                'replies': []
            }

        except Exception as e:
            logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Verarbeiten eines Kommentars: {e}", exc_info=True)
            continue

    # Second pass: build the nested structure.
    for comment in comment_map.values():
        parent_id = comment['reply_on_comment']
        if parent_id and parent_id in comment_map:
            comment_map[parent_id]['replies'].append(comment)
        else:
            comments_data.append(comment)

    return comments_data








def get_article_byline(soup):
    """Collect the byline information of an article.

    Args:
        soup: parsed article page.

    Returns:
        dict that may contain ``storylabels`` and ``article_origins`` (each
        only when the matching element exists), or ``None`` when the page has
        no ``div.article-byline`` (logged at debug level).
    """
    byline_tag = soup.find('div', class_='article-byline')
    if not byline_tag:
        logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Keine Artikel-Byline gefunden.")
        return None

    byline = {}

    # Story labels
    labels_tag = byline_tag.find('div', class_='storylabels')
    if labels_tag:
        byline['storylabels'] = labels_tag.get_text(strip=True)

    # Article origins; the plain author span is only consulted as a fallback.
    origins_tag = byline_tag.find('div', class_='article-origins')
    if origins_tag:
        byline['article_origins'] = origins_tag.get_text(strip=True)
        return byline

    simple_author = byline_tag.find('span', class_='simple')
    if simple_author:
        byline['article_origins'] = simple_author.get_text(strip=True)
    return byline





def get_article_datetime(soup):
    """Determine the publication timestamp of an article.

    The ``datetime`` attribute of ``time.article-pubdate`` is preferred (the
    element text is used when the attribute is absent) and parsed as ISO 8601.
    When that fails, the element's visible text is handed to dateparser
    (German).

    Returns:
        The parsed value, or ``None`` when the page has no such element
        (logged at debug level). The dateparser fallback's result is returned
        unchanged.
    """
    time_tag = soup.find('time', class_='article-pubdate')
    if not time_tag:
        logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Kein Datum gefunden.")
        return None

    if time_tag.has_attr('datetime'):
        # Drop surrounding whitespace and any embedded line breaks.
        candidate = time_tag['datetime'].strip().replace('\n', '').strip()
    else:
        candidate = time_tag.get_text(strip=True)

    try:
        return datetime.datetime.fromisoformat(candidate)
    except ValueError:
        visible_text = time_tag.get_text(strip=True)
        return dateparser.parse(visible_text, languages=['de'])



def get_posting_count(soup, full_url):
    """Read the number of forum postings of an article.

    Two sources are tried in order:
      1. the first child of ``span.js-forum-postingcount``;
      2. the community headline, matched against ``Forum: <n> Postings``.

    Args:
        soup: parsed article page.
        full_url: article URL, used only in the warning message.

    Returns:
        The posting count as ``int``, or ``None`` when neither source yields
        a number (a warning is logged in that case).
    """
    # Source 1: dedicated counter element.
    try:
        counter_tag = soup.find('span', class_='js-forum-postingcount')
        if counter_tag:
            return int(counter_tag.contents[0].strip())
    except (AttributeError, IndexError, TypeError, ValueError):
        # IndexError: the counter element is empty.
        # TypeError/AttributeError: its first child is not a plain string.
        # ValueError: the text is not a number.
        # In every case fall through to the headline instead of failing.
        pass

    # Source 2: community section headline.
    try:
        community_section = soup.find('section', id='story-community')
        header_div = community_section.find('div', class_='story-community-header')
        h1_text = header_div.find('h1').get_text(strip=True)
        match = re.search(r'Forum:\s*(\d+)\s*Postings', h1_text)
        # A missing element or a non-matching headline surfaces as
        # AttributeError on None and is handled below.
        return int(match.group(1))
    except (AttributeError, TypeError, ValueError):
        logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Ungültige Posting-Anzahl in {full_url}")
    return None

                    

def get_paragraph_texts(soup,full_url):
    """Extract the article text as a list of paragraph strings.

    Works on ``div.article-body`` and modifies it in place: ``href``
    attributes are deleted from links, and ad elements, figures and
    supplemental sections are removed before the ``<p>`` texts are collected.

    Args:
        soup: parsed article page (its article body is mutated).
        full_url: article URL, used only in log messages.

    Returns:
        list of stripped paragraph texts, or ``None`` when there is no article
        body or an error occurred (the error is logged).
    """
    paragraph_texts = None
    try:
        article_body = soup.find('div', class_='article-body')
        if not article_body:
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Kein Artikelinhalt gefunden in {full_url}")
        else:
            # Strip the link targets.
            for link in article_body.find_all('a'):
                del link['href']
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Alle 'href'-Attribute aus Artikeltext entfernt.")

            # Remove unwanted elements, one group after the other.
            removal_queries = (
                (['ad-container', 'ad-slot', 'ad'], {}),
                ('figure', {}),
                (['aside', 'nav', 'div'], {'attrs': {'data-section-type': 'supplemental'}}),
            )
            for names, extra in removal_queries:
                for node in article_body.find_all(names, **extra):
                    node.decompose()
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Unerwünschte Elemente aus Artikeltext entfernt.")

            # Collect the paragraph texts.
            paragraph_texts = [p.get_text(strip=True) for p in article_body.find_all('p')]
            logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} Extrahierte Paragraphen: {len(paragraph_texts)} in {full_url}")
    except Exception as e:
        logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Fehler beim Extrahieren des Artikelinhalts: {e}", exc_info=True)
    return paragraph_texts



def scraping_fail(url,exception_message):
    """Record a skipped article and mark its URL as processed.

    Inserts a document with ``url``, ``date`` and ``exception`` into
    ``failed_collection`` and sets ``download_date`` on the matching document
    in ``urls_collection``; both use the same timestamp.

    Args:
        url: URL of the article that could not be scraped.
        exception_message: reason stored with the failure record.
    """
    failed_at = datetime.datetime.now()

    failure_record = {
        'url': url,
        'date': failed_at,
        'exception': exception_message
    }
    failed_collection.insert_one(failure_record)
    logger.warning(f"{inspect.currentframe().f_back.f_code.co_name} Artikel übersprungen (fehlende Daten): {url}")

    # Stamp the URL so the next query for download_date == None skips it.
    url_filter = {'URL': url}
    stamp_download = {'$set': {'download_date': failed_at}}
    urls_collection.update_one(url_filter, stamp_download)
    logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} 'download_date' für {url} aktualisiert.")



def expand_shadow_element(driver, element):
    """Return the shadow root of ``element``.

    The value is whatever the driver's ``execute_script`` yields for the
    element's ``shadowRoot`` property.
    """
    return driver.execute_script('return arguments[0].shadowRoot', element)

def scrape_articles():
    """Scrape every not-yet-downloaded article URL and store it in MongoDB.

    Steps:
      1. Create a browser with ``configure_driver`` and open today's front
         page once, followed by a fixed wait.
      2. For each document in ``urls_collection`` whose ``download_date`` is
         ``None``: load the page, extract metadata, reactions, article text
         and forum comments, and upsert the result into
         ``articles_collection`` (existing articles are left untouched).
      3. Set ``download_date`` on the URL document after every processed URL,
         successful or not. Failures are also recorded in
         ``failed_collection``.

    A timeout (page load or wait for ``<body>``) and any error raised after
    the page was requested only fail the current URL. An error outside the
    per-URL handling, including a non-timeout navigation error, is logged as
    critical and ends the run. The browser is closed in all cases.
    """
    logger.info("Starte den Artikel-Scraping-Prozess.")
    driver = configure_driver()
    wait = WebDriverWait(driver, 15)

    try:
        # All URLs from the 'urls' collection that have not been scraped yet.
        urls_to_scrape = list(urls_collection.find({'download_date': None}, {'URL': 1}))
        logger.debug(f"Anzahl der zu scrapenden URLs: {len(urls_to_scrape)}")

        # Front page URL for today's date.
        frontpage_url = "https://www.derstandard.at/" + datetime.date.today().strftime("%Y/%m/%d")
        logger.info(f"Navigiere zur Frontpage: {frontpage_url}")

        # Visit the front page first (the author's stated intent is to get the
        # pop-up out of the way); only a fixed wait happens here.
        driver.get(frontpage_url)
        logger.debug("Frontpage geladen. Warte auf Pop-up.")
        time.sleep(5)

        for url_dict in urls_to_scrape:
            full_url = url_dict['URL']
            logger.info(f"Verarbeite URL: {full_url}")

            # Page loads are limited to 10 seconds.
            driver.set_page_load_timeout(10)

            # True once driver.get() returned; used to tell navigation
            # failures apart from extraction failures in the generic handler.
            page_requested = False

            try:
                # The navigation lives inside the try block so that a
                # page-load timeout is handled per URL below instead of
                # aborting the whole run through the outer handler.
                driver.get(full_url)
                page_requested = True
                time.sleep(3)

                # Wait until the page body is present.
                wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                logger.debug(f"Seite {full_url} vollständig geladen.")

                # BeautifulSoup for everything outside the shadow DOM.
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                logger.debug(f"HTML-Inhalt von {full_url} mit BeautifulSoup geparst.")

                # Some pages use a different forum markup.
                old_design = soup.find("div", class_="forum use-unobtrusive-ajax visible")

                # Article class (e.g. 'story-article', 'video-article').
                article_element = soup.find('article')
                article_classes = article_element.get('class', []) if article_element else []
                article_class = ' '.join(article_classes) if article_classes else None

                # Section / kicker
                kicker_tag = soup.find('h2', class_='article-kicker')
                kicker = kicker_tag.get_text(strip=True) if kicker_tag else None

                # Title
                title_tag = soup.find('h1', class_='article-title')
                title = title_tag.get_text(strip=True) if title_tag else None

                # Subtitle
                subtitle_tag = soup.find('p', class_='article-subtitle')
                subtitle = subtitle_tag.get_text(strip=True) if subtitle_tag else None

                # Byline (may be nested)
                article_byline = get_article_byline(soup)

                # Publication timestamp
                article_datetime = get_article_datetime(soup)

                # Without a date or a title the article is recorded as failed.
                if article_datetime is None or title is None:
                    scraping_fail(url=full_url, exception_message='Fehlendes Datum oder Titel')
                    continue

                # Number of postings
                posting_count = get_posting_count(soup, full_url)

                # Reactions
                reactions = extract_reactions(driver)

                # Article text
                paragraph_texts = get_paragraph_texts(soup, full_url)

                # Comments, using the extractor that matches the page layout.
                if old_design:
                    forum_comments = extract_forum_comments_alternative(driver)
                else:
                    forum_comments = extract_forum_comments_normal(driver)

                article_data = {
                    'url': full_url,
                    'article_class': article_class,
                    'kicker': kicker,
                    'title': title,
                    'subtitle': subtitle,
                    'article_byline': article_byline,
                    'datetime': article_datetime,
                    'posting_count': posting_count,
                    'reactions': reactions,
                    'article_text': paragraph_texts,
                    'forum_comments': forum_comments
                }

                # Insert only when no document with this URL exists yet.
                result = articles_collection.update_one(
                    {'url': full_url},
                    {'$setOnInsert': article_data},
                    upsert=True
                )

                if result.upserted_id is None:
                    logger.info(f"Artikel bereits vorhanden (Duplikat): {full_url}")
                else:
                    logger.info(f"Erfolgreich gescraped: {full_url} am {article_datetime}")

                # Mark the URL as downloaded.
                current_date = datetime.datetime.now()
                urls_collection.update_one(
                    {'URL': full_url},
                    {'$set': {'download_date': current_date}}
                )
                logger.debug(f"'download_date' für {full_url} aktualisiert.")

            except TimeoutException:
                current_date = datetime.datetime.now()
                exception_message = 'Timeout nach 10 Sekunden'
                failed_collection.insert_one({
                    'url': full_url,
                    'date': current_date,
                    'exception': exception_message
                })
                logger.error(f"{inspect.currentframe().f_back.f_code.co_name} Timeout beim Verarbeiten von {full_url}")

                urls_collection.update_one(
                    {'URL': full_url},
                    {'$set': {'download_date': current_date}}
                )
                logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} 'download_date' für {full_url} nach Timeout aktualisiert.")
                continue

            except Exception as e:
                if not page_requested:
                    # A non-timeout navigation failure is not specific to this
                    # URL; let the outer handler end the run rather than
                    # marking every remaining URL as failed.
                    raise
                current_date = datetime.datetime.now()
                exception_message = str(e)
                failed_collection.insert_one({
                    'url': full_url,
                    'date': current_date,
                    'exception': exception_message
                })
                logger.error(f"Fehler beim Verarbeiten von {full_url}: {e}", exc_info=True)

                urls_collection.update_one(
                    {'URL': full_url},
                    {'$set': {'download_date': current_date}}
                )
                logger.debug(f"{inspect.currentframe().f_back.f_code.co_name} 'download_date' für {full_url} nach Fehler aktualisiert.")
                continue

    except Exception as e:
        logger.critical(f"{inspect.currentframe().f_back.f_code.co_name} Unerwarteter Fehler im Scraping-Prozess: {e}", exc_info=True)

    finally:
        driver.quit()
        logger.info(f"{inspect.currentframe().f_back.f_code.co_name} Browser erfolgreich geschlossen.")

# Entry point: start the scraping run when this code is executed as the main module.
if __name__ == "__main__":
    scrape_articles()


⚠️ **Diese Zelle ist deaktiviert und wurde nicht ausgeführt.**