In [None]:
%%capture
%pip install pymongo playwright

In [None]:
%%cmd
playwright install

In [None]:
from pymongo import MongoClient
import os

# Read the MongoDB credentials from the environment so they never appear in
# the notebook itself.
username = os.getenv("MONGODB_USER")
password = os.getenv("MONGODB_PWD")
if not username or not password:
    # Fail here with a clear message instead of later with an opaque
    # authentication error for the literal user "None".
    raise RuntimeError("MONGODB_USER and MONGODB_PWD must be set in the environment")

# Set up the client. The credentials are passed as keyword arguments rather
# than interpolated into the URI, so characters such as '@', ':', '/' or '%'
# in the password cannot corrupt the connection string.
client = MongoClient("BlackWidow", 27017, username=username, password=password)

# Load the database and its collections.
db = client['newspapers']
articles_collection = db['articles']
failed_collection = db['failed']
urls_collection = db['urls']

In [None]:
import datetime
from bs4 import BeautifulSoup
import requests
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from pymongo import UpdateOne 
import re

# Root URL of the derStandard website; the sitemap index URL is built from it.
baseurl = "https://www.derstandard.at"

def scrape_sitemap(scrape_from=datetime.datetime(2020, 1, 1),
                   scrape_to=datetime.datetime(2023,3,1),
                   timeout=30):
    """Collect article URLs from the monthly sitemaps and upsert them into MongoDB.

    Downloads the sitemap index at ``{baseurl}/sitemaps/sitemap.xml``, picks the
    monthly sitemaps (``sitemap-YYYY-MM.xml``) that fall inside the requested
    range and upserts every ``<loc>`` entry of each selected sitemap into
    ``urls_collection``. Documents are written with ``$setOnInsert``, so a URL
    that is already stored is left untouched.

    Selection is month-granular: a sitemap is taken as a whole when its month
    lies between the month containing ``scrape_from`` and ``scrape_to``; the
    individual article URLs are not filtered by date.

    Args:
        scrape_from: Start of the range. Only its year and month are used, so a
            mid-month value still selects the sitemap of that month.
        scrape_to: End of the range; sitemaps whose first day is later than
            this value are skipped.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        Exception: If an entry of the sitemap index does not match the
            ``sitemap-YYYY-MM.xml`` naming scheme.
        requests.HTTPError: If the index or a monthly sitemap is answered with
            an HTTP error status.
    """
    # A monthly sitemap is dated to the first of its month, so the lower bound
    # must be truncated the same way; otherwise a mid-month `scrape_from`
    # would drop the very month it points into.
    first_month = datetime.datetime(scrape_from.year, scrape_from.month, 1)

    # Main sitemap (index of the monthly sitemaps)
    sitemap_index_url = f"{baseurl}/sitemaps/sitemap.xml"
    response = requests.get(sitemap_index_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as an empty index.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'xml')

    # Extract all sitemap URLs listed in the index
    sitemap_urls = [sitemap_loc.text for sitemap_loc in soup.find_all('loc')]

    # Regex that pulls year and month out of a sitemap URL
    sitemap_pattern = re.compile(r'sitemap-(\d{4})-(\d{2})\.xml')

    for sitemap_url in sitemap_urls:
        # Extract year and month from the sitemap URL
        match = sitemap_pattern.search(sitemap_url)
        if match:
            year = int(match.group(1))
            month = int(match.group(2))
            sitemap_date = datetime.datetime(year, month, 1)

            # Skip sitemaps whose month lies outside the requested range
            if (sitemap_date < first_month) or (sitemap_date > scrape_to):
                print(f"Überspringe Sitemap {sitemap_url} (Datum: {sitemap_date.strftime('%Y-%m')})")
                continue
        else:
            raise Exception("Sitemap Error")

        sitemap_response = requests.get(sitemap_url, timeout=timeout)
        # A failed download must not be mistaken for a month without articles.
        sitemap_response.raise_for_status()
        sitemap_soup = BeautifulSoup(sitemap_response.content, 'xml')

        # All article URLs of this month
        article_urls = [url_loc.text for url_loc in sitemap_soup.find_all('loc')]

        # Pending bulk operations for this sitemap
        operations = []

        # Queue every URL for insertion into the MongoDB 'urls' collection
        for article_url in article_urls:

            # Document stored when the URL is not yet known
            document = {
                'URL': article_url,
                'download_date': None,
                'publisher': 'derStandard'
            }

            # Upsert keyed on the URL; $setOnInsert leaves existing documents as they are
            operations.append(UpdateOne(
                {'URL': article_url},
                {'$setOnInsert': document},
                upsert=True  # avoids duplicates
            ))

        if operations:
            # Send all upserts of this sitemap in a single round trip
            result = urls_collection.bulk_write(operations)
            print(f"URLs aus {sitemap_url} erfolgreich hinzugefügt: {result.upserted_count} neue URLs.")
        else:
            print(f"Keine neuen URLs in {sitemap_url} gefunden.")

In [None]:
# Run the sitemap crawl with the function's default date range.
scrape_sitemap()