---
# Tagesschau API


In [2]:
import requests


In [1]:
# Endpoints
url = "https://www.tagesschau.de/api2"

endpoints = {
    "homepage": "/homepage",
    "news": "/news",
    "newsfeed": "/newsfeed-101~_date-{date}.json",  # yymmdd format
}


# "enums"
ressorts_and_topics = [
    "inland",
    "ausland",
    "wirtschaft",
    "sport",
    "video",
    "investigativ",
    "faktenfinder",
]

regions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

region_mapping = {
    1: "Baden-Württemberg",
    2: "Bayern",
    3: "Berlin",
    4: "Brandenburg",
    5: "Bremen",
    6: "Hamburg",
    7: "Hessen",
    8: "Mecklenburg-Vorpommern",
    9: "Niedersachsen",
    10: "Nordrhein-Westfalen",
    11: "Rheinland-Pfalz",
    12: "Saarland",
    13: "Sachsen",
    14: "Sachsen-Anhalt",
    15: "Schleswig-Holstein",
    16: "Thüringen",
}

types = ["story", "webview", "video"]

# Blacklists
blacklist_url = ["liveblog"]
blacklist_type = ["video", "webview"]


---
## Tagesschau API Client


In [7]:
def create_session():
    s = requests.Session()
    s.headers.update({
        "Content-Type": "application/json"
    })

    return s


def main():
    sess = create_session()
    resp = sess.get(url + endpoints["news"])
    print(resp.headers)


In [12]:
main()


{'Content-Type': 'application/json;charset=UTF-8', 'Vary': 'Accept-Encoding', 'Content-Encoding': 'gzip', 'Cache-Control': 'max-age=181', 'Date': 'Tue, 10 Jan 2023 11:13:03 GMT', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive, Transfer-Encoding', 'Access-Control-Allow-Origin': '*'}


---
# Endpoint:  newsfeed


In [3]:
response_newsfeed = requests.get(f"{url}{endpoints['newsfeed'].format(date='221106')}")

if response_newsfeed.status_code == 200:
    parsed_response_newsfeed = response_newsfeed.json()
else:
    print(response_newsfeed.status_code)
    print(response_newsfeed.text)


In [4]:
parsed_response_newsfeed.keys()

dict_keys(['news', 'regional', 'newStoriesCountLink', 'type', 'nextPage'])

In [22]:
len(parsed_response_newsfeed["news"])


160

In [6]:
list(parsed_response_newsfeed["news"][0].keys())


['sophoraId',
 'externalId',
 'title',
 'teaserImage',
 'date',
 'tracking',
 'tags',
 'updateCheckUrl',
 'regionId',
 'details',
 'detailsweb',
 'shareURL',
 'topline',
 'firstSentence',
 'geotags',
 'ressort',
 'type']

In [7]:
all_details_links_from_newsfeed = [news["details"] for news in parsed_response_newsfeed["news"] if news.get("details")]


---
# Endpoint: news


In [13]:
response_news = requests.get(f"{url}{endpoints['news']}")

if response_news.status_code == 200:
    parsed_response_news = response_news.json()
else:
    print(response_news.status_code)
    print(response_news.text)


In [60]:
bar = []
foo = None

if foo is None:
    bar.append(None)

print(bar)


[None]


In [18]:
parsed_response_news[.keys()]


''

In [79]:
parsed_news = parse_news_all(parsed_response_news)


In [82]:
parsed_news = list(parsed_news)


In [167]:
parsed_news[0]


News(title='So sollen Ausländer künftig den deutschen Pass schneller erhalten ', sub_title='Einbürgerung schon nach fünf Jahren statt nach acht - so sieht es ein Gesetzentwurf des Bundesinnenministeriums vor.', tags=['brandenburg', 'berlin'], text=None, url='https://www.tagesschau.de/ardimport/regional/brandenburg/rbb-story-115031.html')

In [166]:
def is_keywords_in_news(
    news: News,
    keywords: list[str],
    filters: dict[Literal["title", "sub_title", "tags", "text"], bool] = {
        "title": True,
        "sub_title": True,
        "tags": True,
        "text": False,
    },
) -> bool:
    """Takes instance of News obje"""

    def unify_filter_object(
        filter_object: Union[list, str, None]
    ) -> Union[list[str], None]:
        """Transforms input into list of lowercase strings."""

        if type(filter_object) is None:
            unified_filter_object = None
        elif type(filter_object) == str:
            # TODO:  Implement as generator
            # https://stackoverflow.com/questions/3862010/is-there-a-generator-version-of-string-split-in-python
            unified_filter_object = filter_object.lower().split()
        elif type(filter_object) == list:
            # TODO:  Use generator instead
            # unified_filter_object = (i.lower() for i in filter_object)
            unified_filter_object = [i.lower() for i in filter_object]
        else:
            raise TypeError

        return unified_filter_object

    def is_keywords_in_filter_object(
        keywords: list[str],
        filter_object: Union[list[str]],
    ) -> bool:
        # Alternative version that returns keyword matches
        # [word for word in filter_object if any(keyword in word for keyword in keywords)] or False
        return True in (
            True
            for word in filter_object
            if any(keyword in word for keyword in keywords)
        )

    news = news.dict()
    filters = [k for k, v in filters.items() if v]

    filter_objects = (unify_filter_object(news[key]) for key in filters)

    # TODO:  Implement as map or as filter function
    return True in (
        is_keywords_in_filter_object(keywords, filter_object)
        for filter_object in filter_objects
    )


# Unittests
keywords = [
    "pass",
    # "einbürgerung",
    "rheinmetall",
]


filters = {
    "title": True,
    "sub_title": True,
    "tags": True,
    "text": False,
}


News(
    title="So sollen Ausländer künftig den deutschen Pass schneller erhalten ",
    sub_title="Einbürgerung schon nach fünf Jahren statt nach acht - so sieht es ein Gesetzentwurf des Bundesinnenministeriums vor.",
    tags=["brandenburg", "berlin"],
    text=None,
    url="https://www.tagesschau.de/ardimport/regional/brandenburg/rbb-story-115031.html",
)


is_keywords_in_news(parsed_news[0], keywords, filters)


True

In [174]:
keywords = [
    "pass",
    "einbürgerung",
    "rheinmetall",
    "ukraine",
]


filters = {
    "title": True,
    "sub_title": True,
    "tags": True,
    "text": False,
}

parsed_news = [i for i in parsed_news if i]

relevant_news = [news for news in parsed_news if is_keywords_in_news(news, keywords, filters)]


In [175]:
relevant_news


[News(title='So sollen Ausländer künftig den deutschen Pass schneller erhalten ', sub_title='Einbürgerung schon nach fünf Jahren statt nach acht - so sieht es ein Gesetzentwurf des Bundesinnenministeriums vor.', tags=['brandenburg', 'berlin'], text=None, url='https://www.tagesschau.de/ardimport/regional/brandenburg/rbb-story-115031.html'),
 News(title='++ Russland meldet Eroberung von Soledar ++', sub_title='Alle aktuellen Entwicklungen zum Krieg gegen die Ukraine in unserem Liveblog.', tags=['liveblog', 'ukraine'], text=None, url='https://www.tagesschau.de/newsticker/liveblog-ukraine-mittwoch-205.html'),
 News(title='Hilfe für die Menschen in der Ukraine', sub_title='Wenn Sie für die Menschen in der Ukraine und Geflüchtete aus der Ukraine spenden wollen, finden Sie hier Hilfsorganisationen und Bankverbindungen.', tags=['solidarität', 'ukraine', 'hilfe'], text=None, url='https://www.tagesschau.de/spendenkonten/spendenkonten-133.html'),
 News(title='+++ Update: Baggerfahrer wohl schwer 

---
# Get individual news article from news details


In [11]:
whitelist_type = ["headline", "text"]


In [12]:
response_news_article = requests.get(parsed_response_news["news"][0]["details"])

if response_news_article.status_code == 200:
    parsed_response_news_article = response_news_article.json()
else:
    print(response_news_article.status_code)
    print(response_news_article.text)


In [13]:
list(parsed_response_news_article.keys())


['sophoraId',
 'externalId',
 'title',
 'teaserImage',
 'content',
 'date',
 'tracking',
 'tags',
 'updateCheckUrl',
 'regionId',
 'regionIds',
 'images',
 'details',
 'detailsweb',
 'shareURL',
 'topline',
 'firstSentence',
 'geotags',
 'brandingImage',
 'type',
 'breakingNews']

In [14]:
parsed_response_news_article["title"]


'Medien: Disney zahlt Schnäppchenpreis für "Global Dream"'

In [15]:
print("\n\n".join([i["value"] for i in parsed_response_news_article["content"] if i["type"] in whitelist_type]))


<strong>Der Disney-Konzern hat das auf den insolventen MV-Werften gebaute Kreuzfahrtschiff "Global Dream" offenbar für den Schnäppchenpreis von 40 Millionen Euro bekommen. Das berichten die Magazine "Capital" und "stern". Das Land Mecklenburg-Vorpommern könnte auf einem dreistelligen Millionenschaden sitzen bleiben.</strong>

Ursprünglich war das mehr als 340 Meter lange Kreuzfahrtschiff laut dem Bericht von "Capital" und "stern" auf einen Preis von rund 1,8 Milliarden Euro taxiert worden - wenn es fertiggestellt wäre. Nach jüngsten Angaben des MV-Werften-Insolvenzverwalters Christoph Morgen ist die in der Werfthalle in Wismar liegende "Global Dream" zu rund 60 Prozent fertiggestellt. Den Angaben zufolge werde Disney, das eine eigene Kreuzfahrtsparte im Portfolio hat, das Schiff ohne Gewährleistungsansprüche übernehmen und es auf eigenes Risiko und auf eigene Kosten fertig- und umbauen. Disney hatte das Schiff Mitte November gekauft, über den Kaufpreis war Stillschweigen vereinbart wor

---
# Application


In [189]:
from datetime import datetime
import logging
from typing import Literal, Generator, Union, Iterable

import requests
from pydantic import BaseModel, Field


In [20]:
class DynamicConfig(BaseModel):
    keywords: list

    def __init__(self, **data):
        data["keywords"] = [keyword.lower() for keyword in data["keywords"]]
        super().__init__(**data)


class News(BaseModel):
    title: str
    sub_title: Union[str, None] = None
    tags: Union[list[str], None] = None
    text: Union[str, None] = None
    url: str = Field(description="Web url to html page")
    timestamp: datetime = datetime.now()


# Tagesschau specific
class ArticleInfo(BaseModel):
    id: str  # sophoraId
    title: str
    tags: list[str]
    link_json: str  # details
    link_web: str  # detailsweb
    timestamp: str  # date


"""
TODO:

CONSIDER TO HAVE A MORE GENERIC NEWS BASE CLASS

THIS CLASS COULD HAVE BASIC PROPERTIES LIKE (title, tags, text, link)
WHICH WOULD ENABLE TO PASS THEM GENERICALLY TO THE get_relevant_news() FUNCTION
    AND TO PASS FILTER LEVELS IN ADDITION,  i.e. filter_level({title: True, tags: True, text: False})
"""


'\nTODO:\n\nCONSIDER TO HAVE A MORE GENERIC NEWS BASE CLASS\n\nTHIS CLASS COULD HAVE BASIC PROPERTIES LIKE (title, tags, text, link)\nWHICH WOULD ENABLE TO PASS THEM GENERICALLY TO THE get_relevant_news() FUNCTION\n    AND TO PASS FILTER LEVELS IN ADDITION,  i.e. filter_level({title: True, tags: True, text: False})\n'

In [25]:
logging.basicConfig(
    format="%(levelname)s - %(asctime)s -\t%(message)s",
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %I:%M:%S",
)


In [197]:
# Application config

# Endpoints
url = "https://www.tagesschau.de"

endpoint = {
    "homepage": "/api2/homepage",
    "news": "/api2/news",
    "newsfeed": "/api2/newsfeed-101~_date-{date}.json",  # yymmdd format
}


# "enums"
ressorts_and_topics = [
    "inland",
    "ausland",
    "wirtschaft",
    "sport",
    "video",
    "investigativ",
    "faktenfinder",
]

regions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

region_mapping = {
    1: "Baden-Württemberg",
    2: "Bayern",
    3: "Berlin",
    4: "Brandenburg",
    5: "Bremen",
    6: "Hamburg",
    7: "Hessen",
    8: "Mecklenburg-Vorpommern",
    9: "Niedersachsen",
    10: "Nordrhein-Westfalen",
    11: "Rheinland-Pfalz",
    12: "Saarland",
    13: "Sachsen",
    14: "Sachsen-Anhalt",
    15: "Schleswig-Holstein",
    16: "Thüringen",
}

types = ["story", "webview", "video"]

# Blacklists
blacklist_url = ["liveblog"]
blacklist_type = ["video", "webview"]


In [203]:
# API client
def create_session():
    s = requests.Session()
    s.headers.update({"Content-Type": "application/json"})

    return s


# Data Source Gateway
def get_news(url: str, endpoint: str, session: requests.Session) -> dict:
    logging.info("tagesschau API - GET request to news endpoint")
    response = session.get(url + endpoint)

    if response.status_code == 200:
        logging.info("Received status code 200")
        return response.json()
    else:
        raise Exception(
            f"""Error
        Expected status code:  200
        Got status code:  {response.status_code}
        Response text:
        {response.text}
        """
        )


# Service Level

## Tagesschau Service - Parse data
def parse_news(news_raw: dict) -> News:
    """Parse dict of individual news article."""

    def parse_tags(tags: list[dict]) -> list[Union[str, None]]:
        """Unpacks list of {"tag": "value"} pairs and returns list
            of unqiue tag values in lowercase.

        Example:
            Input
                [{'tag': 'Berlin'}, {'tag': 'Brandenburg'}]

            Returns
                ['Berlin', 'Brandenburg']
        """
        return list(set([i["tag"].lower() for i in tags]))

    try:
        news = News(
            title=news_raw["title"],
            sub_title=news_raw["firstSentence"],
            tags=parse_tags(news_raw["tags"]),
            url=news_raw["detailsweb"],
            timestamp=datetime.fromisoformat(news_raw["date"]),
        )
    # TODO:  Replace with proper exception and exception handling
    except KeyError as error:
        logging.info(f"{error}\tarticle.get('type'):  {news_raw.get('type')}")
        logging.info(f"\tarticle.get('shareURL'):  {news_raw.get('shareURL')}")
        news = News(
            title=str(error),
            url=news_raw["shareURL"],
        )

    return news


def parse_news_all(many_news_raw: dict) -> Iterable[News]:
    return map(parse_news, many_news_raw["news"])


In [193]:
## Generic Service - Filter
def is_keywords_in_news(
    news: News,
    keywords: list[str],
    filters: dict[Literal["title", "sub_title", "tags", "text"], bool] = {
        "title": True,
        "sub_title": True,
        "tags": True,
        "text": False,
    },
) -> bool:
    """Takes instance of News obje"""

    def unify_filter_object(
        filter_object: Union[list, str, None]
    ) -> Union[list[str], None]:
        """Transforms input into list of lowercase strings."""

        if type(filter_object) is None:
            unified_filter_object = None
        elif type(filter_object) == str:
            # TODO:  Implement as generator
            # https://stackoverflow.com/questions/3862010/is-there-a-generator-version-of-string-split-in-python
            unified_filter_object = filter_object.lower().split()
        elif type(filter_object) == list:
            # TODO:  Use generator instead
            # unified_filter_object = (i.lower() for i in filter_object)
            unified_filter_object = [i.lower() for i in filter_object]
        else:
            raise TypeError

        return unified_filter_object

    def is_keywords_in_filter_object(
        keywords: list[str],
        filter_object: Union[list[str]],
    ) -> bool:
        # Alternative version that returns keyword matches
        # [word for word in filter_object if any(keyword in word for keyword in keywords)] or False
        return True in (
            True
            for word in filter_object
            if any(keyword in word for keyword in keywords)
        )

    news_dict = news.dict()
    filter_keys = [k for k, v in filters.items() if v]

    filter_objects = (unify_filter_object(news_dict[key]) for key in filter_keys)

    # TODO:  Implement as map or as filter function
    return True in (
        is_keywords_in_filter_object(keywords, filter_object)
        for filter_object in filter_objects
    )


def get_relevant_news(
    many_news: Generator[News, None, None],
    keywords: list,
    filters: dict[Literal["title", "sub_title", "tags", "text"], bool] = {
        "title": True,
        "sub_title": True,
        "tags": True,
        "text": False,
    },
) -> Generator[News, None, None]:
    return (news for news in parsed_news if is_keywords_in_news(news, keywords, filters))


In [194]:
# Dynamic inputs

keywords = [
    "pass",
    "einbürgerung",
    "rheinmetall",
    "ukraine",
]


filters = {
    "title": True,
    "sub_title": True,
    "tags": True,
    "text": False,
}


In [195]:
session = create_session()


In [204]:
# Data source gateway and data source specific parsing
many_news_raw = get_news(
    url=url,
    endpoint=endpoint["news"],
    session=session
)


In [205]:
many_news = parse_news_all(
    many_news_raw
)


In [207]:
relevant_news = get_relevant_news(
    many_news,
    keywords,
    filters,
)


In [208]:
list(relevant_news)

[News(title='So sollen Ausländer künftig den deutschen Pass schneller erhalten ', sub_title='Einbürgerung schon nach fünf Jahren statt nach acht - so sieht es ein Gesetzentwurf des Bundesinnenministeriums vor.', tags=['brandenburg', 'berlin'], text=None, url='https://www.tagesschau.de/ardimport/regional/brandenburg/rbb-story-115031.html'),
 News(title='++ Russland meldet Eroberung von Soledar ++', sub_title='Alle aktuellen Entwicklungen zum Krieg gegen die Ukraine in unserem Liveblog.', tags=['liveblog', 'ukraine'], text=None, url='https://www.tagesschau.de/newsticker/liveblog-ukraine-mittwoch-205.html'),
 News(title='Hilfe für die Menschen in der Ukraine', sub_title='Wenn Sie für die Menschen in der Ukraine und Geflüchtete aus der Ukraine spenden wollen, finden Sie hier Hilfsorganisationen und Bankverbindungen.', tags=['solidarität', 'ukraine', 'hilfe'], text=None, url='https://www.tagesschau.de/spendenkonten/spendenkonten-133.html'),
 News(title='+++ Update: Baggerfahrer wohl schwer 

---
# ntfy


In [76]:
ntfy_url = "https://ntfy.sh"

topic = "news-ticker"

def post_news_alert(
    title: str,
    data: str,
    link_web: str,
    topic: str=topic,
    ntfy_url: str=ntfy_url,
) -> None:
    logging.info("Post news alert")
    requests.post(f"{ntfy_url}/{topic}",
        data=data,
        headers={
            "Title": title,
            "Click": link_web,
            # "Priority": "urgent",
            # "Tags": "warning,skull"
        })
        
    logging.info("News alert posted")


In [81]:
from time import sleep

for article in relevant_articles:
    post_news_alert(
        title="Source:  tagesschau.de",
        data=article.title,
        link_web=article.link_web,
    )


INFO - 2022-11-29 10:12:46:	Post news alert
INFO - 2022-11-29 10:12:46:	News alert posted
INFO - 2022-11-29 10:12:46:	Post news alert
INFO - 2022-11-29 10:12:47:	News alert posted
INFO - 2022-11-29 10:12:47:	Post news alert
INFO - 2022-11-29 10:12:47:	News alert posted
INFO - 2022-11-29 10:12:47:	Post news alert
INFO - 2022-11-29 10:12:48:	News alert posted
INFO - 2022-11-29 10:12:48:	Post news alert
INFO - 2022-11-29 10:12:48:	News alert posted
INFO - 2022-11-29 10:12:48:	Post news alert
INFO - 2022-11-29 10:12:49:	News alert posted
INFO - 2022-11-29 10:12:49:	Post news alert
INFO - 2022-11-29 10:12:49:	News alert posted
INFO - 2022-11-29 10:12:49:	Post news alert
INFO - 2022-11-29 10:12:50:	News alert posted
INFO - 2022-11-29 10:12:50:	Post news alert
INFO - 2022-11-29 10:12:50:	News alert posted
INFO - 2022-11-29 10:12:50:	Post news alert
INFO - 2022-11-29 10:12:50:	News alert posted
INFO - 2022-11-29 10:12:50:	Post news alert
INFO - 2022-11-29 10:12:51:	News alert posted
INFO - 202