In [None]:
import json
import os
from time import sleep

import numpy as np
import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup

## get links

In [None]:
def get_article_links(link):
    """Collect the article links listed on one tagesschau archive page.

    Args:
        link (str): URL of a tagesschau archive page.

    Raises:
        ConnectionError: if the server answers with a status code other than 200.
        TypeError: if no article link is found on the page.

    Returns:
        list[str]: the ``href`` of the first anchor of every
            ``div.teaser-right.twelve`` element, unmodified and in page order.
    """
    # A timeout keeps one stalled connection from blocking a long crawl forever.
    response = requests.get(link, timeout=30)
    if response.status_code != 200:
        raise ConnectionError(
            f'statuscode of website is not 200 (got {response.status_code})'
        )
    soup = BeautifulSoup(response.text, 'html.parser')
    teasers = soup.find_all('div', class_='teaser-right twelve')

    article_links = []
    for teaser in teasers:
        anchor = teaser.find('a')
        # Skip teasers without an anchor/href instead of failing with
        # AttributeError/KeyError in the middle of the page.
        if anchor is None or not anchor.get('href'):
            continue
        article_links.append(anchor['href'])

    if not article_links:
        raise TypeError('there are no links at that day')
    return article_links
    

In [None]:
## generate the list of days to crawl, formatted as YYYY-MM-DD strings

# ISO literals are used so the range cannot be misread as day-first;
# the range itself (7 Nov 2023 .. 7 Nov 2025, both inclusive) is unchanged.
dates = pd.date_range(start='2023-11-07', end='2025-11-07')
dates = dates.strftime('%Y-%m-%d')


In [None]:
## Loops over all days in date range and gets the links to the articles written on that day
archive_base_link = 'https://www.tagesschau.de/archiv?datum='
records = []        # one {'link', 'datum'} dict per article link found
failed_dates = []   # days whose archive page could not be read or had no links
for date in tqdm.tqdm(dates):
    try:
        links = get_article_links(archive_base_link + date)
    except (ConnectionError, TypeError, requests.RequestException) as error:
        # get_article_links raises ConnectionError for a non-200 answer and
        # TypeError for a day without links; one bad day should not abort
        # the whole crawl, so it is logged and skipped.
        failed_dates.append(date)
        tqdm.tqdm.write(f'{date}: {error}')
        links = []
    records.extend({'link': link, 'datum': date} for link in links)
    sleep(3)  # pause between requests to the archive

# Build the frame once instead of growing it row by row inside the loop.
df = pd.DataFrame(records, columns=['link', 'datum'])
df.index.name = 'id'

100%|██████████| 732/732 [1:09:10<00:00,  5.67s/it]


In [None]:
## write the collected (link, datum) rows to disk; the index is not written
df.to_csv('tageschau_link.csv', index=False)

In [None]:
## keep one row per link, add an empty `saved` column, and store the result
unique_links = (
    df.groupby('link')
    .first()                 # one row per distinct link
    .assign(saved=np.nan)    # not downloaded yet
    .reset_index()           # turn `link` back into a regular column
)
unique_links.to_csv('tageschau_data.csv', index=False)

## API
Get the articles from the tagesschau API.

In [None]:
## read the de-duplicated link list from 'tageschau_data.csv'
## and display it (columns in the file: link, datum, saved)
df = pd.read_csv('tageschau_data.csv')
df

Unnamed: 0,link,datum,saved
0,https://www.tagesschau.de/multimedia/podcast/...,2025-04-30,
1,/11-km-stories-neuer-storytelling-podcast-100....,2025-04-07,
2,/analyse-afd-parteitag-100.html,2024-06-01,
3,/ausland/abschiebungen-mexiko-100.html,2025-01-24,
4,/ausland/aegypten-hurghada-uboot-untergang-tot...,2025-03-27,
...,...,...,...
15471,https://www1.wdr.de/nachrichten/wahlen/kommuna...,2025-09-28,
15472,https://www1.wdr.de/nachrichten/wahlen/kommuna...,2025-09-15,
15473,https://www1.wdr.de/nachrichten/westfalen-lipp...,2025-05-18,
15474,https://www1.wdr.de/nachrichten/westfalen-lipp...,2025-06-05,


In [None]:
def convert_link_to_path(link):
    """Derive the file name under which an article is stored.

    Every ``/`` in the link becomes ``-`` and every ``.html`` becomes
    ``.json``; nothing else is changed.

    Args:
        link (str): link to an article

    Returns:
        str: file name for the article (without a directory)
    """
    flattened = '-'.join(link.split('/'))
    return flattened.replace('.html', '.json')
    
def save_tagesschau_to_json(link):
    """Request an article through the ``api2u`` endpoint and save the JSON response.

    The request URL is the API base URL followed by ``link``. The response is
    written to ``data/<convert_link_to_path(link)>``.

    Args:
        link (str): site-relative link to an article (starting with ``/``)

    Returns:
        bool: True if the article was saved. False if ``link`` is not a
            site-relative string, the status code is not 200, or the body is
            not valid JSON or is empty.
    """
    api_base_link = 'https://www.tagesschau.de/api2u'
    # Absolute URLs (e.g. links to other broadcasters) cannot be appended to
    # the API base; skip them without sending a meaningless request.
    if not isinstance(link, str) or not link.startswith('/'):
        return False

    # A timeout keeps one stalled connection from blocking the whole crawl.
    response = requests.get(api_base_link + link, timeout=30)
    if response.status_code != 200:
        return False
    try:
        payload = response.json()  # parsed once, reused for the dump below
    except ValueError:
        # A 200 answer that is not JSON counts as "not saved".
        return False
    if not payload:
        return False

    os.makedirs('data', exist_ok=True)  # a fresh checkout has no data/ folder yet
    with open('data/' + convert_link_to_path(link), 'w', encoding='utf-8') as file:
        json.dump(payload, file, indent=4)
    return True


In [None]:
import os
def does_file_exists(link):
    """Tell whether the file for ``link`` is already present in ``data/``.

    Args:
        link (str): link to an article

    Returns:
        bool: True if ``data/<convert_link_to_path(link)>`` exists
    """
    file_name = convert_link_to_path(link)
    return os.path.exists(f'data/{file_name}')

In [None]:
## smoke test: download a single article; the cell displays the returned flag
## (True if the article was saved, False if not)
link = '/wirtschaft/weltwirtschaft/china-lieferketten-de-risking-investitionen-unternehmen-100.html'
save_tagesschau_to_json(link)

True

In [None]:
## loop over all links and save those that have not yet been saved.
api_base_link = 'https://www.tagesschau.de/api2u'  # not used by this loop; kept as a kernel variable

# `saved` is all-NaN (float) when freshly read from the CSV; cast it to object
# so that writing True/False below does not rely on silent dtype upcasting.
df['saved'] = df['saved'].astype(object)

# `total` gives tqdm a real progress bar and ETA; iterrows() has no length.
for row_id, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    link = row['link']
    if row['saved'] == 1:
        continue
    if does_file_exists(link):
        # Already on disk from an earlier run: flag it, no request needed.
        df.loc[row_id, 'saved'] = True
        continue
    try:
        saved = save_tagesschau_to_json(link)
    except requests.RequestException as error:
        # One failed request should not abort the whole loop.
        tqdm.tqdm.write(f'{link}: {error}')
        saved = False
    df.loc[row_id, 'saved'] = saved
    sleep(3)  # pause between API requests

15476it [1:01:45,  4.18it/s]


In [72]:
# number of rows flagged as saved, followed by the updated table
saved_count = df['saved'].eq(1).sum()
print(saved_count)
#df['saved'].value_counts()
df
# example link:
#/wirtschaft/finanzen/marktberichte/marktbericht-dow-dax-fed-ezb-zins-geldanlage-100.html
#
# uncomment to write the updated flags back to disk:
# df.to_csv('tageschau_data.csv',index= None)

14934


Unnamed: 0,link,datum,saved
0,https://www.tagesschau.de/multimedia/podcast/...,2025-04-30,False
1,/11-km-stories-neuer-storytelling-podcast-100....,2025-04-07,True
2,/analyse-afd-parteitag-100.html,2024-06-01,True
3,/ausland/abschiebungen-mexiko-100.html,2025-01-24,True
4,/ausland/aegypten-hurghada-uboot-untergang-tot...,2025-03-27,True
...,...,...,...
15471,https://www1.wdr.de/nachrichten/wahlen/kommuna...,2025-09-28,False
15472,https://www1.wdr.de/nachrichten/wahlen/kommuna...,2025-09-15,False
15473,https://www1.wdr.de/nachrichten/westfalen-lipp...,2025-05-18,False
15474,https://www1.wdr.de/nachrichten/westfalen-lipp...,2025-06-05,False
