In [3]:
import os
import json
from bs4 import BeautifulSoup

# Root folder that holds one sub-folder per news outlet.
data_dir = "../data/"

# Every outlet keeps its saved search-result pages under "<outlet>/raw-search/".
faz_dir, sz_dir, zeit_dir, spiegel_dir = (
    data_dir + outlet + "/raw-search/"
    for outlet in ("faz", "sz", "zeit", "spiegel")
)



# FAZ
#### Extraktionsfunktion

In [4]:
def extract_faz_articles(filename, encoding="utf-8"):
    """Parse one saved FAZ search-result page into a list of article records.

    Args:
        filename: Name of an HTML file located in ``faz_dir``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            (assumed for the saved pages -- TODO confirm; pass another value
            if the pages were stored differently).

    Returns:
        A list with one dict per ``article-row clearfix`` element. Each dict
        has the keys ``title``, ``link``, ``date``, ``abstract``, ``author``,
        ``surtitle`` and ``source`` (always ``"FAZ"``).
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(faz_dir + filename, "r", encoding=encoding) as file:
        soup = BeautifulSoup(file, "html.parser")

    articles_list = []

    for article in soup.select("div[class*='article-row clearfix']"):
        title = article.find("h3").text.strip()
        # The third CSS class of the row is used as the document uid in the link.
        link = "https://www.faz-biblionet.de/faz-portal/document?uid=" + article["class"][2]
        date = article.find("li").text.replace("|", "")
        abstract = article.find("p", {"class": "abstract"}).text.strip()
        author = ""
        surtitle = ""

        # The last <h2> of a row is skipped; each remaining one is treated
        # either as a "Von <name>, ..." byline or as the surtitle.
        for h2 in article.find_all("h2")[:-1]:
            text = h2.text.strip()
            if text.startswith("Von "):
                # Remove only the leading "Von " so that a later "Von " inside
                # the name itself (e.g. "Anna Von Berg") is kept.
                author = text[4:].split(",")[0]
            else:
                surtitle = text

        articles_list.append({
            "title": title,
            "link": link,
            "date": date,
            "abstract": abstract,
            "author": author,
            "surtitle": surtitle,
            "source": "FAZ",
        })
    return articles_list


#### Ausführen der Extraktion (Listenerstellung und Abspeichern)

In [5]:
# os.listdir() returns names in arbitrary order; sorting makes the order of
# the resulting JSON reproducible. Sub-directories are skipped because the
# extraction function can only open regular files.
files = sorted(
    name for name in os.listdir(faz_dir)
    if os.path.isfile(os.path.join(faz_dir, name))
)
faz_articles = []

for filename in files:
    # extend() grows the list in place instead of rebuilding it on every pass.
    faz_articles.extend(extract_faz_articles(filename))

# json.dump() escapes non-ASCII characters by default, so the file content is
# the same for any ASCII-compatible encoding; UTF-8 is stated explicitly.
with open(data_dir + "faz/all-articles.json", "w", encoding="utf-8") as out_file:
    json.dump(faz_articles, out_file, indent=2)


# SZ
#### Extraktionsfunktion

In [6]:
def extract_sz_articles(filename, encoding="utf-8"):
    """Parse one saved SZ archive search-result page into article records.

    Args:
        filename: Name of an HTML file located in ``sz_dir``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            (assumed for the saved pages -- TODO confirm).

    Returns:
        A list with one dict per ``hitWrapper`` element. Each dict has the
        keys ``title``, ``link``, ``date``, ``abstract``, ``author`` (always
        empty), ``surtitle`` and ``source`` (always ``"SZ"``).
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(sz_dir + filename, "r", encoding=encoding) as file:
        soup = BeautifulSoup(file, "html.parser")

    articles_list = []

    for article in soup.find_all("div", {"class": "hitWrapper"}):
        # The first anchor of a hit supplies both the title and the link target.
        anchor = article.find("a")
        title = anchor.text.strip()
        link = "https://archiv.szarchiv.de" + anchor["href"]
        date = article.find("span").text.strip()
        abstract = article.find("div", {"class": "hitContentFOKUS"}).text.strip()
        author = ""

        # Prefer the dedicated surtitle element, fall back to the generic text
        # element, and use an empty string when neither exists. find() returns
        # None for a missing element, so no exception handling is needed.
        surtitle_tag = article.find("div", {"class": "hitContentDACHZEILE"})
        if surtitle_tag is None:
            surtitle_tag = article.find("div", {"class": "hitContentText"})
        surtitle = surtitle_tag.text.strip() if surtitle_tag is not None else ""

        articles_list.append({
            "title": title,
            "link": link,
            "date": date,
            "abstract": abstract,
            "author": author,
            "surtitle": surtitle,
            "source": "SZ",
        })
    return articles_list


#### Ausführen der Extraktion (Listenerstellung und Abspeichern)

In [7]:
# os.listdir() returns names in arbitrary order; sorting makes the order of
# the resulting JSON reproducible. Sub-directories are skipped because the
# extraction function can only open regular files.
files = sorted(
    name for name in os.listdir(sz_dir)
    if os.path.isfile(os.path.join(sz_dir, name))
)
sz_articles = []

for filename in files:
    # extend() grows the list in place instead of rebuilding it on every pass.
    sz_articles.extend(extract_sz_articles(filename))

# json.dump() escapes non-ASCII characters by default, so the file content is
# the same for any ASCII-compatible encoding; UTF-8 is stated explicitly.
with open(data_dir + "sz/all-articles.json", "w", encoding="utf-8") as out_file:
    json.dump(sz_articles, out_file, indent=2)


# Zeit
#### Wiso-Extraktionsfunktion

In [8]:
def extract_wiso_articles(filename, directory, source, include_surtitle=True, encoding="utf-8"):
    """Parse one saved wiso-net hit-list page into a list of article records.

    Args:
        filename: Name of the HTML file to read.
        directory: Path prefix that is concatenated directly in front of
            ``filename`` (callers in this notebook pass it with a trailing
            slash).
        source: Label stored under the ``source`` key of every record.
        include_surtitle: When true (default), the first ``boxAbstract`` span
            of a hit is stored as ``surtitle``; when false, ``surtitle`` is
            an empty string.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            (assumed for the saved pages -- TODO confirm).

    Returns:
        A list with one dict per ``hitlist_item`` table row. Each dict has the
        keys ``title``, ``link``, ``date``, ``abstract``, ``author`` (always
        empty), ``surtitle`` and ``source``.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(directory + filename, "r", encoding=encoding) as file:
        soup = BeautifulSoup(file, "html.parser")

    articles_list = []

    for article in soup.select("tr[class*='hitlist_item']"):
        title = article.select("span[class*='boxHeader']")[0].text.strip()
        link = "https://www.wiso-net.de/" + article.find("a", {"rel": "nofollow"})["href"]
        date = article.find("td", {"class": "boxCol3"}).text.strip()

        description = article.select("span[class*='boxDescription']")[0].text.strip()
        abstract_spans = article.select("span[class*='boxAbstract']")
        # The first boxAbstract span is reserved for the surtitle; the rest is
        # appended to the description. All non-empty parts are joined with a
        # single space so the description no longer runs into the next part.
        abstract_parts = [description] + [span.text.strip() for span in abstract_spans[1:]]
        abstract = " ".join(part for part in abstract_parts if part)

        author = ""
        # A hit without any boxAbstract span yields an empty surtitle instead
        # of raising IndexError.
        if include_surtitle and abstract_spans:
            surtitle = abstract_spans[0].text.strip()
        else:
            surtitle = ""

        articles_list.append({
            "title": title,
            "link": link,
            "date": date,
            "abstract": abstract,
            "author": author,
            "surtitle": surtitle,
            "source": source,
        })
    return articles_list


#### Ausführen der Extraktion

In [9]:
# os.listdir() returns names in arbitrary order; sorting makes the order of
# the resulting JSON reproducible. Sub-directories are skipped because the
# extraction function can only open regular files.
files = sorted(
    name for name in os.listdir(zeit_dir)
    if os.path.isfile(os.path.join(zeit_dir, name))
)
zeit_articles = []

for filename in files:
    # extend() grows the list in place instead of rebuilding it on every pass.
    zeit_articles.extend(extract_wiso_articles(filename, zeit_dir, "Zeit"))

# json.dump() escapes non-ASCII characters by default, so the file content is
# the same for any ASCII-compatible encoding; UTF-8 is stated explicitly.
with open(data_dir + "zeit/all-articles.json", "w", encoding="utf-8") as out_file:
    json.dump(zeit_articles, out_file, indent=2)

# Spiegel
#### Angepasste Wiso-Extraktionsfunktion ohne Surtitle (überschreibt die obige Definition)

In [1]:
def extract_wiso_articles(filename, directory, source, include_surtitle=False, encoding="utf-8"):
    """Parse one saved wiso-net hit-list page, leaving ``surtitle`` empty by default.

    This redefinition shadows the earlier function of the same name in this
    notebook; from this cell on, calls resolve to this variant.

    Args:
        filename: Name of the HTML file to read.
        directory: Path prefix that is concatenated directly in front of
            ``filename`` (callers in this notebook pass it with a trailing
            slash).
        source: Label stored under the ``source`` key of every record.
        include_surtitle: When false (default), ``surtitle`` is an empty
            string; when true, the first ``boxAbstract`` span of a hit is
            stored as ``surtitle``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            (assumed for the saved pages -- TODO confirm).

    Returns:
        A list with one dict per ``hitlist_item`` table row. Each dict has the
        keys ``title``, ``link``, ``date``, ``abstract``, ``author`` (always
        empty), ``surtitle`` and ``source``.
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(directory + filename, "r", encoding=encoding) as file:
        soup = BeautifulSoup(file, "html.parser")

    articles_list = []

    for article in soup.select("tr[class*='hitlist_item']"):
        title = article.select("span[class*='boxHeader']")[0].text.strip()
        link = "https://www.wiso-net.de/" + article.find("a", {"rel": "nofollow"})["href"]
        date = article.find("td", {"class": "boxCol3"}).text.strip()

        description = article.select("span[class*='boxDescription']")[0].text.strip()
        abstract_spans = article.select("span[class*='boxAbstract']")
        # The first boxAbstract span is left out of the abstract, exactly as
        # before. NOTE(review): with the surtitle disabled that span is dropped
        # entirely -- confirm this is intended for this source.
        # All non-empty parts are joined with a single space so the
        # description no longer runs into the next part.
        abstract_parts = [description] + [span.text.strip() for span in abstract_spans[1:]]
        abstract = " ".join(part for part in abstract_parts if part)

        author = ""
        if include_surtitle and abstract_spans:
            surtitle = abstract_spans[0].text.strip()
        else:
            surtitle = ""

        articles_list.append({
            "title": title,
            "link": link,
            "date": date,
            "abstract": abstract,
            "author": author,
            "surtitle": surtitle,
            "source": source,
        })
    return articles_list

#### Extraktion

In [4]:
# os.listdir() returns names in arbitrary order; sorting makes the order of
# the resulting JSON reproducible. Sub-directories are skipped because the
# extraction function can only open regular files.
files = sorted(
    name for name in os.listdir(spiegel_dir)
    if os.path.isfile(os.path.join(spiegel_dir, name))
)
spiegel_articles = []

for filename in files:
    # extend() grows the list in place instead of rebuilding it on every pass.
    spiegel_articles.extend(extract_wiso_articles(filename, spiegel_dir, "Spiegel"))

# json.dump() escapes non-ASCII characters by default, so the file content is
# the same for any ASCII-compatible encoding; UTF-8 is stated explicitly.
with open(data_dir + "spiegel/all-articles.json", "w", encoding="utf-8") as out_file:
    json.dump(spiegel_articles, out_file, indent=2)