# Scrape data from the Der Spiegel archive

In [1]:
import re
import datetime
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urljoin
import newspaper
from newspaper import Article

#### 1. Get the content of politics articles published since Feb 2015 (the refugee crisis)

In [2]:
# Anchor day of the crawl. The list walks backwards from the day before the
# anchor (09.07.2018) for 1244 days, so its oldest entry is 12.02.2015
# (11.02.2015 itself is not part of the list).
base = datetime.datetime.strptime('10.07.2018', "%d.%m.%Y")
date_list = [
    (base - datetime.timedelta(days_back)).strftime("%d.%m.%Y")
    for days_back in range(1, 1245)
]

In [3]:
def get_list_articles(website):
    """Collect the politics article links listed on one archive page.

    Downloads ``website``, looks for the ``<div class="column-wide">`` element
    and keeps every link inside it whose ``href`` contains ``"politik"``.
    Relative links are resolved against ``http://www.spiegel.de``; links that
    are already absolute are kept unchanged.

    Parameters
    ----------
    website : str
        URL of the archive page to scan.

    Returns
    -------
    pandas.Series
        The matching article URLs in page order. Empty when the page has no
        ``column-wide`` container or no matching link.

    Raises
    ------
    requests.exceptions.RequestException
        On a connection failure or when the server does not answer in time.
    """
    # Without a timeout a stalled connection would block the crawl forever.
    r = requests.get(website, timeout=30)
    soup = BeautifulSoup(r.content, "lxml")

    div = soup.find("div", {"class": "column-wide"})
    if div is None:
        # find() returns None when the container is missing; report
        # "no articles" instead of failing with an AttributeError.
        return pd.Series([], dtype=object)

    # A plain substring test is enough here; no regular expression is needed.
    news_collection = [
        urljoin("http://www.spiegel.de", a.get('href'))
        for a in div.find_all('a', href=True)
        if "politik" in a.get('href')
    ]
    # An explicit dtype keeps the result well-defined when nothing matched.
    return pd.Series(news_collection, dtype=object)

def scrap_text_for_each_article(url, date):
    """Download one article and return its text and meta description.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    date : str
        Archive date the article was listed under; stored unchanged in the
        result.

    Returns
    -------
    pandas.Series or None
        A Series with the entries ``text``, ``date`` and ``meta_description``
        (in that order), or ``None`` when the article could not be downloaded
        or parsed. In the failure case a message is printed.
    """
    try:
        a = Article(url, language='de')
        a.download()
        a.parse()
        return pd.Series({
            'text': a.text,
            'date': date,
            'meta_description': a.meta_description,
        })
    except Exception:
        # Best effort: report the broken article and carry on. Catching
        # Exception instead of using a bare except lets KeyboardInterrupt and
        # SystemExit still stop a long-running crawl.
        print("Article {} not available.".format(url))
        return None

In [4]:
# Crawl the archive one day at a time. For every day the politics links are
# collected, each article is scraped, and the successfully scraped articles of
# that day are stored as one DataFrame (one row per article) in `articles`.
articles = []
for date in date_list:
    website = 'http://www.spiegel.de/nachrichtenarchiv/artikel-{}.html'.format(date)
    news_collection = get_list_articles(website)
    # scrap_text_for_each_article yields None for articles it could not fetch.
    scraped = [scrap_text_for_each_article(x, date) for x in news_collection]
    scraped = [row for row in scraped if row is not None]
    if not scraped:
        # pd.concat raises ValueError for an empty or all-None list, which
        # would abort the whole crawl; a day without usable articles is skipped.
        continue
    news_complete = pd.concat(scraped, axis=1).T
    articles.append(news_complete)

You must `download()` an article first!
Article http://www.bento.de/politik/rassismus-so-gehen-junge-deutsche-damit-um-2044750/#refsponi not available.
You must `download()` an article first!
Article http://www.bento.de/politik/aegypten-auswaertiges-amt-warnt-vor-schwulenverfolgung-1791280/#refsponi not available.
You must `download()` an article first!
Article http://www.bento.de/politik/marokko-wie-marokkaner-die-proteste-in-rabat-tanger-casablanca-und-fes-erleben-1556388/#refsponi not available.
Article `download()` failed with 404 Client Error: Not Found for url: http://www.bento.de/politik/regensburg-kneipen-in-bayern-entscheiden-sich-fuer-ein-verbot-von-junggesellenabschieden-1391251/#refsponi on URL http://www.bento.de/politik/regensburg-kneipen-in-bayern-entscheiden-sich-fuer-ein-verbot-von-junggesellenabschieden-1391251/#refsponi
Article http://www.bento.de/politik/regensburg-kneipen-in-bayern-entscheiden-sich-fuer-ein-verbot-von-junggesellenabschieden-1391251/#refsponi not av

In [5]:
# Stack the per-day frames and write them to a single UTF-8 CSV file; the row
# index is not written.
pd.concat(articles).to_csv(
    "../data/articles_politic_all.csv",
    encoding="utf-8",
    index=False,
)

#### 2. Build complete dataset

* Expecting 800000 rows for 3 years of politics articles -> 717452 rows in total in the CSVs
* 32028 articles in total

In [122]:
# The twelve numbered article CSV files that are merged into one frame.
# NOTE(review): files 7 and 8 are read from "../../../data/" while all other
# files are read from "../../data/" - confirm this difference is intentional.
list_source = [
    "../../data/articles_politic_1.csv",
    "../../data/articles_politic_2.csv",
    "../../data/articles_politic_3.csv",
    "../../data/articles_politic_4.csv",
    "../../data/articles_politic_5.csv",
    "../../data/articles_politic_6.csv",
    "../../../data/articles_politic_7.csv",
    "../../../data/articles_politic_8.csv",
    "../../data/articles_politic_9.csv",
    "../../data/articles_politic_10.csv",
    "../../data/articles_politic_11.csv",
    "../../data/articles_politic_12.csv",
]
# Read every file, then stack the frames; each file's own row labels are kept.
dfs = [pd.read_csv(file) for file in list_source]

all_articles = pd.concat(dfs)

In [127]:
# Persist only the article text and its meta description as a UTF-8 CSV file;
# the row index is not written.
all_articles[["text", "meta_description"]].to_csv(
    "../../data/articles_politic_complete_02_15.csv",
    encoding="utf-8",
    index=False,
)

In [128]:
# Preview the first rows of the two exported columns.
all_articles[["text", "meta_description"]].head()

Unnamed: 0,text,meta_description
0,Die britische Premierministerin Theresa May ha...,Theresa May steht unter Druck: Die Minister Bo...
1,hier finden Sie die wichtigsten Nachrichten de...,Deutschlands führende Nachrichtenseite. Alles ...
2,An einem Tag hat die britische Premierminister...,An einem Tag hat die britische Premierminister...
3,Auf diesen Moment hat Recep Tayyip Erdogan jah...,Zwei Wochen nach den Wahlen in der Türkei hat ...
4,"Erst Brexit-Minister David Davis, nun auch der...",Der britische Außenminister und strikte Brexit...
