In [127]:
import pandas as pd
import numpy as np
import os
import re
import tqdm 
import requests
from bs4 import BeautifulSoup
import time
import datetime as dt

## Clean link list further

In [128]:
# Load the five partial link files; each one becomes its own DataFrame.
link_files = (
    "dr_links_1_26.csv",
    "dr_links_26_50.csv",
    "dr_links_51_75.csv",
    "dr_links_76_100.csv",
    "dr_links_100_124.csv",
)
dr_01, dr_02, dr_03, dr_04, dr_05 = (pd.read_csv(path) for path in link_files)

In [129]:
# Stack the individual link tables on top of each other; ignore_index=True
# replaces the per-file row labels with one continuous 0..n-1 index.
link_frames = [dr_01, dr_02, dr_03, dr_04, dr_05]
dr_links = pd.concat(link_frames, ignore_index=True)

In [130]:
# Keep only the "links" column (the row index is retained); the result is
# still a DataFrame, not a Series, because the column is selected with a list.
dr_links = dr_links.loc[:, ["links"]]

In [131]:
# Drop false links: rows whose link contains the literal text "/search?".
# regex=False is required here: with the default regex=True the "?" is a regex
# quantifier that makes the "h" optional, so every link containing "/searc"
# was removed. na=True flags missing links too, so they are dropped exactly as
# the previous `== False` comparison dropped them.
is_search_link = dr_links["links"].str.contains("/search?", regex=False, na=True)
# Dropping rows that only contain a '#' - issue that later appeared
is_placeholder = dr_links["links"] == "#"
dr_links = dr_links[~(is_search_link | is_placeholder)]

In [132]:
# Independent (deep) copy of the filtered links, so later changes to
# link_list_clean do not touch dr_links.
link_list_clean = dr_links.copy(deep=True)

In [133]:
# Preview the first five cleaned links
link_list_clean.head(n=5)

Unnamed: 0,links
0,https://www.dr.dk/nyheder/penge/danske-sygeple...
1,https://www.dr.dk/nyheder/regionale/sjaelland/...
2,https://www.dr.dk/nyheder/indland/dom-i-drabss...
3,https://www.dr.dk/Nyheder/Temaer/2012/Dronning...
4,https://www.dr.dk/nyheder/regionale/nordjyllan...


## End with `link_list_clean`

In [143]:
# First two links, selected by position
link_list_clean["links"].iloc[:2]

0    https://www.dr.dk/nyheder/penge/danske-sygeple...
1    https://www.dr.dk/nyheder/regionale/sjaelland/...
Name: links, dtype: object

## Get DR articles from links

The following code scrapes the DR news articles whose links were collected with the Google search. Because scraping all 2065 articles takes very long, the code is run here only on a small subset (`link_list_clean["links"][0:5]`) to demonstrate how it works, and the result of this demonstration run is saved to a separate test file. The full link set was processed in the same way beforehand and saved as the full dataset; after the demonstration, we continue working with that full dataset.

In [None]:
# Scrape DR article pages and collect one value per feature and link.
# Every feature list below receives exactly one entry per processed link
# ("" when the page does not provide the feature or cannot be fetched), so the
# lists stay aligned by position with the links that were requested.

# Defining empty list for features
dr_titles_list = []
dr_h2_list = []
dr_date = []
dr_content_list = []
dr_author_list = []
dr_subhead_list = []
dr_tag_list = []


def _text_or_empty(tag):
    """Return the text of a BeautifulSoup tag, or "" when the tag is None (not found)."""
    return "" if tag is None else tag.text


def _extract_date(soup):
    """Return the publication date found in the parsed page, or "" if there is none.

    Standard article pages carry the date in the ``datetime`` attribute of the
    byline ``<time>`` tag; that attribute string is returned unchanged.
    Some pages are of another type (e.g.
    https://www.dr.dk/nyheder/da-lille-rikke-loeb-ind-i-margrethe) and show the
    date as label text instead; that text is parsed with ``pd.to_datetime``.
    """
    time_tag = soup.find("time", class_ = "dre-byline__date")
    if time_tag is not None and time_tag.has_attr("datetime"):
        return time_tag["datetime"]
    # Different website type: date of publication located elsewhere on the page
    top = soup.find("div", class_ = "hydra-latest-news-page-short-news__top")
    label = None if top is None else top.find("span", class_ = "dre-label-text__text")
    if label is None:
        return ""
    try:
        return pd.to_datetime(label.text)
    except (ValueError, OverflowError):
        # Label text that cannot be parsed as a date is treated as "no date"
        return ""


# Looping over links to retrieve HTMLs and extract relevant information from webpages
for u in tqdm.tqdm(link_list_clean["links"][0:5]): # [0:5] has been added to limit the run time for this code. We run it on all links.
    try:
        # Named `response`, not `re`, so the imported `re` module is not shadowed.
        # The timeout keeps one unresponsive page from blocking the whole run.
        response = requests.get(
            u,
            headers = {"Name" : "Simon Ullrich - summer course project" , "email": "simon.ullrich@sodas.ku.dk"},
            timeout = 30,
        )
    except requests.RequestException as err:
        # Record an empty row for this link instead of aborting the whole scrape
        tqdm.tqdm.write(f"Request failed for {u}: {err}")
        for feature_list in (dr_titles_list, dr_h2_list, dr_date, dr_content_list,
                             dr_author_list, dr_subhead_list, dr_tag_list):
            feature_list.append("")
        continue
    soup = BeautifulSoup(response.content, "lxml")

    # Extracting article titles
    dr_titles_list.append(_text_or_empty(soup.find("div", class_ = "dre-speech")))

    # Extracting date of publication
    dr_date.append(_extract_date(soup))

    # Extracting author of articles
    dr_author_list.append(_text_or_empty(soup.find("div", class_ = "dre-byline__contribution-details")))

    # Extracting subheaders
    dr_subhead_list.append(_text_or_empty(soup.find("p", class_ = "dre-article-title__summary")))

    # Extracting article bodies: all body paragraphs joined into one string.
    # find_all returns an empty result when nothing matches, which joins to "".
    # Only this standard paragraph class is read; the body of the other page
    # type ("hydra-latest-news-page-short-news__body") is not extracted.
    dr_content = soup.find_all("p", class_ = "dre-article-body-paragraph dre-variables")
    dr_content_list.append(" ".join(paragraph.text for paragraph in dr_content))

    # Extracting second headlines: all sub-headings joined into one string
    dr_h2 = soup.find_all("h2", class_ = "dre-article-body-sub-heading dre-variables")
    dr_h2_list.append(" ".join(heading.get_text() for heading in dr_h2))

    # Extracting article tags
    dr_tag = soup.find("a", class_ = "dre-article-title-section-label__title dre-article-title-section-label__title--link")
    dr_tag_list.append(_text_or_empty(dr_tag))

In [None]:
### Combine lists to DataFrame
# The feature lists hold one entry per scraped page, in link order. Only the
# links that were actually scraped are paired with them (by position); passing
# the full link column would pad the frame with feature-less rows for every
# link that was not part of this run.
n_scraped = len(dr_titles_list)
scraped_links = link_list_clean["links"].iloc[:n_scraped].tolist()

dr_articles_test = pd.DataFrame({
    "titles": dr_titles_list,
    "sub_header": dr_subhead_list,
    "h2": dr_h2_list,
    "content": dr_content_list,
    "author": dr_author_list,
    "tag": dr_tag_list,
    "date": dr_date,
    "link": scraped_links,
})
# Constant column naming the news outlet
dr_articles_test["source"] = "DR"

display(dr_articles_test)

In [None]:
### Save dataset
# Write the demo frame to CSV without the row index as an extra column
dr_articles_test.to_csv(path_or_buf="dr_articles_test.csv", index=False)

### Reading in the full dataset

In [149]:
# Load the previously saved full article dataset from disk
dr_articles = pd.read_csv(filepath_or_buffer="dr_articles.csv")

In [151]:
# Show the first five rows of the full dataset
full_preview = dr_articles.head(n=5)
display(full_preview)

Unnamed: 0,titles,sub_header,h2,content,author,tag,date,link,source
0,Danske sygeplejersker får job i Norge,"Krise, fyringer og ansættelsesstop får sygeple...",Markant flere,"Antallet af danske sygeplejersker, der har fåe...",Ritzau/,Penge,2012-01-19T13:27:00+00:00,https://www.dr.dk/nyheder/penge/danske-sygeple...,DR
1,Næsten ingen ledige sygeplejersker,Trods fyringsrunder i både 2010 og 2011 er arb...,,Trods fyringsrunder på sygehusene i både 2010 ...,Henny Mortensen,Sjælland,2012-01-28T07:42:00+00:00,https://www.dr.dk/nyheder/regionale/sjaelland/...,DR
2,Dom i drabssag fra Skodborg i dag,I dag afgøres det om en 40-årig kvinde er skyl...,,I dag afsiges der dom i sagen om den 42-årige ...,Søren Andersen,Indland,2012-01-27T05:24:00+00:00,https://www.dr.dk/nyheder/indland/dom-i-drabss...,DR
3,BILLEDSERIE: Den nye dronning,,,,Morten Top,,2012-01-08 00:00:00,https://www.dr.dk/Nyheder/Temaer/2012/Dronning...,DR
4,Socialminister: Vi kommer til at lære af Rebil...,Socialminister Karen Hækkerup er overbevist om...,,Socialminister Karen Hækkerup er overbevist om...,Søren K Nielsen,Nordjylland,2012-01-17T21:42:00+00:00,https://www.dr.dk/nyheder/regionale/nordjyllan...,DR


## Next tasks
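* Clean dates: convert every value in the `date` column to a datetime object. The column currently mixes ISO 8601 strings with a UTC offset (e.g. `2012-01-19T13:27:00+00:00`) and timestamps without one (e.g. `2012-01-08 00:00:00`).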