# Analysis of the usage of the term "sygeplejersker" in DR, TV2 and the Danish Parliament

## Imports

In [5]:
import pandas as pd
import numpy as np
import csv
import re
import time
import tqdm

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Scraping
import requests
from bs4 import BeautifulSoup

# Selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

## Data gathering

### DR

> NOTE: The following code must be run on a computer with Danish as its language setting, because Google behaves differently depending on the computer's current language settings.

#### 1:  Generate search terms

Search terms have the following format: we search for the month and year, written in the format DR uses in its article timestamps (e.g. "jan. 2012"), combined with the keyword "sygeplejersker*". Manual Google searches proved to return relevant results that are mostly limited to the given month. In this way we create a list of links to DR articles. All articles are located on the site https://www.dr.dk/nyheder or a subsite, so the Google search can be restricted to that site. An example of a search is: "jan. 2012" AND "sygeplejersker*" site:https://www.dr.dk/nyheder.

In [2]:
# Danish month abbreviations in calendar order
months = ['jan.', 'feb.', 'mar.', 'apr.', 'maj', 'jun.', 'jul.', 'aug.', 'sep.', 'okt.', 'nov.', 'dec.']
# One Google query per year-month combination from 2012 through 2022;
# the trailing space and newline are part of every query string
all_terms = [
    f'"{month} {year}" AND "sygeplejersker*" site:https://www.dr.dk/nyheder/ \n'
    for year in range(2012, 2023)
    for month in months
]
# Final list of search terms: the last four (future) month-year combinations are dropped
search_terms = all_terms[:len(all_terms) - 4]


We create a total of 128 search terms, one for each month-year combination between January 2012 and August 2022.

#### 2a: Scraping Google search to retrieve list of links to DR articles
With the following code we scrape Google searches to retrieve a list of links to DR articles, given that the DR website does not provide a useful search function. We use Selenium to execute a Google search and retrieve DR article links. We execute the search, save the HTML of the first results page and then move on to further pages of the search results to retrieve more search results. Google intervenes when search results are scraped too fast. We therefore pause when moving between pages: the pause takes a random value between 10 and 15 seconds between result pages, and between 5 and 10 seconds between searches.

Scraping the first three search pages:

In [11]:
# Raised by find_element when the requested element is not on the page,
# e.g. when there is no "next page" link on the last Google result page
from selenium.common.exceptions import NoSuchElementException

# Number of result pages followed after the first one. Every search stores
# 1 + extra_pages pages in html_list (set to 2 to store three pages per search).
extra_pages = 1

# Resolve the chromedriver binary once instead of once per search
driver_path = ChromeDriverManager().install()
html_list = []
finished_searches = 0

for term in tqdm.tqdm(search_terms[0:1]):
    # Start a fresh browser for every search term: a driver cannot be reused
    # after quit(), and the cookie banner handled below is looked up on every search
    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        # Go to google
        driver.get('https://google.com')
        # Discard cookie message, reject cookies
        driver.find_element(By.ID, "W0wltc").click()
        # Search for DR news articles. The search box is located by its Danish
        # title; the search terms end in a newline, which presumably submits the query.
        driver.find_element(By.CSS_SELECTOR, "input[title='Søg']").send_keys(term)
        # Get HTML for first search result page
        html_list.append(driver.page_source)
        try:
            # Go to next result page (on the first page this selector matches
            # only the "next" link, there is no "previous" link yet)
            driver.find_element(By.CSS_SELECTOR, ".NVbCr+ span").click()
            for _ in range(extra_pages):
                html_list.append(driver.page_source)
                # Google detects suspicious behavior and asks to solve a puzzle
                # after several fast requests, so wait a random time between pages
                time.sleep(np.random.uniform(10, 15))
                # Go to next result page
                driver.find_element(By.CSS_SELECTOR, "#pnnext .NVbCr+ span").click()
        except NoSuchElementException:
            # No further result page on Google: stop paging for this search term
            pass
        finished_searches += 1
        time.sleep(np.random.uniform(5, 10))
    finally:
        # Always close the browser, also when a lookup above fails
        driver.quit()

  driver = webdriver.Chrome(ChromeDriverManager().install())
100%|██████████| 1/1 [00:20<00:00, 20.56s/it]


#### 2b: Scraping number of google results for search terms

In [50]:
# Text of Google's result counter, one entry per search term
num_search_results = []

# Resolve the chromedriver binary once; webdriver.Chrome needs the installed
# driver path (wrapped in a Service), not the ChromeDriverManager object itself
driver_path = ChromeDriverManager().install()

for search_term in tqdm.tqdm(search_terms[124:129]):
    driver = webdriver.Chrome(service=Service(driver_path))
    try:
        # Go to google
        driver.get('https://google.com')
        # Discard cookie message, reject cookies
        driver.find_element(By.ID, "W0wltc").click()
        # Search for DR news articles (search box located by its Danish title)
        driver.find_element(By.CSS_SELECTOR, "input[title='Søg']").send_keys(search_term)
        # Read the text of the "result-stats" element on the result page
        num_results = driver.find_element(By.ID, "result-stats")
        num_search_results.append(num_results.text)
        time.sleep(np.random.uniform(0.25, 1))
    finally:
        # Always close the browser, also when an element lookup fails
        driver.quit()

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
[WDM] - Downloading:   0%|                                                                 | 0.00/6.21M [00:00<?, ?B/s][A
[WDM] - Downloading:   7%|███▋                                                     | 416k/6.21M [00:00<00:01, 4.12MB/s][A
[WDM] - Downloading:  28%|███████████████▋                                        | 1.74M/6.21M [00:00<00:00, 9.75MB/s][A
[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.21M/6.21M [00:00<00:00, 17.1MB/s][A
  driver = webdriver.Chrome(ChromeDriverManager().install())
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:47<00:00, 11.86s/it]


#### 3: Create link list

In [12]:
# All hrefs found in the Google result containers, in page order
link_list = []
# Iterating over the results from scraping
for page_html in html_list:
    soup = BeautifulSoup(page_html, 'lxml')
    results = soup.find('div', class_='v7W49e')
    if results is None:
        # Page without a result container: contributes no links
        continue
    # Collect the links of this page only, so no page is counted twice
    link_list.extend(anchor['href'] for anchor in results.find_all('a', href=True))

# Getting rid of noise, links not pointing to DR but Google infrastructure.
# The filter runs once over the complete list; running it inside the page loop
# would re-add every earlier link on each new page.
link_list_clean = [link for link in link_list if "webcache.googleusercontent" not in link]

#### 4: From list to DataFrame

In [31]:
# Turn link_list_clean into a one-column DataFrame and keep a single row per
# distinct link, because the list contains repeated entries
dr_links = pd.DataFrame({'links': link_list_clean}).drop_duplicates(subset='links')

#### 5: Save dataset

In [19]:
# Write the DR link table to a CSV file in the working directory
dr_links.to_csv(path_or_buf="dr_links.csv")

NameError: name 'dr_links' is not defined

### TV2

#### 1:  Generate search terms

In [22]:
# TV2 search URLs: one per year (2012-2022), month (zero-padded 01-12) and
# result page (1-10), ordered by year, then month, then page
links = [
    f"https://search.tv2.dk/?query=sygeplejersker+{year}-{month:02d}&sort=relevance&page={page}"
    for year in range(2012, 2023)
    for month in range(1, 13)
    for page in range(1, 11)
]

In [None]:
# Store the generated TV2 search URLs as a single-column table on disk
tv2_article_list = pd.DataFrame(data=links)
tv2_article_list.to_csv(path_or_buf="tv2_links_to_articles.csv")

#### 2: Scraping TV2 searches for links to articles and create list

In [25]:
# hrefs of all anchors found in the first <section> of each TV2 search page
article_url = []
# Headers identifying the requester; e-mail written as in the article-scraping request
search_headers = {"Name": "Oliver Fredborg Smietana", "email": "kph383@alumni.ku.dk"}

for search_url in tqdm.tqdm(links[0:2]):
    try:
        resp_page = requests.get(search_url, headers=search_headers)
    except requests.RequestException:
        # Best effort: a search page that cannot be fetched contributes no links
        resp_page = None
    if resp_page is not None:
        soup = BeautifulSoup(resp_page.content, 'lxml')
        section = soup.find("section")
        # Pages without a <section> element are skipped
        if section is not None:
            # Read the href from each anchor found on the page (the loop item),
            # not from the list of search URLs
            article_url.extend(link['href'] for link in section.find_all('a', href=True))
    # Pause between requests
    time.sleep(0.25)

100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


23

#### 3: Scraping articles from links list

In [36]:
# One entry per scraped article in each list; "" marks a missing element
titles_list = []
h2_list = []
date_list = []
content_list = []
author_list = []
sub_head_list = []
tag_list = []

#  Define start and end of article loop
start = 0
end = 15

# Headers identifying the requester and the purpose of the scrape
article_headers = {"Name": "Oliver Fredborg Smietana", "email": "kph383@alumni.ku.dk", "Purpose": "exam project for Copenhagen uni Course"}


def _text_or_empty(element):
    """Return the stripped text of a BeautifulSoup element, or "" if it is None."""
    return element.text.strip() if element is not None else ""


for u in tqdm.tqdm(article_url[start:end]):
    # Named `response` so that the imported `re` module is not overwritten
    response = requests.get(u, headers=article_headers)
    soup = BeautifulSoup(response.content, "lxml")

    # Title: first <h1> carrying the heading classes
    titles_list.append(_text_or_empty(soup.find("h1", class_="tc_heading tc_heading--2")))

    # Date: value of the datetime attribute of the timestamp element
    timestamp = soup.find("time", class_="tc_timestamp")
    date_list.append(timestamp.get("datetime", "") if timestamp is not None else "")

    # Sub-headings: text of all matching <h2> elements joined by spaces
    headings = soup.find_all("h2", class_="tc_heading tc_heading--2")
    h2_list.append(" ".join(heading.get_text() for heading in headings))

    # Standfirst paragraph
    sub_head_list.append(_text_or_empty(soup.find("p", class_="tc_page__body__standfirst")))

    # Author byline
    author_list.append(_text_or_empty(soup.find("span", class_="tc_byline__author__text")))

    # Body text: all paragraphs inside the rich-content container joined by spaces
    body = soup.find("div", class_="tc_richcontent")
    if body is not None:
        content_list.append(" ".join(paragraph.text.strip() for paragraph in body.find_all("p")))
    else:
        content_list.append("")

    # Tag: first link carrying the red label classes
    tag_list.append(_text_or_empty(soup.find("a", class_="tc_label tc_label--color-base-red")))

    # Pause between requests
    time.sleep(0.25)



100%|██████████| 15/15 [00:19<00:00,  1.32s/it]


#### 4: Creating dataframe from scraped articles

In [37]:
# One row per scraped article. The last column holds the URLs the articles were
# scraped from (article_url[start:end]), not the TV2 search-page URLs in `links`.
tv2_articles_sample = pd.DataFrame([titles_list, sub_head_list, h2_list, content_list, author_list, tag_list, date_list, article_url[start:end]]).transpose()
tv2_articles_sample.columns = ["titles", "sub_header", "h2", "content", "author", "tag", "date", "links"]
# Mark the origin of the rows on the frame that was just built
tv2_articles_sample["source"] = "tv2"

In [53]:
# Save the sample frame (not the full tv2_articles data) under the sample file name, without the index column
tv2_articles_sample.to_csv("tv2_articles_sample.csv", index=False)

The complete data is loaded from the local folder:

In [None]:
# Read the complete set of scraped TV2 articles from the working directory
tv2_articles = pd.read_csv(filepath_or_buffer="tv2_articles.csv")