In [49]:
import pandas as pd
import numpy as np
import datetime
import time
from babel.dates import format_date
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

<h1 style="background-color:gray;">Goal of this notebook</h1>

- To scrape DW news articles on 4 main categories ('Politics', 'Business', 'Sports', 'Arts').

(Note: The search results are requested sorted by date, so the scraped articles are in descending date order, i.e. newest first.)

In [50]:
# Search terms used to query the DW search page; each term is also stored as the
# `target_category` label of the articles found with it.
TARGET_CATEGORIES = (
    'Politics',
    'Business',
    'Sports',
    'Arts',
)

# Columns (and column order) of the main dataframe.
COLS = [
    'url',
    'category',
    'title',
    'text',
    'target_category',
]

#### Function for getting article url from anchor tag using xpath

In [51]:
def get_url(driver):
    """Return the ``href`` attribute of the first ``<a>`` found via ``.//a``.

    ``driver`` is any object exposing ``find_element`` (in this notebook it is
    called with a single search-result element). Lookup errors raised by
    ``find_element`` are not caught here.
    """
    anchor = driver.find_element(By.XPATH, './/a')
    href = anchor.get_attribute('href')
    return href

#### Function for getting article title from h2 header using xpath

In [52]:
def get_title(driver):
    """Return the text of the first ``<h2>`` found via ``.//h2``.

    ``driver`` is any object exposing ``find_element`` (in this notebook it is
    called with a single search-result element). Lookup errors raised by
    ``find_element`` are not caught here.
    """
    heading = driver.find_element(By.XPATH, './/h2')
    title_text = heading.text
    return title_text

#### Utility function for finding an element using xpath and returning its text

In [53]:
def get_data(driver, xpath):
    """Return the text of the first element matching ``xpath``, or ``np.nan``.

    Parameters
    ----------
    driver : object exposing ``find_element`` (a WebDriver or a WebElement)
    xpath : str
        XPath expression passed to ``find_element``.

    Returns
    -------
    str or float
        The element's ``.text``; ``np.nan`` (after printing a short message)
        when the lookup or the text access raises an ``Exception``.

    Notes
    -----
    The result is returned after the ``try`` block rather than from a
    ``finally`` clause: a ``return`` inside ``finally`` silently discards *any*
    in-flight exception, including ``KeyboardInterrupt`` and ``SystemExit``,
    which made a long scrape impossible to interrupt cleanly.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        # Best effort: a missing element is reported and mapped to NaN.
        print('element not found')
        return np.nan

#### Utility function for finding elements using xpath and returning list of their texts

In [54]:
def get_multi_data(driver, xpath):
    """Return the texts of all elements matching ``xpath`` as a list.

    Parameters
    ----------
    driver : object exposing ``find_elements`` (a WebDriver or a WebElement)
    xpath : str
        XPath expression passed to ``find_elements``.

    Returns
    -------
    list of str
        One entry per matched element, in match order. If an ``Exception`` is
        raised part-way through, a message is printed and the texts collected
        so far are returned (possibly an empty list).

    Notes
    -----
    The list is returned after the ``try`` block rather than from a ``finally``
    clause: a ``return`` inside ``finally`` silently discards *any* in-flight
    exception, including ``KeyboardInterrupt`` and ``SystemExit``.
    """
    data = []
    try:
        for elem in driver.find_elements(By.XPATH, xpath):
            data.append(elem.text)
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        print('elements not found')
    return data

#### Function for packing one article's data into a dictionary

In [55]:
def set_data_dict(url, category, title, text, target_category):
    """Pack one article's fields into a single-row, column-oriented dict.

    Every value is wrapped in a one-element list, so the result can be handed
    directly to ``pd.DataFrame`` to produce a one-row frame. Keys are inserted
    in the order url, category, title, text, target_category.
    """
    fields = (
        ('url', url),
        ('category', category),
        ('title', title),
        ('text', text),
        ('target_category', target_category),
    )
    return {key: [value] for key, value in fields}

### Function to scrape article URLs and titles from a search-results URL whose search item is one of the target categories

In [56]:
def scrape_articles_list(url, target_category):
    """Scrape (article_url, title, target_category) tuples from a search page.

    Opens ``url`` in a fresh Chrome session (Chrome must be installed), waits
    for the search results to be present, and reads the link and headline of
    every result.

    Parameters
    ----------
    url : str
        Search-results URL to load.
    target_category : object
        Label stored unchanged as the third item of every returned tuple.

    Returns
    -------
    list of tuple
        ``(article_url, title, target_category)`` per search result. Empty if
        the results never appear; partial if reading a result fails midway.

    Notes
    -----
    The browser is closed exactly once, in a ``finally`` clause. Previously a
    wait timeout quit the driver and then fell through to query the dead
    session, producing a second, misleading error and repeated ``quit()``
    calls; a failure while loading the page leaked the browser.
    """
    results_xpath = '//div[@id="searchResult"]//div[@class="searchResult"]'
    articles_all = []  # tuples collected so far

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(url)
        wait = WebDriverWait(driver, 30)  # wait *up to* 30 seconds

        try:
            # block until the search-result elements are present in the DOM
            wait.until(EC.presence_of_all_elements_located((By.XPATH, results_xpath)))
        except Exception as e:
            print(e)
            print('id, xpath or class name of the element is not found!')
            return articles_all  # nothing to read; `finally` still closes the browser

        try:
            for article in driver.find_elements(By.XPATH, results_xpath):
                article_url = get_url(article)
                title = get_title(article)
                articles_all.append((article_url, title, target_category))
        except Exception as e:
            # keep whatever was collected before the failure
            print(e)
            print('Search results error!')
    finally:
        driver.quit()

    return articles_all
    

### Function to scrape article's main text

In [57]:
def scrape_article_text(articles):
    """Visit every article URL and return a dataframe with its category and text.

    Parameters
    ----------
    articles : iterable of sequence
        Items whose first three entries are ``(url, title, target_category)``,
        as produced by the search-page scraping step.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns listed in ``COLS``. ``category``
        and ``text`` are ``np.nan`` when the corresponding XPath matches
        nothing on the page.

    Notes
    -----
    * Chrome must be installed; a single browser session is reused for all
      articles and is always closed in a ``finally`` clause, so an error on
      one page no longer leaks the browser. The error itself still propagates.
    * Rows are collected in a list and the frame is built once, instead of
      ``pd.concat``-ing onto a growing (initially empty) frame per article.
    * NOTE(review): the class names in the XPaths below look auto-generated
      and may stop matching when the site changes; confirm before re-running.
    """
    category_xpath = '//div[@class="sc-iJfdHH sc-iUeHef sc-lbNsEr dEMmVC imciYi imfbMx sc-fkekHa hFHcoo kicker sc-iuWDFx fWEOHE"]//span'
    text_xpath = '//div[@class="sc-iJfdHH sc-iUeHef sc-hjcAwj dEMmVC imciYi eGhiVL sc-kMTSIo GtYvC rich-text has-italic"]//p'

    records = []  # one dict per article, turned into a dataframe at the end
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        # Implicit wait (up to 20s per lookup); set once, it applies to every
        # later element lookup made through this driver.
        driver.implicitly_wait(20)

        # Loop through all articles and scrape the actual text by going through
        # the URL stored in the previous step.
        for item in articles:
            # url and title were already scraped from the search page
            url, title, target_category = item[0], item[1], item[2]

            driver.get(url)

            # category: text of the first matching element, NaN if none match
            elems = driver.find_elements(By.XPATH, category_xpath)
            category = elems[0].text if len(elems) > 0 else np.nan

            # body: all paragraph texts joined with newlines, NaN if none match
            texts = get_multi_data(driver, text_xpath)
            text = '\n'.join(texts) if len(texts) > 0 else np.nan

            records.append({
                'url': url,
                'category': category,
                'title': title,
                'text': text,
                'target_category': target_category,
            })
    finally:
        driver.quit()

    return pd.DataFrame(records, columns=COLS)

<h2 style="background-color:gray;">SCRAPE ARTICLE URLS AND TITLES BY SEARCHING FOR EACH TARGET CATEGORY STRING IN A LOOP</h2>

In [58]:
# Collect (url, title, target_category) tuples across all target categories.
# scrape_articles_list opens (and closes) its own Chrome session on every call.
articles_all = []
for target_category in TARGET_CATEGORIES:
    # The category name is used as the search term (`item=`); the query also sets
    # contentType=ARTICLE, sort=DATE and resultsCounter=200.
    search_url = f"https://www.dw.com/search/?languageCode=en&&contentType=ARTICLE&item={target_category}&searchNavigationId=9097-30688&sort=DATE&resultsCounter=200"
    articles_single_category = scrape_articles_list(search_url, target_category)
    articles_all.extend(articles_single_category)

print(f'No of articles scraped = {len(articles_all)}')

No of articles scraped = 800


<h2 style="background-color:gray;">SCRAPE THE TEXT OF THE ARTICLE BY GOING TO THE URL OF THE ARTICLE</h2>

In [59]:
# Visit every collected article URL and build the dataframe (one row per article).
df = scrape_article_text(articles_all)

In [60]:
# Preview the first rows of the scraped dataframe.
df.head()

Unnamed: 0,url,category,title,text,target_category
0,https://www.dw.com/en/germany-undeterred-by-gl...,POLITICS,Germany undeterred by global turmoil — Scholz ...,"In his New Year's message, German Chancellor O...",Politics
1,https://www.dw.com/en/taiwan-presidential-cand...,POLITICS,Taiwan: Presidential candidates debate in shad...,Taiwan's presidential candidates argued over w...,Politics
2,https://www.dw.com/en/emboldened-iran-silences...,HUMAN RIGHTS,Emboldened Iran silences critics as world look...,As least 690 prisoners were executed in Iran i...,Politics
3,https://www.dw.com/en/berlin-prepares-for-anot...,SOCIETY,Berlin prepares for another rowdy New Year's E...,"In Germany, Christmas is all about contemplati...",Politics
4,https://www.dw.com/en/albania-former-pm-put-un...,CRIME,Albania: Former PM put under house arrest in c...,Albania's right-wing opposition leader Sali Be...,Politics


In [61]:
# Show column dtypes and non-null counts; `category` and `text` are NaN for
# articles where the corresponding XPath matched nothing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   url              800 non-null    object
 1   category         794 non-null    object
 2   title            800 non-null    object
 3   text             794 non-null    object
 4   target_category  800 non-null    object
dtypes: object(5)
memory usage: 31.4+ KB


In [65]:
# Save the data as a CSV file so that the scrape (about 31 minutes, per the
# original run) does not have to be repeated.
# This CSV file is used as the dataset in the task2_modeling.ipynb notebook.
df.to_csv('dw_articles.csv', header=True, index=False)

# References
- https://www.dw.com/search/?languageCode=en