# Webpage data extraction using Beautiful Soup 4

For a list of references see:

https://blog.hartleybrody.com/web-scraping-cheat-sheet/#using-beautifulsoup


In [1]:
# conda install beautifulsoup4
# pip install requests
import requests
from bs4 import BeautifulSoup

In [2]:
# Request the webpage.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of
# silently parsing an error page in the cells below.
url = "https://www.spiegel.de/international"
req = requests.get(url, timeout=30)
req.raise_for_status()

In [3]:
# Inspect the structure of the page (compare it with Chrome / DevTools).
# Only a preview is displayed so the whole document is not dumped into the
# notebook output; the complete HTML is still available as req.text.
req.text[:2000]

'<!doctype html><html lang="de" class="brand-spon">\n<head>\n<title>International - DER SPIEGEL</title>\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width,initial-scale=1,user-scalable=no">\n<meta name="MSSmartTagsPreventParsing" content="true">\n<meta http-equiv="imagetoolbar" content="no">\n<meta http-equiv="x-ua-compatible" content="IE=Edge">\n<meta name="apple-itunes-app" content="app-id=424881832">\n<link rel="manifest" href="https://www.spiegel.de/public/spon/json/manifest.json">\n<meta name="theme-color" content="#E64415">\n<meta name="google-site-verification" content="lwpSnwoyvMkHS9nHmLBZuYOashAHfooOHSHeA_KS7ek">\n<meta name="robots" content="index, follow, noarchive, noodp">\n<meta name="copyright" content="DER SPIEGEL, Hamburg, Germany">\n<meta name="email" content="spiegel_online@spiegel.de">\n<meta name="author" content="DER SPIEGEL, Hamburg, Germany">\n<meta name="description" content="Deutschlands führende Nachrichtenseite. Alles Wichtige aus Poli

In [None]:
# Save the website
# The decoded page text is re-encoded with str.encode()'s default codec and
# written to disk in binary mode.
websiteFileName="website.htm"
with open(websiteFileName, "wb") as html_file:
    html_bytes = req.text.encode()
    html_file.write(html_bytes)

In [None]:
# Display the raw HTML of the response again (same text that was written to website.htm)
req.text

In [4]:
# Create the BS4 object.
# The parser is named explicitly: the generic 'html' feature lets Beautiful
# Soup pick whichever HTML parser happens to be installed, so the resulting
# tree could differ between environments. 'html.parser' ships with Python.
soup = BeautifulSoup(req.text, 'html.parser')

In [5]:
# Use HTML selector: collect every <article> element of the page.
# find_all is the current method name; findAll is a deprecated legacy alias.
events = soup.find_all('article')
events

[<article aria-label="The West's Greed Could Come Back To Haunt It" class="lg:p-24 md:py-24 sm:py-16" data-sara-article-id="dc4b8653-d935-4938-928a-746b031bf0c9">
 <header class="lg:flex lg:justify-between md:flex md:justify-between md:mx-24 sm:mx-16">
 <h2 class="lg:flex-grow md:flex-grow">
 <a class="text-black block" href="https://www.spiegel.de/international/world/a-stalled-global-vaccine-drive-the-west-s-greed-could-come-back-to-haunt-it-a-dc4b8653-d935-4938-928a-746b031bf0c9" title="The West's Greed Could Come Back To Haunt It">
 <span class="block text-primary-base hover:text-primary-dark focus:text-primary-darker font-brandUI font-extrabold lg:text-xl md:text-xl sm:text-l leading-tight mb-8">
 The Stalled Global Vaccine Drive
 </span>
 <span class="block lg:mb-24 md:mb-16 sm:mb-16">
 <span class="font-brandUI font-extrabold lg:text-5xl md:text-5xl sm:text-3xl leading-tight"><span class="align-middle hover:opacity-moderate focus:opacity-moderate">The West's Greed Could Come Back

In [6]:
# Count the <article> elements found on the page
len(events)

32

In [7]:
# View the second article (index 1; index 0 would be the first one)
event=events[1]
event

<article aria-label="European Unity and Determination Offer a Path Forward" class="lg:py-24 md:py-24 sm:py-16" data-sara-article-id="640685a3-c137-4a00-9816-996b94d255b2">
<header class="lg:flex lg:justify-between md:flex md:justify-between lg:mx-24 md:mx-24 sm:mx-16">
<h2 class="w-full">
<a class="text-black block" href="https://www.spiegel.de/international/germany/the-eu-relationship-with-russia-european-unity-and-determination-offer-a-path-forward-a-640685a3-c137-4a00-9816-996b94d255b2" target="_self" title="European Unity and Determination Offer a Path Forward">
<span class="flex items-center sm:justify-between">
<span class="max-w-full overflow-hidden pr-px">
<span class="flex flex-col w-full mb-16">
<span class="focus:text-primary-darker font-brandUI font-extrabold hover:text-primary-dark leading-tight lg:text-xl mb-8 sm:pb-2 md:text-xl sm:text-l text-primary-base">
The EU Relationship with Russia
</span>
<span class="font-serifdisplayUI font-bold lg:text-5xl md:text-5xl sm:text-

In [8]:
# Get the headline: the first <h2> element inside the selected article
x=event.find('h2')
x

<h2 class="w-full">
<a class="text-black block" href="https://www.spiegel.de/international/germany/the-eu-relationship-with-russia-european-unity-and-determination-offer-a-path-forward-a-640685a3-c137-4a00-9816-996b94d255b2" target="_self" title="European Unity and Determination Offer a Path Forward">
<span class="flex items-center sm:justify-between">
<span class="max-w-full overflow-hidden pr-px">
<span class="flex flex-col w-full mb-16">
<span class="focus:text-primary-darker font-brandUI font-extrabold hover:text-primary-dark leading-tight lg:text-xl mb-8 sm:pb-2 md:text-xl sm:text-l text-primary-base">
The EU Relationship with Russia
</span>
<span class="font-serifdisplayUI font-bold lg:text-5xl md:text-5xl sm:text-2xl mb-2 leading-none"><span class="italic leading-normal align-middle hover:opacity-moderate focus:opacity-moderate pr-px">European Unity and Determination Offer a Path Forward</span>
</span>
<span class="block font-sansUI text-s text-shade-dark font-normal">A Guest Ed

In [10]:
# Get the title from the Anchor tag within the headline
# (x must still be the <h2> element assigned in the headline cell)
title=x.find("a")["title"]
title

'European Unity and Determination Offer a Path Forward'

In [None]:
# Get the article link from the Anchor tag within the headline
# (x must still be the <h2> element assigned in the headline cell)
href=x.find("a")["href"]
href

In [None]:
# Get the figure HTML element
# NOTE: this rebinds x from the <h2> headline to the <figure> element, so the
# title/link cells must not be re-run after this one unless the headline cell
# is executed again first.
x=event.find('figure')

In [None]:
# Get the image source (src attribute) using HTML and CSS selection:
# inside the <figure> element held in x, pick the <img> whose data-image-el
# attribute equals "img" and read its src.
image= x.find("img",{'data-image-el':'img'})["src"]
image

In [None]:
# Switch to the first article (index 0) and look up the <span> selected by the
# class value 'font-sansUI font-normal text-s text-shade-dark'. The extraction
# loop further down stores the text of this span as the article's author.
event=events[0]
x=event.find('span',{'class':'font-sansUI font-normal text-s text-shade-dark'})

In [None]:
# Print the span text only when the lookup found a matching element.
# Identity comparison is the idiomatic (and operator-overload-proof) None test.
if x is not None:
    print(x.text)

In [None]:
# Collect title, link, image URL and author for every article on the page.
# Every field is optional: an entry only receives the keys that were actually
# found, so a teaser without headline link, image or author no longer aborts
# the whole loop with a TypeError/KeyError.
my_events = []
for event in events:
    event_details = dict()

    # Title and link are both attributes of the anchor inside the <h2> headline
    headline = event.find('h2')
    anchor = headline.find("a") if headline is not None else None
    if anchor is not None:
        for key in ("title", "href"):
            value = anchor.get(key)
            if value is not None:
                event_details[key] = value

    # Image URL: src of the <img data-image-el="img"> inside the <figure>
    figure = event.find('figure')
    img = figure.find("img", {'data-image-el': 'img'}) if figure is not None else None
    if img is not None and img.get("src") is not None:
        event_details['image'] = img["src"]

    # Author: text of the span carrying the author styling classes
    author = event.find('span', {'class': 'font-sansUI font-normal text-s text-shade-dark'})
    if author is not None:
        event_details['author'] = author.text

    my_events.append(event_details)


In [None]:
# Show the collected article details (a list with one dict per article)
print(my_events)

In [None]:
# Get one link: the href of the third collected article (index 2)
details_url=my_events[2]["href"]
details_url

Check the article page manually in Chrome DevTools using the following XPath expression:

//div[contains(@class,"RichText")]/p/text()

In [None]:
# Request the article page.
# Time out on a stalled connection and stop on HTTP errors (4xx/5xx) rather
# than parsing an error page in the following cells.
details_req = requests.get(details_url, timeout=30)
details_req.raise_for_status()

In [None]:
# Create a Soup object for the article page.
# The parser is named explicitly so the parse tree does not depend on which
# optional HTML parsers are installed; 'html.parser' ships with Python.
details_soup = BeautifulSoup(details_req.text, 'html.parser')

In [None]:
import re

# AND expression with look aheads: both zero-width look-aheads have to succeed
# at the same position, so the searched text must contain "RichText" as well as
# "word-wrap", in any order.
regex = re.compile(
    '(?=.*RichText.*)'   # text contains "RichText"
    '(?=.*word-wrap.*)'  # ... and also contains "word-wrap"
)
# OR expression with look aheads
# regex = re.compile('(?=.*RichText.*|.*word-wrap.*)')

In [None]:
# Use HTML selector: all <div> elements whose class attribute matches the regex.
# find_all is the current method name; findAll is a deprecated legacy alias.
details_events = details_soup.find_all('div',{'class':regex})
details_events

In [None]:
# Iterate over all <p> tags inside the matched <div> elements and print their text
for devents in details_events:
    for p in devents.find_all("p"):
        print(p.text)

In [None]:
# Create a function for the text extraction explored above
# Define a function to automatically extract the text
def downloadText(url, timeout=30):
    """Download an article page and return the text of its paragraphs.

    Fetches ``url``, selects every ``<div>`` whose class attribute matches
    ``.*RichText.*`` and concatenates the text of all ``<p>`` elements found
    inside those divs. Note that only "RichText" is required here; the
    exploratory cell additionally demanded "word-wrap".

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One string in which every paragraph is preceded by a single space
        (so a non-empty result starts with a space); an empty string when no
        paragraph was found.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    details_req = requests.get(url, timeout=timeout)
    # Do not extract "article text" from an HTTP error page
    details_req.raise_for_status()
    # Explicit parser: the result must not depend on optional parser installs
    details_soup = BeautifulSoup(details_req.text, 'html.parser')
    regex = re.compile('.*RichText.*')
    details_events = details_soup.find_all('div', {'class': regex})
    # A single join avoids rebuilding the growing string for every paragraph
    return "".join(
        " " + paragraph.text
        for devents in details_events
        for paragraph in devents.find_all("p")
    )

In [None]:
# Test the function with the article link selected earlier (details_url)
print(downloadText(details_url))

In [None]:
# Download all images - Helper method
def downloadImage(url, file_name, timeout=30):
    """Download ``url`` and store the response body in ``file_name``.

    Args:
        url: Address of the image to download.
        file_name: Path of the file to write; an existing file is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
        OSError: If the target file cannot be written.
    """
    # Fetch first, so a failed request does not leave an empty file behind
    response = requests.get(url, timeout=timeout)
    # Abort on 4xx/5xx instead of saving an error page under an image name
    response.raise_for_status()
    # Binary mode: the bytes must reach the disk unmodified
    with open(file_name, "wb") as file:
        file.write(response.content)

In [None]:
# Download all images
# Files are numbered consecutively (0.jpg, 1.jpg, ...) over the articles that
# actually carry an image URL; articles without one do not consume a number.
# NOTE(review): every file gets a .jpg suffix regardless of the real image
# format - confirm this is acceptable.
image_urls = [e["image"] for e in my_events if e.get("image") is not None]
for i, image_url in enumerate(image_urls):
    downloadImage(image_url, str(i)+".jpg")