# Webpage data extraction using Beautiful Soup 4

For a list of references see:

https://blog.hartleybrody.com/web-scraping-cheat-sheet/#using-beautifulsoup


In [None]:
# conda install beautifulsoup4
# pip install requests
import requests
from bs4 import BeautifulSoup

In [None]:
# Request the webpage
url = "https://www.spiegel.de/international"
# The timeout keeps this cell from hanging forever on an unresponsive server;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
req = requests.get(url, timeout=30)
req.raise_for_status()

In [None]:
# Show the raw HTML of the response. The structure of the page itself is
# easier to explore in the browser (e.g. with Chrome DevTools).
req.text

In [None]:
# Save the website to disk as UTF-8 encoded bytes
websiteFileName = "website.htm"
with open(websiteFileName, mode="wb") as out_file:
    out_file.write(req.text.encode("utf-8"))

In [None]:
# Display the raw HTML of the response once more
req.text

In [None]:
# Create the BS4 object by parsing the downloaded HTML
# NOTE(review): 'html' names a markup type, not a specific parser; presumably
# the parser that gets picked depends on what is installed. Confirm, and
# consider pinning one (e.g. 'html.parser') for reproducible results.
soup = BeautifulSoup(req.text, 'html')

In [None]:
# Select all <article> elements by tag name
# (find_all is the current name of the legacy findAll alias)
events = soup.find_all('article')
events

In [None]:
# Number of <article> elements that were found
len(events)

In [None]:
# View one sample article: index 1, i.e. the second <article> that was found
event=events[1]
event

In [None]:
# Get the headline: the first <h2> inside the sample article
x=event.find('h2')
x

In [None]:
# Get the title: the "title" attribute of the first anchor tag within the headline
title=x.find("a")["title"]
title

In [None]:
# Get the article link: the "href" attribute of the first anchor tag within the headline
href=x.find("a")["href"]
href

In [None]:
# Get the <figure> element of the sample article (x is reused for it)
x=event.find('figure')

In [None]:
# Get the image src: the "src" attribute of the <img> tag inside the figure
# that carries the attribute data-image-el="img"
image= x.find("img",{'data-image-el':'img'})["src"]
image

In [None]:
# Switch to the first article and look up the <span> with the given CSS
# classes; the loop further down stores this span's text as the author
event=events[0]
x=event.find('span',{'class':'font-sansUI font-normal text-s text-shade-dark'})

In [None]:
# Print the span text only if the span was found
if x is not None:
    print(x.text)

In [None]:
def _extract_event_details(event):
    """Collect title, link, image URL and author from one <article> tag.

    A key is only added when the corresponding element and attribute are
    present, so the result should be read with ``dict.get``.

    Args:
        event: The parsed <article> element to inspect.

    Returns:
        A dict with any of the keys 'title', 'href', 'image' and 'author'.
    """
    details = {}

    # Title and link both live on the anchor inside the headline.
    headline = event.find('h2')
    anchor = headline.find("a") if headline is not None else None
    if anchor is not None:
        for key in ("title", "href"):
            value = anchor.get(key)
            if value is not None:
                details[key] = value

    # Not every article has a figure, and not every figure has the expected <img>.
    figure = event.find('figure')
    img = figure.find("img", {'data-image-el': 'img'}) if figure is not None else None
    if img is not None and img.get("src") is not None:
        details['image'] = img["src"]

    author = event.find('span', {'class': 'font-sansUI font-normal text-s text-shade-dark'})
    if author is not None:
        details['author'] = author.text

    return details


# One dict per article, in page order
my_events = [_extract_event_details(event) for event in events]


In [None]:
# Show the details collected for all articles
print(my_events)

In [None]:
# Get one link: the article URL of the event at index 2
# NOTE(review): this fails if fewer than three events were collected or if
# that event has no 'href' entry (the key is only set when an <h2> was found).
details_url=my_events[2]["href"]
details_url

Check the article page manually in Chrome DevTools using the following XPath expression:

//div[contains(@class,"RichText")]/p/text()

In [None]:
# Request the article page; the timeout avoids hanging on an unresponsive
# server and raise_for_status() fails early on an HTTP error status
details_req = requests.get(details_url, timeout=30)
details_req.raise_for_status()

In [None]:
# Create a Soup object for the article page
# NOTE(review): 'html' names a markup type, not a specific parser; presumably
# the parser that gets picked depends on what is installed. Confirm.
details_soup = BeautifulSoup(details_req.text, 'html')

In [None]:
import re

# AND expression: two look-aheads that must both succeed at the same
# position, so "RichText" and "word-wrap" have to occur in the same string
regex = re.compile(r'(?=.*RichText.*)(?=.*word-wrap.*)')

# OR expression: a single look-ahead holding both alternatives
# regex = re.compile(r'(?=.*RichText.*|.*word-wrap.*)')

In [None]:
# Select every <div> whose class attribute matches the regex
# (find_all is the current name of the legacy findAll alias)
details_events = details_soup.find_all('div',{'class':regex})
details_events

In [None]:
# Print the text of every <p> tag inside each matched <div>
for devents in details_events:
    for p in devents.find_all("p"):
        print(p.text)

In [None]:
# Create a function for the code above
# Define a function to automatically extract the text
def downloadText(url, timeout=30):
    """Download a page and return the text of its rich-text paragraphs.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of every <p> found inside a <div> whose class matches
        ``RichText``, each prefixed with a single space and concatenated in
        document order. An empty string when nothing matches.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    details_req = requests.get(url, timeout=timeout)
    # Fail early rather than extracting text from an error page.
    details_req.raise_for_status()
    details_soup = BeautifulSoup(details_req.text, 'html')
    regex = re.compile(r'.*RichText.*')
    text_blocks = details_soup.find_all('div', {'class': regex})
    # Join once instead of growing the string paragraph by paragraph.
    return "".join(
        " " + paragraph.text
        for block in text_blocks
        for paragraph in block.find_all("p")
    )

In [None]:
# Test the function with the article URL selected above
print(downloadText(details_url))

In [None]:
# Download all images - Helper method
def downloadImage(url, file_name, timeout=30):
    """Download ``url`` and store the response body in ``file_name``.

    Args:
        url: Address of the image to fetch.
        file_name: Path of the file to write; it is overwritten if it exists.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: If the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Fetch before opening the file so that a failed request neither leaves
    # an empty file behind nor stores an error page under the image name.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Open in binary mode: the body is written as raw bytes.
    with open(file_name, "wb") as file:
        file.write(response.content)

In [None]:
# Download all images, numbering the files consecutively (0.jpg, 1.jpg, ...).
# Events without an image are skipped and do not use up a number.
image_urls = [e["image"] for e in my_events if e.get("image") is not None]
for i, image_url in enumerate(image_urls):
    downloadImage(image_url, f"{i}.jpg")