# Libraries

In [1]:
import requests

from pprint import pprint

from bs4 import BeautifulSoup, SoupStrainer
import re

# HTML

https://www.w3schools.com/html/html_examples.asp

In [2]:
%%html
<html>
  <head>
    <title>Eurostat</title>
  </head>
  <body>
    <h1>Welcome to Eurostat!</h1>
    <a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>
    <h2>Python Courses</h2>
    <ul>
      <li>Introduction to Python for Data Science</li>
      <li>Data Science for Structured Data</li>
      <li>Data Science for Unstructured Data</li>
      <li>Data for Science, how to scrape the Web</li>
      <li>Data Science for Big Data</li>
    </ul>
  </body>
</html>

In [3]:
%%html
<!-- Same page as before; the list items now carry id/class attributes
     that the selector examples further down rely on. -->
<html>
  <head>
    <title>Eurostat</title>
  </head>
  <body>
    <h1>Welcome to Eurostat!</h1>
    <a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>
    <h2>Python Courses</h2>
    <ul>
      <li id='first-course' class='odd-order'>Introduction to Python for Data Science</li>
      <li class='even-order'>Data Science for Structured Data</li>
      <li class='odd-order'>Data Science for Unstructured Data</li>
      <li class='even-order'>Data for Science, how to scrape the Web</li>
      <li class='odd-order'>Data Science for Big Data</li>
    </ul>
  </body>
</html>

In [4]:
# Well-formed sample page kept as a string so it can be parsed with BeautifulSoup.
simple_html = '''
<html>
  <head>
    <title>Eurostat</title>
  </head>
  <body>
    <h1>Welcome to Eurostat!</h1>
    <a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>
    <h2>Python Courses</h2>
    <ul>
      <li id='first-course' class='odd-order'>Introduction to Python for Data Science</li>
      <li class='even-order'>Data Science for Structured Data</li>
      <li class='odd-order'>Data Science for Unstructured Data</li>
      <li class='even-order'>Data for Science, how to scrape the Web</li>
      <li class='odd-order'>Data Science for Big Data</li>
    </ul>
  </body>
</html>
'''

# Beautiful Soup

- Library for extracting data from XML and HTML files
- Creates a parse tree and offers intuitive ways to navigate the tree 
- https://www.crummy.com/software/BeautifulSoup/bs4/doc/

In [5]:
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns about it), so results can differ between machines.
soup = BeautifulSoup(simple_html, 'lxml')

In [6]:
print(soup.prettify())

<html>
 <head>
  <title>
   Eurostat
  </title>
 </head>
 <body>
  <h1>
   Welcome to Eurostat!
  </h1>
  <a href="https://en.wikipedia.org/wiki/Eurostat">
   Take me to the wikipedia site
  </a>
  <h2>
   Python Courses
  </h2>
  <ul>
   <li class="odd-order" id="first-course">
    Introduction to Python for Data Science
   </li>
   <li class="even-order">
    Data Science for Structured Data
   </li>
   <li class="odd-order">
    Data Science for Unstructured Data
   </li>
   <li class="even-order">
    Data for Science, how to scrape the Web
   </li>
   <li class="odd-order">
    Data Science for Big Data
   </li>
  </ul>
 </body>
</html>



## Tags

In [7]:
soup.title

<title>Eurostat</title>

In [8]:
soup.body

<body>
<h1>Welcome to Eurostat!</h1>
<a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>
<h2>Python Courses</h2>
<ul>
<li class="odd-order" id="first-course">Introduction to Python for Data Science</li>
<li class="even-order">Data Science for Structured Data</li>
<li class="odd-order">Data Science for Unstructured Data</li>
<li class="even-order">Data for Science, how to scrape the Web</li>
<li class="odd-order">Data Science for Big Data</li>
</ul>
</body>

In [9]:
soup.ul

<ul>
<li class="odd-order" id="first-course">Introduction to Python for Data Science</li>
<li class="even-order">Data Science for Structured Data</li>
<li class="odd-order">Data Science for Unstructured Data</li>
<li class="even-order">Data for Science, how to scrape the Web</li>
<li class="odd-order">Data Science for Big Data</li>
</ul>

In [10]:
soup.ul.li

<li class="odd-order" id="first-course">Introduction to Python for Data Science</li>

### Name

In [11]:
soup.ul.name

'ul'

### Content

In [12]:
print(soup.ul.text)


Introduction to Python for Data Science
Data Science for Structured Data
Data Science for Unstructured Data
Data for Science, how to scrape the Web
Data Science for Big Data



In [13]:
print(soup.ul.string)

None


In [14]:
print(soup.li.string)

Introduction to Python for Data Science


## Robustness

In [15]:
broken_html = '''
<html>
  <head>
    <title>Eurostat</title>
  </head>
  <body>
    <h1>Welcome to Eurostat!</h1>
    <a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>
    <h2>Python Courses</h2>
    <ul>
      <li id='first-course' class='odd-order'>Introduction to Python for Data Science
      <li class='even-order'>Data Science for Structured Data
      <li class='odd-order'>Data Science for Unstructured Data
      <li class='even-order'>Data for Science, how to scrape the Web
      <li class='odd-order'>Data Science for Big Data
  </body>
</html>
'''

In [16]:
# How unclosed tags get repaired depends on the parser, so name it explicitly.
# NOTE(review): the sibling <li> elements in the output are what lxml produces;
# other parsers (e.g. 'html.parser') may nest the items instead.
soup = BeautifulSoup(broken_html, 'lxml')
print(soup.prettify())

<html>
 <head>
  <title>
   Eurostat
  </title>
 </head>
 <body>
  <h1>
   Welcome to Eurostat!
  </h1>
  <a href="https://en.wikipedia.org/wiki/Eurostat">
   Take me to the wikipedia site
  </a>
  <h2>
   Python Courses
  </h2>
  <ul>
   <li class="odd-order" id="first-course">
    Introduction to Python for Data Science
   </li>
   <li class="even-order">
    Data Science for Structured Data
   </li>
   <li class="odd-order">
    Data Science for Unstructured Data
   </li>
   <li class="even-order">
    Data for Science, how to scrape the Web
   </li>
   <li class="odd-order">
    Data Science for Big Data
   </li>
  </ul>
 </body>
</html>



## Navigation

In [17]:
soup.ul.parent.name

'body'

In [18]:
soup.ul.children

<list_iterator at 0x7f7cc008ef50>

In [19]:
list(soup.ul.children)

['\n',
 <li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>,
 <li class="even-order">Data Science for Structured Data
       </li>,
 <li class="odd-order">Data Science for Unstructured Data
       </li>,
 <li class="even-order">Data for Science, how to scrape the Web
       </li>,
 <li class="odd-order">Data Science for Big Data
   </li>]

In [20]:
# Text of every direct child of the <ul>, including the whitespace-only nodes.
[child.string for child in soup.ul.children]

['\n',
 'Introduction to Python for Data Science\n      ',
 'Data Science for Structured Data\n      ',
 'Data Science for Unstructured Data\n      ',
 'Data for Science, how to scrape the Web\n      ',
 'Data Science for Big Data\n  ']

In [21]:
# next_sibling is the current name; nextSibling is a deprecated BS3 alias.
soup.li.next_sibling

<li class="even-order">Data Science for Structured Data
      </li>

## Attributes

In [22]:
tag = soup.a
print(tag)
tag['href']

<a href="https://en.wikipedia.org/wiki/Eurostat">Take me to the wikipedia site</a>


'https://en.wikipedia.org/wiki/Eurostat'

In [23]:
tag.attrs

{'href': 'https://en.wikipedia.org/wiki/Eurostat'}

### ❓ Exercise
What are the attributes of the first Python course?

## Filters

### by Tag

In [24]:
soup.find('li')

<li class="odd-order" id="first-course">Introduction to Python for Data Science
      </li>

In [25]:
# find_all is the current name; findAll is a deprecated BS3 alias.
soup.find_all('li')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>, <li class="even-order">Data Science for Structured Data
       </li>, <li class="odd-order">Data Science for Unstructured Data
       </li>, <li class="even-order">Data for Science, how to scrape the Web
       </li>, <li class="odd-order">Data Science for Big Data
   </li>]

### by Attribute

In [26]:
# Every element whose class attribute is 'even-order' (find_all replaces the
# deprecated findAll alias).
soup.find_all(attrs={'class': 'even-order'})

[<li class="even-order">Data Science for Structured Data
       </li>, <li class="even-order">Data for Science, how to scrape the Web
       </li>]

In [27]:
soup.find(attrs={'id': True})

<li class="odd-order" id="first-course">Introduction to Python for Data Science
      </li>

## CSS Selectors

In [28]:
soup.select('li')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>, <li class="even-order">Data Science for Structured Data
       </li>, <li class="odd-order">Data Science for Unstructured Data
       </li>, <li class="even-order">Data for Science, how to scrape the Web
       </li>, <li class="odd-order">Data Science for Big Data
   </li>]

In [29]:
soup.select('ul > li')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>, <li class="even-order">Data Science for Structured Data
       </li>, <li class="odd-order">Data Science for Unstructured Data
       </li>, <li class="even-order">Data for Science, how to scrape the Web
       </li>, <li class="odd-order">Data Science for Big Data
   </li>]

In [30]:
soup.select('li#first-course')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>]

In [31]:
soup.select('li[id=first-course]')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>]

In [32]:
soup.find('li', id=re.compile('first'))

<li class="odd-order" id="first-course">Introduction to Python for Data Science
      </li>

In [33]:
soup.select('li.odd-order')

[<li class="odd-order" id="first-course">Introduction to Python for Data Science
       </li>, <li class="odd-order">Data Science for Unstructured Data
       </li>, <li class="odd-order">Data Science for Big Data
   </li>]

## ❓ Exercise
Explore the Eurostat website

In [34]:
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get('https://ec.europa.eu/eurostat', timeout=10)
# Fail loudly on HTTP errors instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')

In [35]:
# print the prettified html

In [36]:
## What's the site title?

In [37]:
## What's the first h1 tag?

In [38]:
## Locate the following content in the site's source
# .string is None for a tag with more than one child, which would make join()
# raise a TypeError; get_text() always returns a str.
print("\n".join(a.get_text() for a in soup.select('div.block > div > a')))

Services represented 73% of EU’s total GVA
Industry enterprises: largest share of trade value
Happy European Statistics Day!
Electricity and gas prices in the first half of 2021


# Regular Expressions
https://docs.python.org/3/library/re.html#module-re

In [39]:
train = 'the train RB 6807 from Luxembourg to Belval-Université leaves at 7h02.'

## Search

In [40]:
regex = re.compile('train')
regex.search(train)

<re.Match object; span=(4, 9), match='train'>

In [41]:
regex = re.compile('a')
regex.search(train)

<re.Match object; span=(6, 7), match='a'>

## Find

In [42]:
regex = re.compile('a')
regex.findall(train)

['a', 'a', 'a', 'a']

In [43]:
regex = re.compile('a')
[(m.start(), m.end(), m.group(0)) for m in regex.finditer(train)]

[(6, 7, 'a'), (41, 42, 'a'), (57, 58, 'a'), (62, 63, 'a')]

In [44]:
regex = re.compile('^t')
regex.findall(train)[0]

't'

In [45]:
# Raw string: '\w' in a normal string literal is an invalid escape sequence.
# Matches every whole word that contains the letter 'a'.
regex = re.compile(r'\w*a\w*')
[(m.start(), m.end(), m.group(0)) for m in regex.finditer(train)]

[(4, 9, 'train'), (37, 43, 'Belval'), (55, 61, 'leaves'), (62, 64, 'at')]

In [46]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
# Matches digits, a literal 'h', then digits (e.g. '7h02').
regex = re.compile(r'\d+h\d+')
regex.findall(train)

['7h02']

In [47]:
# Raw string: '\w' and '\d' in a normal string literal are invalid escape sequences.
# Matches two word characters, a space, then four digits (e.g. 'RB 6807').
regex = re.compile(r'\w{2} \d{4}')
regex.findall(train)

['RB 6807']

## ❓ Exercise

In [48]:
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get('http://rtl.lu', timeout=10)
# Fail loudly on HTTP errors instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'lxml')

In [49]:
print(soup.prettify())

<!DOCTYPE html>
<html class="theme-rtl" lang="lb">
 <head>
  <meta charset="utf-8"/>
  <title>
   RTL - Home
  </title>
  <meta content="1594823294110501" property="og:app_id"/>
  <meta content="RTL" property="og:site_name"/>
  <meta content="RTL - Home" property="og:title"/>
  <meta content="/assets/images/facebook-share/rtl_share.png" property="og:image"/>
  <meta content="https://www.rtl.lu" property="og:url"/>
  <meta content="website" property="og:type"/>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width,minimum-scale=1,initial-scale=1" name="viewport"/>
  <meta content="8Sn9q53ne28yyXaBRH5y3YaJJ3YEXb5KjuIfb3Xfs5U" name="google-site-verification"/>
  <meta content="app-id=360077592" name="apple-itunes-app"/>
  <link href="https://www.google-analytics.com" rel="dns-prefetch"/>
  <link href="https://imasdk.googleapis.com" rel="dns-prefetch"/>
  <link href="https://gabe.hit.gemius.pl" rel="dns-prefetch"/>
  <link hr

In [50]:
# How many <img> tags are on RTL.lu? (use Beautiful Soup or a regex on str(soup))

In [51]:
# Does RTL use custom tags for images?
# The dot is escaped so only a literal '.jpeg' suffix matches; an unescaped '.'
# matches any character before 'jpeg'.
soup.find_all(src=re.compile(r'\.jpeg$'))

[<img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/800/rtl2008.lu/nt/p/2021/10/21/17/18f14129c7718a0e66a7f0e7e68c9c1b.jpeg"/>,
 <img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/21/14/518081f05369138eef740eea8b6033eb.jpeg"/>,
 <img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/21/15/5cc53ead0914e99a67883de81aca9826.jpeg"/>,
 <img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/21/16/e77a559263bc4a43892db373e329bfc3.jpeg"/>,
 <img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/21/10/20ff79d52d5a19dd9e635be565cb0d94.jpeg"/>,
 <img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/30

In [52]:
# Find all "rtl-img" tags from today

In [53]:
# How many images are there for each day? Hint: use Counter from collections on the extracted dates

# SoupStrainer

In [54]:
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get('http://rtl.lu', timeout=10)
res.raise_for_status()
# parse_only keeps just the elements whose src attribute contains the date;
# the parser is named explicitly so the result does not depend on the install.
soup = BeautifulSoup(res.text,
                     'lxml',
                     parse_only=SoupStrainer(attrs={'src': re.compile('2021/10/20')}))
print(soup.prettify())

<!DOCTYPE html>
<img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/20/15/b66952422ad3e76ab4347bcad36c612f.jpeg"/>
<img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/220/rtl2008.lu/nt/p/2021/10/20/19/5c895c95d8b174e3b29722744f2acca9.jpeg"/>
<img alt="RTL" class="media-image__image responsive-img" loading="lazy" src="https://stock.rtl.lu/rtl/300/rtl2008.lu/nt/p/2021/10/20/15/b66952422ad3e76ab4347bcad36c612f.jpeg"/>
<rtl-img class="media-image__image responsive-img" height="" src="https://static.rtl.lu/rtl2008.lu/nt/p/2021/10/20/21/33a3d882c9a64dfdb2ead5332ddeff73.jpeg" width="620">
</rtl-img>
<rtl-img class="media-image__image responsive-img" height="" src="https://static.rtl.lu/rtl2008.lu/nt/p/2021/10/20/18/c058007b16339e0d2cb5d6c1f49606ad.jpeg" width="300">
</rtl-img>
<rtl-img class="media-image__image responsive-img" height="" src="https://static.rtl.lu/rtl

## ❓ Exercise
List all the titles of the national news!

1. Let's have a look at the website source first
2. Then devise a strategy
3. Then implement it in code

In [55]:
# A timeout keeps the cell from hanging forever on an unresponsive server.
res = requests.get('http://rtl.lu', timeout=10)
# Fail loudly on HTTP errors instead of silently parsing an error page.
res.raise_for_status()
soup = BeautifulSoup(res.text,
                     'lxml',
                     #parse_only=SoupStrainer(...)
                     )

In [56]:
# print(soup.prettify())

In [57]:
# national news titles