# Installations
If you run the code on Colab

In [1]:
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
Hit:15 http://ppa.launchpad.net/graph

If you run this code locally:
- Install Selenium (via Anaconda)
- Install the Chrome browser
- Place the [Chromedriver](https://chromedriver.chromium.org/downloads) into your path (from where you started jupyter)

# Libraries



In [2]:
import sys

from bs4 import BeautifulSoup, SoupStrainer
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By

from pprint import pprint
import random
import time
import unicodedata

# Selenium

In [3]:
options = webdriver.ChromeOptions()
## if you run this code locally and want to see how Selenium operates the borwser, comment the following lines
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

## if you run this code locally and want to see how Selenium operates the borwser, uncomment the following line
#options.add_argument("--window-size=1920x1080")

## Open a Website

In [4]:
driver = webdriver.Chrome('chromedriver', options=options)
driver.get('https://cfl.lu')

## Inspect the DOM
- Inspect the website source in the browser (F12)

In [5]:
pprint(driver.page_source)

('<html class="no-js no-touch" lang="fr"><head>\n'
 '    <meta charset="utf-8">\n'
 '    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n'
 '    <meta name="viewport" content="width=device-width, initial-scale=1, '
 'maximum-scale=1, user-scalable=no">\n'
 '    <meta name="format-detection" content="telephone=no">\n'
 '\n'
 '    <title>CFL </title>\n'
 '    <meta name="description" content="CFL - Société nationale des chemins de '
 'fer luxembourgeois · FR · DE · EN. 2489 2489 Call Center. Votre voyage '
 'commence ici. ">\n'
 '\n'
 '    <link rel="apple-touch-icon" sizes="57x57" '
 'href="/Content/Images/favicon/apple-icon-57x57.png">\n'
 '    <link rel="apple-touch-icon" sizes="60x60" '
 'href="/Content/Images/favicon/apple-icon-60x60.png">\n'
 '    <link rel="apple-touch-icon" sizes="72x72" '
 'href="/Content/Images/favicon/apple-icon-72x72.png">\n'
 '    <link rel="apple-touch-icon" sizes="76x76" '
 'href="/Content/Images/favicon/apple-icon-76x76.png">\n'
 '    <link rel="ap

## Find Elements

### by XPATH

In [6]:
el = driver.find_element(by=By.XPATH, value='//a[contains(@onclick, "accept")]')
print(el.tag_name)

el.click()

a


### by ID

In [7]:
el = driver.find_element(by=By.ID, value='quicksearch-departure')
print(el.tag_name)

el.clear()
el.send_keys('Luxembourg')
el.send_keys(Keys.RETURN)

input


In [8]:
el = driver.find_element(by=By.ID, value='quicksearch-arrival')
print(el.tag_name)

el.clear()
el.send_keys('Belval-Université')
el.send_keys(Keys.RETURN)

input


## Build Soup from HTML

In [9]:
soup = BeautifulSoup(driver.page_source,
                     parse_only=SoupStrainer('div', attrs={'class': 'search__head'}))

In [10]:
print(soup.prettify())

<div class="search__head">
 <div class="search__hours-container">
  <p class="search__hours">
   <span class="js-departure-hour">
    05:02
   </span>
  </p>
  <svg class="icon__arrow">
   <use xlink:href="/Content/Images/svg/icons.svg#icon-arrow-right-1">
   </use>
  </svg>
  <p class="search__hours">
   <span class="js-arrival-hour">
    05:32
   </span>
  </p>
 </div>
 <div class="search__stop-info order-md-3" data-original-title="Arrêts trains modifiés!" data-toggle="tooltip" data-trigger="hover focus" title="">
  <svg class="icon">
   <use xlink:href="/Content/Images/svg/icons.svg#icon-info">
   </use>
  </svg>
 </div>
 <div class="search__head-infos">
  <div class="search__head-inner">
   <p class="search__duration">
    <svg class="icon">
     <use xlink:href="/Content/Images/svg/icons.svg#icon-34-heure">
     </use>
    </svg>
    <span class="js-travel-time">
     00:30
    </span>
   </p>
  </div>
  <p class="search__connections-number">
   Pas de changement
  </p>
 </div>
</

## Extract Information

In [11]:
print('The next trains from Luxembourg to Belval-Université leave at:')
set(sorted([train.string.strip() for train in soup.find_all('span', attrs={'js-departure-hour'})]))

The next trains from Luxembourg to Belval-Université leave at:


{'05:02', '05:21', '05:32', '05:51', '06:02'}

## ❓ Exercise
Get more connections:
- Locate the button that loads more connections on www.cfl.lu
- Click on it (el.click)
- Wait for 5 seconds for the website to load (time.sleep(5))
- Scrape the resulting page
- Parse the HTML and extract the departure times

## Close the Browser

In [12]:
driver.quit()

# Eurostat Quiz

In [13]:
%%html
<iframe src="https://ec.europa.eu/eurostat/cache/quiz/?lang=en" width="100%"/ height='800'>

## Navigate to page

In [14]:
driver = webdriver.Chrome('chromedriver', options=options)
driver.get('https://ec.europa.eu/eurostat/cache/quiz/?lang=en')

## Accept Cookies

In [16]:
el = driver.find_element(by=By.XPATH, value='//a[contains(@href, "refuse")]')
el.click()
el = driver.find_element(by=By.XPATH, value='//a[contains(@href, "close")]')
el.click()

## Start Game

In [17]:
el = driver.find_element(by=By.XPATH, value='//button[contains(@ng-show, "showplay")]')
el.click()

## Select Question

In [18]:
els = driver.find_elements(by=By.XPATH, value='//a[contains(@class, "question") and not(contains(@class, "answered"))]')
el = random.sample(els, 1)[0]
el.click()

## ❓ Exercise

Extract the quiz data

In [19]:
soup = BeautifulSoup(driver.page_source, 
                     parse_only=SoupStrainer('div', attrs={'class': 'modal-dialog'}))

In [20]:
print(soup.prettify())

<div class="modal-dialog">
 <div class="modal-content theme-4" ng-class="{ 'wrong animated shake': wrongClass, 'correct animated bounceIn': correctClass }">
  <div class="modal-body" ng-show="choiceansw">
   <div class="title">
    <img alt="" ng-src="images/themes/theme4w.png" src="images/themes/theme4w.png"/>
    <span class="ng-binding" ng-bind="themeTitle | decodeURIComponent">
     In­dus­try, trade and ser­vices
    </span>
   </div>
   <p class="ng-binding" ng-bind="questionText | decodeURIComponent">
    Which country has the highest number of small and medium enterprises (SME) in manufacturing?
   </p>
   <form class="ng-pristine ng-valid" role="form">
    <div class="radio">
     <input class="ng-pristine ng-untouched ng-valid ng-empty" id="theme-4-Germany" name="quiz-radio" ng-model="choice.value" type="radio" value="1"/>
     <label class="ng-binding" for="theme-4-Germany">
      Germany
     </label>
    </div>
    <div class="radio">
     <input class="ng-pristine ng-unto

### Category

In [21]:
#category = ...

### Question

In [22]:
# question = ...

### Choices

In [23]:
# choices = [...]

### Answer

In [24]:
# answer = ...

## Select Answer

In [25]:
els = driver.find_elements(by=By.TAG_NAME, value='input')
el = random.sample(els, 1)[0]
el.click()

## Submit Answer

In [26]:
el = driver.find_element(by=By.XPATH, value='//button[contains(., "Submit")]')
el.click()

## Continue

In [27]:
try:
    el = driver.find_element(by=By.XPATH, value='//div[3]/div/button')
    el.click()
except:
    el = driver.find_element(by=By.XPATH, value='//div[2]/div/button')
    el.click()

# Loop

In [31]:
quiz = []

# load the quiz site
driver = webdriver.Chrome('chromedriver', options=options)
driver.get('https://ec.europa.eu/eurostat/cache/quiz/?lang=en')
driver.implicitly_wait(1)

# accept cookies
el = driver.find_element(by=By.XPATH, value='//a[contains(@href, "refuse")]')
el.click()
el = driver.find_element(by=By.XPATH, value='//a[contains(@href, "close")]')
el.click()
driver.implicitly_wait(2)

# start the game
el = driver.find_element(by=By.XPATH, value='//button[contains(@ng-show, "showplay")]')
el.click()

remaining_questions = 9
while remaining_questions:
    
    # select a question
    els = driver.find_elements(by=By.XPATH, value='//a[contains(@class, "question") and not(contains(@class, "answered"))]')
    el = random.sample(els, 1)[0]
    el.click()
    driver.implicitly_wait(1)
    
    # scrape the question
    soup = BeautifulSoup(driver.page_source, 
                     parse_only=SoupStrainer('div', attrs={'class': 'modal-dialog'}))
    category = soup.find('div', attrs={'class':'title'}).find('span').text.replace('\xad','')
    question = soup.find('p', attrs={'ng-bind': re.compile('questionText ')}).text
    choices = list(map(lambda l : l.text.strip(), soup.select('div.radio > label')))
    answer = soup.find(attrs={'ng-bind': re.compile('questionMsg')}).text.replace('\xad','')
    quiz.append({
        'category': category,
        'question': question,
        'choices': choices,
        'answer': answer
    })
    
    # select an answer
    els = driver.find_elements(by=By.TAG_NAME, value='input')
    el = random.sample(els, 1)[0]
    el.click()
    driver.implicitly_wait(1)
    
    # submit answer
    el = driver.find_element(by=By.XPATH, value='//button[contains(., "Submit")]')
    el.click()
    driver.implicitly_wait(1)
    
    # continue to next question
    try:
        el = driver.find_element(by=By.XPATH, value='//div[2]/div/button')
        el.click()
    except:
        el = driver.find_element(by=By.XPATH, value='//div[3]/div/button')
        el.click()
    driver.implicitly_wait(1)
    
    els = driver.find_elements(by=By.XPATH, value='//a[contains(@class, "question") and not(contains(@class, "answered"))]')
    remaining_questions = len(els)

In [32]:
pprint(quiz)

[{'answer': 'Romania (+27%) has the highest growth rate of production in '
            'construction (Hungary +21%, Cyprus +12%).',
  'category': 'Industry, trade and services',
  'choices': ['Romania', 'Cyprus', 'Hungary'],
  'question': 'Which country has the highest annual growth rate of production '
              'in construction?'},
 {'answer': 'Luxembourg (676 cars per 1 000 inhabitants) has the largest '
            'number (Cyprus 629, Slovenia 549).',
  'category': 'Transport',
  'choices': ['Luxembourg', 'Cyprus', 'Slovenia'],
  'question': 'Which country has most cars per inhabitant?'},
 {'answer': 'Sweden (28.0 million hectares) has the largest forest area '
            '(Finland 22.4 mn ha, Spain 18.6 mn ha).',
  'category': 'Agriculture and fisheries',
  'choices': ['Finland', 'Sweden', 'Spain'],
  'question': 'Which country has the largest forest area?'},
 {'answer': 'Netherlands (59%) has the highest share of households living in '
            'cities (Cyprus 52%, Portu