## Class 02 - Web Scrapping with `selenium`

### Downloading `selenium`

In [None]:
import re

from nltk import word_tokenize
#!pip install selenium
#!pip install nltk
#!pip install spacy

In [2]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import nltk

### ECB Exercise: monetary policy statements

In [3]:
driver = webdriver.Chrome()
driver.get("https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/html/index.en.html")

for i in range(0, 10000, 200):
    driver.execute_script(f"window.scrollBy(0, {i});")
    time.sleep(1)

In [4]:
page_source = driver.page_source
soup = BeautifulSoup(page_source)


link_objects = soup.findAll("a")
urls = [link.get('href') for link in link_objects] # url starts with a "/" so we must add the main web link
urls = [url for url in urls if url]
urls = [url for url in urls if "/press/press_conference/monetary-policy-statement/" in url]
urls = [url for url in urls if ".en.html" in url]
#urls
len(link_objects)

973

In [5]:
for url in urls:
    req = requests.get("https://www.ecb.europa.eu"+url)
    soup = BeautifulSoup(req.content)

    text = ""

    for section in soup.findAll("div", {"class" : "section"}):
        for content in soup.findAll(["h2","p"]):
            text += content.text + "\n"

print(text)

Our monetary policy strategy, the tools we use and the impact they have
Insights into our work on financial stability and payments and market infrastructures
Access to all ECB statistics and background information
All you need to know about our common currency
In-depth studies and expert analyses covering diverse topics and fields
You may also be interested in:
Christine Lagarde, President of the ECB,Luis de Guindos, Vice-President of the ECB
Frankfurt am Main, 30 January 2025    
Good afternoon, the Vice-President and I welcome you to our press conference.
The Governing Council today decided to lower the three key ECB interest rates by 25 basis points. In particular, the decision to lower the deposit facility rate – the rate through which we steer the monetary policy stance – is based on our updated assessment of the inflation outlook, the dynamics of underlying inflation and the strength of monetary policy transmission. 
The disinflation process is well on track. Inflation has contin

### Tools for finding text

In [13]:
import re
text = 'we can meet at 11, call me at +68942585 when you want today before 18:00!'
print(re.search('a', text))
print(re.findall('a', text))
print(list(re.finditer('a', text)))

<re.Match object; span=(4, 5), match='a'>
['a', 'a', 'a', 'a', 'a', 'a']
[<re.Match object; span=(4, 5), match='a'>, <re.Match object; span=(12, 13), match='a'>, <re.Match object; span=(20, 21), match='a'>, <re.Match object; span=(27, 28), match='a'>, <re.Match object; span=(50, 51), match='a'>, <re.Match object; span=(57, 58), match='a'>]


### Replacing with `re.sub()`

In [53]:
text = 'we can meet at 11, call me at +68942585 when you want today before 18:00!'
re.sub("\+68942585", '[PHONE_NUMBER]', text)


  re.sub("\+68942585", '[PHONE_NUMBER]', text)


'we can meet at 11, call me at [PHONE_NUMBER] when you want today before 18:00!'

### Fetch numbers on the text that begins with "+" and has at least 7 digits `{7,}`

In [52]:
text = 'we can meet at 11, call me at +68942585 when you want today before 18:00!'
re.findall("\+\d{7,}", text) #fetch whatever numbers on the text that begins with "+" and has at least 7 digits "{7,}"

  re.findall("\\+\d{7,}", text) #fetch whatever numbers on the text that begins with "+" and has at least 7 digits "{7,}"


['+68942585']

### Retrieving pieces of text that have the structure `string+"@"+string`

In [51]:
test_text = "Contact us: info@example.com or support@site.org"
re.findall(r'\S+@\S+', test_text)

['info@example.com', 'support@site.org']

### Tokenization: finding the smallest unit of analysis

could be whole words, sub words (word piece tokenization: "some words can be splited into smaller pieces"). With Word Piece tokenization you can reduce the dictionary of tokens and be more computationally efficient.

- Ex: "buy" and "buyed". *Buy* becomes one token and *"ed"* becomes a token that is added to indicate the past particple.

In [58]:
text = "Hello my name is Advart. I live in Paris"
text.split() #most naive way to tokenize

['Hello', 'my', 'name', 'is', 'Advart.', 'I', 'live', 'in', 'Paris']

In [63]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)
tokens

['Hello', 'my', 'name', 'is', 'Advart', '.', 'I', 'live', 'in', 'Paris']

In [66]:
from nltk.corpus import stopwords

stops = stopwords.words('english')
stops

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [75]:
sentence = "The workers have nothing to lose but their chains"

def remove_stopwords(sentence: str):

tokens = word_tokenize(sentence)
stops = stopwords.words('english')
tokens = [token for token in tokens not in stops]
print(tokens)

TypeError: 'bool' object is not iterable