In [1]:
# import modules

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from getpass import getpass
import pandas as pd
import time
import csv
import os

### In order to use Selenium, you need the geckodriver file saved in your machine.
### I give more details about this in the Introduction to the following Medium article:
https://medium.com/analytics-vidhya/search-automation-in-google-translate-download-translations-with-selenium-3a8c8e136b0e


In [2]:
# Path to the chromedriver file in Windows and Linux.
# You need to change them to your local path.
linux_path = "my/Linux/path/chromium.chromedriver"
windows_path = r"C:\Users\my\Windows\path\chromedriver.exe"

# escolhe o path do chromedriver, de acordo com o sistema operacional
if os.name == "posix":
    path = linux_path
    
else:
    path = windows_path

In [3]:
# Create an Options object and set headless attribute to False (which is the default).
# Use chrome_options.headless = True if you don't want the browser window to be physically opened. 
chrome_options = Options()
print(type(chrome_options))
chrome_options.headless = False

# Save a webdriver object in the driver variable
driver = webdriver.Chrome(executable_path = path,
                          options = chrome_options)

# Check object type
type(driver)

<class 'selenium.webdriver.chrome.options.Options'>


selenium.webdriver.chrome.webdriver.WebDriver

In [4]:
# Navigate to the page to be scraped
url = "https://quotes.toscrape.com/"
driver.get(url)

In [5]:
# Get the current url
info = driver.current_url
print(info)

# Maximize window
driver.maximize_window()

https://quotes.toscrape.com/


### In Python, we have the following webdriver methods to find HTML elements from a webpage:  

1. driver.find_element_by_xpath()
2. driver.find_element_by_css_selector()
3. driver.find_element_by_tag_name()
4. driver.find_element_by_class_name()
5. driver.find_element_by_id()
6. driver.find_element_by_name()
7. driver.find_element_by_link_text()
8. driver.find_element_by_partial_link_text()

In [6]:
# Get the login link element by using Xpath
login_xpath = "/html/body/div/div[1]/div[2]/p/a"
login_link = driver.find_element_by_xpath(login_xpath)
type(login_link)

selenium.webdriver.remote.webelement.WebElement

In [7]:
# This attribute returns the element tag name
tag_name = login_link.tag_name
print("tag name:", tag_name)

# This attribute returns the element inner text
text = login_link.text
print("inner text:", text)

# This method returns an element HTML attribute (in this case, "href")
href_attribute = login_link.get_attribute("href")
print("href attribute:", href_attribute)

tag name: a
inner text: Login
href attribute: https://quotes.toscrape.com/login


In [8]:
# Click on the element
login_link.click()

In [9]:
# Go back to the last page and wait 2.5 seconds
driver.back()
time.sleep(2.5)
print("ok")

ok


In [10]:
# Go forward in the browser history and wait 2 seconds
driver.forward()
time.sleep(2)
print("ok")

ok


In [11]:
# This code uses a css selector to get the link to the main page
css_selector = "div.header-box.row > div.col-md-8 > h1 > a"
main_page_link = driver.find_element_by_css_selector(css_selector)
type(main_page_link)

selenium.webdriver.remote.webelement.WebElement

In [12]:
# Get the main_page_link element info and print it
tag_name = main_page_link.tag_name
text = main_page_link.text
style_attribute = main_page_link.get_attribute("style")

print("tag name:", tag_name)
print("inner text:", text)
print("style attribute:", style_attribute)

# Click on the link
main_page_link.click()

tag name: a
inner text: Quotes to Scrape
style attribute: text-decoration: none;


In [13]:
# This function makes the browser return to the main page
def return_to_main_page():
    css_selector = "div.header-box.row > div.col-md-8 > h1 > a"
    driver.find_element_by_css_selector(css_selector).click()

### In Python, if you write "driver.find_elements_..." (note the additional "s"), we can use the same kind of webdriver methods listed before to find all the HTML elements from a webpage with a specific characteristic. These methods return a list of WebElement objects:  

1. driver.find_elements_by_xpath()
2. driver.find_elements_by_css_selector()
3. driver.find_elements_by_tag_name()
4. driver.find_elements_by_class_name()
5. driver.find_elements_by_id()
6. driver.find_elements_by_name()
7. driver.find_elements_by_link_text()
8. driver.find_elements_by_partial_link_text()

In [14]:
# This code finds all links in page 1 by using the tag name
return_to_main_page()

all_links_page_1 = driver.find_elements_by_tag_name("a")

print("number of links in main page:",
      len(all_links_page_1),
      "\n")

print("object all_links_page_1:\ntype:",
      type(all_links_page_1),
      "\n")

print("seventh object in the all_links_page_1 list:\ntype:",
      all_links_page_1[7])

number of links in main page: 55 

object all_links_page_1:
type: <class 'list'> 

seventh object in the all_links_page_1 list:
type: <selenium.webdriver.remote.webelement.WebElement (session="8225bebb900485d56ea2d2e4b186a97a", element="264ae49d-cfa4-424f-9bcc-3ba610a0eee3")>


In [15]:
# The function below, when used in a webElement with the "a" tag name,
# returns the link inner text and its url.
# Both info are also printed to the console if print_output = True
def show_links_info(link_element, print_output = True):
    text = link_element.text
    url = link_element.get_attribute("href")
    
    if (print_output):
        print(f"{text}: {url}")
        
    return text, url

In [16]:
# Create a string with all the links in the first page
saved_as_string = ""

for link in all_links_page_1:
    text, url = show_links_info(link)
    saved_as_string += f"{text}: {url}\n" 

Quotes to Scrape: https://quotes.toscrape.com/
Login: https://quotes.toscrape.com/login
(about): https://quotes.toscrape.com/author/Albert-Einstein
change: https://quotes.toscrape.com/tag/change/page/1/
deep-thoughts: https://quotes.toscrape.com/tag/deep-thoughts/page/1/
thinking: https://quotes.toscrape.com/tag/thinking/page/1/
world: https://quotes.toscrape.com/tag/world/page/1/
(about): https://quotes.toscrape.com/author/J-K-Rowling
abilities: https://quotes.toscrape.com/tag/abilities/page/1/
choices: https://quotes.toscrape.com/tag/choices/page/1/
(about): https://quotes.toscrape.com/author/Albert-Einstein
inspirational: https://quotes.toscrape.com/tag/inspirational/page/1/
life: https://quotes.toscrape.com/tag/life/page/1/
live: https://quotes.toscrape.com/tag/live/page/1/
miracle: https://quotes.toscrape.com/tag/miracle/page/1/
miracles: https://quotes.toscrape.com/tag/miracles/page/1/
(about): https://quotes.toscrape.com/author/Jane-Austen
aliteracy: https://quotes.toscrape.com/

In [17]:
# The code below saves all links from page 1 into a CSV

# Open CSV file
file = open("links.csv",
            "w",
            encoding = "UTF-8",
            newline = "")

# Create writer object from csv module
csv_writer = csv.writer(file)

# insert CSV header:
csv_writer.writerow(["index", "inner_text", "link"])

# Colect links info and save it as CSV rows
for index, link in enumerate(all_links_page_1):
    links_info = show_links_info(link, print_output = False)
    row = [index + 1,
           links_info[0].strip(),
           links_info[1]]
    csv_writer.writerow(row)


# close file
file.close()

# show first rows from the scraped data
df = pd.read_csv("links.csv", index_col="index")
df.head()

Unnamed: 0_level_0,inner_text,link
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Quotes to Scrape,https://quotes.toscrape.com/
2,Login,https://quotes.toscrape.com/login
3,(about),https://quotes.toscrape.com/author/Albert-Eins...
4,change,https://quotes.toscrape.com/tag/change/page/1/
5,deep-thoughts,https://quotes.toscrape.com/tag/deep-thoughts/...


In [18]:
# This code clicks on the "next" link and moves along the pages.
# When the last page is reached and there is no next link, break the loop.
return_to_main_page()

i = 1

while True:
    print("page", i)
    i += 1
        
    next_link_as_list = driver.find_elements_by_css_selector("li.next > a")

    if len(next_link_as_list) == 0:
        break
    
    next_link_as_list[0].click()
    time.sleep(2)

print("\nwe reached the last page")
return_to_main_page()
print("back to page 1\n")

page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10

we reached the last page
back to page 1



In [19]:
# From the code above, we discovered that there are 10 pages in the website.
# The function below goes to a chosen page and it will raise an error
# if the argument passed to it is not a number between 1 and 10

def go_to_page(page_number = 1):
    message = "Provide an integer number between 1 and 10 as argument" 
    if isinstance(page_number, str):
        raise TypeError("Error: you passed a string as argument. " + message)
        
    if (not isinstance(page_number, (int, float)) or
        page_number < 1 or
        page_number > 10):
        
        raise TypeError(message)
        
    if page_number % 1 != 0:
        page_number = int(page_number)
        print("page_number truncated to", page_number)
    
    if page_number != 1:
        base_url_page = "https://quotes.toscrape.com/page/"
        driver.get(f"{base_url_page}{page_number}")    
    else: 
        return_to_main_page()

In [20]:
# Go to page 3, wait 2.5 seconds and then go to page 7
go_to_page(3)
print(driver.current_url)

time.sleep(2.5)

go_to_page(7)
print(driver.current_url)

https://quotes.toscrape.com/page/3/
https://quotes.toscrape.com/page/7/


In [21]:
# This code truncates 3.141592653589793 to 3 and then go to page 3
import math

go_to_page(math.pi)
time.sleep(2.5)
driver.current_url

page_number truncated to 3


'https://quotes.toscrape.com/page/3/'

In [22]:
# The following function calls would return errors.
# Comment them out to check the error messages.


go_to_page("page number 3")
go_to_page("3")
go_to_page("3.1415")
go_to_page(25)
go_to_page(97.75)

TypeError: Error: you passed a string as argument. Provide an integer number between 1 and 10 as argument

In [23]:
# The code below will extract the page number from page 10
go_to_page(10)

this_url = driver.current_url
print("current url:", this_url)

url_parts = this_url.split("/")
page_number = url_parts[-2]
print("page number:", page_number)

return_to_main_page()

current url: https://quotes.toscrape.com/page/10/
page number: 10


In [24]:
# This code reproduces all the first ten quotes as a long string.
# It shows how you can use the WebElement.text attribute to access
# the inner text from all children elements at once.
quotes_divs_list = driver.find_elements_by_css_selector("div.quote")
string = ""

for element in quotes_divs_list:
    string += element.text + "\n\n"

print(string)

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
by Albert Einstein (about)
Tags: change deep-thoughts thinking world

“It is our choices, Harry, that show what we truly are, far more than our abilities.”
by J.K. Rowling (about)
Tags: abilities choices

“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
by Albert Einstein (about)
Tags: inspirational life live miracle miracles

“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
by Jane Austen (about)
Tags: aliteracy books classic humor

“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
by Marilyn Monroe (about)
Tags: be-yourself inspirational

“Try not to become a man of success. Rather become a man of value.”
by Albert Einstein (about)
Tags: adulthood success value

“It is bet

In [25]:
# With this code one can get all the info for the quotes in the current page
return_to_main_page()

quotes_text = [item.text for item in
               driver.find_elements_by_css_selector("span.text")]

authors_names = [item.text for item in
                driver.find_elements_by_css_selector("small.author")] 

authors_links = [item.get_attribute("href") for item in 
                 driver.find_elements_by_partial_link_text("(about)")]

tags_text = [item.text for item in
             driver.find_elements_by_css_selector("div.tags")]

In [26]:
# Now, we only need to loop over all pages and save the quotes info into a CSV
return_to_main_page()
page_number = 0
i = 0

# Open CSV file
file = open("quotes.csv",
            "w",
            encoding = "UTF-8",
            newline = "")

# Create writer object from csv module
csv_writer = csv.writer(file)

# Initialize list of lists with data rows:
rows = [
    ["index", "quote", "author",
     "author_biography_link", "tags"]
]

while True:
    page_number += 1
    print("page", page_number)
    
    divs_quote_list = driver.find_elements_by_css_selector("div.quote")
    
    for div in divs_quote_list:
        quote_text = div.find_element_by_css_selector("span.text").text
        author_name = div.find_element_by_css_selector("small.author").text
        author_link = div.find_element_by_partial_link_text("(about)")
        tags_text = div.find_element_by_css_selector("div.tags").text
        
        try:
            tags_text = tags_text.lower().split("tags: ")[1]
        except:
            tags_text = ""
        
        row = [len(rows),
               quote_text[1:-1],
               author_name,
               author_link.get_attribute("href"),
               tags_text]
    
        rows.append(row)
  
    next_link_as_list = driver.find_elements_by_css_selector("li.next > a")

    if len(next_link_as_list) == 0:
        break
    
    next_link_as_list[0].click()
    time.sleep(2)
    
print("\nwe reached the last page")
return_to_main_page()
print("back to page 1\n")

# Write rows to the CSV file and close it
csv_writer.writerows(rows)
file.close()

# Show first rows from the scraped data
df = pd.read_csv("quotes.csv", index_col="index")
print("data frame dimensions:", df.shape)
df.head()

page 1
page 2
page 3
page 4
page 5
page 6
page 7
page 8
page 9
page 10

we reached the last page
back to page 1

data frame dimensions: (100, 4)


Unnamed: 0_level_0,quote,author,author_biography_link,tags
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,The world as we have created it is a process o...,Albert Einstein,https://quotes.toscrape.com/author/Albert-Eins...,change deep-thoughts thinking world
2,"It is our choices, Harry, that show what we tr...",J.K. Rowling,https://quotes.toscrape.com/author/J-K-Rowling,abilities choices
3,There are only two ways to live your life. One...,Albert Einstein,https://quotes.toscrape.com/author/Albert-Eins...,inspirational life live miracle miracles
4,"The person, be it gentleman or lady, who has n...",Jane Austen,https://quotes.toscrape.com/author/Jane-Austen,aliteracy books classic humor
5,"Imperfection is beauty, madness is genius and ...",Marilyn Monroe,https://quotes.toscrape.com/author/Marilyn-Monroe,be-yourself inspirational


In [27]:
# Each author has a biography page. We can get that info too.
# This will get all the unique authors biography links
unique_links_biographies = set(df["author_biography_link"])
print("total of biographical pages:",
      len(unique_links_biographies),
      "\n")

# Open CSV file
file = open("authors_info.csv",
            "w",
            encoding = "UTF-8",
            newline = "")

# Create writer object from csv module
csv_writer = csv.writer(file)

# Initialize list of lists with data rows:
rows = [
    ["index", "name", "born_date",
     "born_location", "description"]
]


# Now we can loop over these biography links and save their info in a CSV
for link in unique_links_biographies:
    driver.get(link)
    time.sleep(3)
    
    index = len(rows)
    name = driver.find_element_by_css_selector("h3.author-title").text
    born_date = driver.find_element_by_css_selector("span.author-born-date").text
    born_location = driver.find_element_by_css_selector("span.author-born-location").text
    description = driver.find_element_by_css_selector("div.author-description").text
    
    print(index, "of", len(unique_links_biographies))
    
    row = [index, name, born_date,
           born_location[3:], description]
    
    rows.append(row) 

print("\nwe reached the last page")
return_to_main_page()
print("back to page 1\n")

# Write rows to the CSV file and close it
csv_writer.writerows(rows)
file.close()

# Show first rows from the scraped data
df = pd.read_csv("authors_info.csv", index_col="index")
print("data frame dimensions:", df.shape)
df.head()

total of biographical pages: 50 

1 of 50
2 of 50
3 of 50
4 of 50
5 of 50
6 of 50
7 of 50
8 of 50
9 of 50
10 of 50
11 of 50
12 of 50
13 of 50
14 of 50
15 of 50
16 of 50
17 of 50
18 of 50
19 of 50
20 of 50
21 of 50
22 of 50
23 of 50
24 of 50
25 of 50
26 of 50
27 of 50
28 of 50
29 of 50
30 of 50
31 of 50
32 of 50
33 of 50
34 of 50
35 of 50
36 of 50
37 of 50
38 of 50
39 of 50
40 of 50
41 of 50
42 of 50
43 of 50
44 of 50
45 of 50
46 of 50
47 of 50
48 of 50
49 of 50
50 of 50

we reached the last page
back to page 1

data frame dimensions: (50, 4)


Unnamed: 0_level_0,name,born_date,born_location,description
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Eleanor Roosevelt,"October 11, 1884",The United States,Anna Eleanor Roosevelt was an American politic...
2,Suzanne Collins,"August 11, 1962","Hartford, Connecticut, The United States",Librarian Note: There is more than one author ...
3,C.S. Lewis,"November 29, 1898","Belfast, Ireland",CLIVE STAPLES LEWIS (1898–1963) was one of the...
4,Garrison Keillor,"August 07, 1942","Anoka, Minnesota, The United States",Garrison Keillor (born Gary Edward Keillor on ...
5,Alfred Tennyson,"August 06, 1809","Somersby, Lincolnshire, The United Kingdom","Alfred Tennyson was born in Somersby, Lincolns..."


In [28]:
# Now we go back to the login page and play a little with the text input boxes.

# Go to the login page
login_xpath = "/html/body/div/div[1]/div[2]/p/a"
driver.find_element_by_xpath(login_xpath).click()

# Save the text input boxes elements in variables, using their id (two forms):
username_input = driver.find_element_by_id("username") 
password_input = driver.find_element_by_css_selector("#password")

In [29]:
# Username and password values
# (one can also use base::readline() or getPass::getPass() 
# to ask for user input)
username = "Fabrício"
password = "1234 is not a secure password!"

# When dealing with input boxes, it is good practice to clear their value first
username_input.clear()

# Send the username information (it needs to be passed as a list)
username_input.send_keys(username)

# Do the same with password
password_input.clear()
password_input.send_keys(password)
time.sleep(2)

# Once you don't need the username and password variables anymore, delete them
del username, password

In [30]:
# Find the submit button
submit_button = driver.find_element_by_css_selector ("input.btn.btn-primary")

# Use the submit() method to submit the form information
submit_button.submit()

In [31]:
# let's go to another page and learn more about how to manipulate forms
url = "http://httpbin.org/forms/post"
driver.get(url)
time.sleep(3)

In [32]:
############# CHALLENGE ################
# Ask a pizza using the form.
# Use RSelenium to do that.
# Provide fictional data and order the following kind of pizza:
# * large size
# * bacon, extra cheese and mushroom toppings
# * 19h45 delivery time
# * give some delivery instructions
# 
# Then hit the submit button.
# You will see a JSON page with your orders info.
# 
# You can tell the browser to go back to the order page
# and play with it again.
#
# Can you make a code to automate ordering 
# 20 different pizzas?
# Try it yourself. You most certainly can!
########################################

In [34]:
# This code choose the large option in the radio buttons
radio_buttons = driver.find_elements_by_css_selector('input[type="radio"]')
radio_buttons[-1].click()

# Choose the first, second and fourth toppings to your pizza
checkboxes = driver.find_elements_by_css_selector('input[name="topping"]')
[element.click() for index, element in enumerate(checkboxes) if index != 2]

# Set delivery time to 19h45
clock = driver.find_element_by_css_selector('input[type="time"]')
clock.send_keys("19:45")

poetry = """Two households, both alike in dignity\n
(In fair Verona, where we lay our scene),\n
From ancient grudge break to new mutiny,\n
Where civil blood makes civil hands unclean.\n
From forth the fatal loins of these two foes\n
A pair of star-crossed lovers take their life;\n
Whose misadventured piteous overthrows\n
Doth with their death bury their parents’ strife.\n
The fearful passage of their death-marked love\n
And the continuance of their parents’ rage,\n
Which, but their children’s end, naught could remove,\n
Is now the two hours’ traffic of our stage;\n
The which, if you with patient ears attend,\n
What here shall miss, our toil shall strive to mend.\n"""

# Send aditional information to the textarea input
textarea = driver.find_element_by_tag_name("textarea")
textarea.send_keys(poetry)

# Submit form and go back
driver.find_element_by_css_selector("p > button").submit()
time.sleep(5)

driver.back()

# Refresh the form page:
driver.refresh()

In [35]:
# When you are done working with selenium, 
# Quit the browser and end the session
driver.quit()
del driver