In [1]:
# Importing Selenium and all related modules
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

#Importing the libraries to get the webdriver
import requests
from bs4 import BeautifulSoup

import time

We are going to scrape the Siba Inu dog race Wikipedia website. 

In [2]:
url = 'https://en.wikipedia.org/wiki/Shiba_Inu'

# Static scraping - Using BeautifulSoup

In [3]:
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text

# Parse the html content
soup = BeautifulSoup(html_content, "lxml")
print(soup.prettify()) # print the parsed data of html

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Shiba Inu - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enable

## Getting the title

In [4]:
title = soup.find("h1") #In this case we use the .find() command as there is only one element.
title.text

'Shiba Inu'

## Getting the subtitles

In [5]:
subtitles = soup.findAll("h2") #In this case we use the .findAll() command as there are many elements.
subtitles_list = [ii.text for ii in subtitles]
subtitles_list

['Contents',
 'Appearance[edit]',
 'Temperament[edit]',
 'History[edit]',
 'Health[edit]',
 'In popular culture[edit]',
 'See also[edit]',
 'References[edit]',
 'External links[edit]']

## Getting all paragraphs

In [6]:
paragraphs = soup.findAll("p")
paragraphs_list = [ii.text for ii in paragraphs]
paragraphs_list

['The Shiba Inu (柴犬, Japanese:\xa0[ɕiba inɯ]) is a breed of hunting dog from Japan. A small-to-medium breed, it is the smallest of the six original and distinct spitz breeds of dog native to Japan.[1] Its name literally translates to "brushwood dog", as it is used to flush game.\n',
 'A small, alert, and agile dog that copes very well with mountainous terrain and hiking trails, the Shiba Inu was originally bred for hunting.[1][2] It looks similar to and is often mistaken for other Japanese dog breeds such as the Akita Inu or Hokkaido, but the Shiba Inu is a different breed with a distinct blood line, temperament, and smaller size than other Japanese dog breeds.[3][4]\n',
 "The Shiba's frame is compact with well-developed muscles.[5]\n",
 'The Shiba Inu is double coated, with the outer coat being stiff and straight and the undercoat soft and thick. Fur is short and even on the foxlike face, ears, and legs. Guard hairs stand off the body and are about 4 to 5\xa0cm (1+1⁄2 to 2\xa0in) long

## Getting the table info

In [7]:
table_element = soup.find("table")
print(table_element.text)

Shiba InuA three year old Red Shiba InuOther names
Japanese Shiba Inu
Japanese Small Size Dog
Japanese Brushwood Dog
Shiba Ken
Shibe
OriginJapanTraitsHeight
Dogs
35 to 43 cm (14 to 17 in)
Bitches
33 to 41 cm (13 to 16 in)Weight
Dogs
10 kg (22 lb)
Bitches
8 kg (18 lb)Coat
doubleColor
Red, black and tan, cream, sesame, black sesame, red sesame.Litter size
3 puppies on averageLife span
13–16 yearsKennel club standardsJapan Kennel Club
standardDog (domestic dog)


# Dynamic scraping

## Defining an instance of the web driver. 

In [8]:
# __________________Defining the Chrome Driver Instance
# Creating a webdriver instance
options = Options()

# options.add_argument('--headless') -> You can activate this option if you want to watch the scraping process.
options.add_argument('--disable-gpu')  # Last I checked this was necessary.
driver = webdriver.Chrome("../ChromeDriver_Path/chromedriver", chrome_options=options)

# Opening the url we have just defined in our browser
driver.get(url)

  driver = webdriver.Chrome("../ChromeDriver_Path/chromedriver", chrome_options=options)
  driver = webdriver.Chrome("../ChromeDriver_Path/chromedriver", chrome_options=options)


## Finding our data of interest

In [9]:
page = driver.find_element(By.TAG_NAME,"body") #We use the find_element() as there is only one.
page

title = page.find_element(By.TAG_NAME,"h1")
title.text

subtitles = page.find_elements(By.TAG_NAME,"h2") #We use the find_elements() as there are many of them - a list will be returned. 
subtitles_list = [ii.text for ii in subtitles]

paragraphs = page.find_elements(By.TAG_NAME,"p")
paragraphs_list = [ii.text for ii in paragraphs]

table = page.find_elements(By.TAG_NAME,"table")
table_list = [ii.text for ii in table]

## Executing dynamic actions

### Scrolling down

In [10]:
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") #We scroll down the whole body height. 

### Clicking a button

In [11]:
mid_content = page.find_element(By.CLASS_NAME,"vector-toc-contents") #Find the element of the index with all subsections.
individual_content_list = mid_content.find_elements(By.TAG_NAME, "li") #Get a list of all subsections.
for ii in individual_content_list: #We iterate all of them to move down the webpage.
    print(ii.text)
    ii.click()
    time.sleep(3)

(Top)
Appearance
Temperament
History
Health
Toggle Health subsection
Life span
Grooming
Life span
Grooming
In popular culture
See also
References
External links


### Going to external links

In [12]:
references = page.find_element(By.CLASS_NAME,"mw-references-wrap") #We get the references elements.
individual_reference_list = references.find_elements(By.TAG_NAME, "li")
#We just check the first one
individual_reference_list[0].find_element(By.CLASS_NAME,"reference-text").click()