# Robotic Process Automation: Web Scraper

## Contacts:
cursifrancesco@gmail.com

## Description

This project describes how to build an automated web scraper that reads text from a web page and downloads data.
It is based on Selenium and ChromeDriver and scrapes web pages in Google Chrome on a Windows machine.

The scraper first reads text from a table in https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area
and then opens a new tab and downloads files from https://silhouettegarden.com/category/country/


### Set-up Instructions:

Set up your directory to contain:
- src/scraper.py
- chromedriver_win32/chromedriver.exe

You should first download chromedriver.exe from https://chromedriver.chromium.org/downloads 

### Code:

Import necessary libraries

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import ast
import time
from selenium.common.exceptions import NoSuchElementException

Initialize the scraper and navigate to the desired URL

In [None]:
# Open Chrome and navigate to the page that holds the country/area table.

# Path to the ChromeDriver executable, relative to the working directory.
# NOTE(review): the set-up instructions mention a "chromedriver_win32" folder;
# make sure this folder name matches your local copy.
chromedriver_folder = "../chromedriver_99_win32/chromedriver"
URL = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_area"
op = webdriver.ChromeOptions()
# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention; confirm it is still accepted by the installed version.
driver = webdriver.Chrome(chromedriver_folder, options=op)
driver.get(URL)

# Absolute XPath of the table we want to read.
Table_xpath = '/html/body/div[3]/div[3]/div[5]/div[1]/table[2]'
try:
    # Wait (up to 5 seconds) until the table is present in the DOM,
    # then pause briefly before reading it.
    table_ok = EC.presence_of_element_located((By.XPATH, Table_xpath))
    WebDriverWait(driver, 5).until(table_ok)
    time.sleep(2)
except TimeoutException:
    print("Timed out waiting for page to load")


# find_element(By.…) is used instead of the find_element_by_* helpers, which
# are no longer available in recent Selenium releases.
table = driver.find_element(By.XPATH, Table_xpath)
rows = table.find_elements(By.TAG_NAME, "tr")  # all the rows in the table

#Initialize data storage
Areas = []
Countries = []

#Loop through the rows to get data (the first row is skipped)
for r in rows[1:]:
    values = r.find_elements(By.TAG_NAME, "td")
    # Rows with 7 cells hold the country in cell 1 and the area in cell 2;
    # all other rows hold them in cells 0 and 1.
    first_col = 1 if len(values) == 7 else 0

    try:
        # The cell lookup is inside the try so that rows without enough <td>
        # cells (e.g. header-only rows) are reported instead of crashing.
        country = values[first_col].text
        # Keep only the first whitespace-separated token of the area cell
        # and drop the thousands separators before converting.
        area = float(values[first_col + 1].text.split()[0].replace(',', ''))
    except (IndexError, ValueError):
        # IndexError: missing/empty cell; ValueError: token is not a number.
        print("no data")
        continue

    Areas.append(area)
    Countries.append(country)
    print("Country ",country," Area ",area)

#Create Pandas dataframe and save it
df = pd.DataFrame({'Countries': Countries,'Areas (Km2)': Areas})
df.to_csv("WorldData.csv", index=True, encoding='utf-8')

Open a new tab to run the scraper to download files

In [None]:
URL = "https://silhouettegarden.com/category/country/"

# Remember which tabs exist so the newly opened one can be identified.
handles_before = set(driver.window_handles)

# Pass the address as a script argument: inside the browser the bare name
# URL refers to JavaScript's own URL global, not to the Python variable.
driver.execute_script('window.open(arguments[0], "_blank");', URL)

#get the opened tabs
handles = driver.window_handles

#go to new tab (fall back to the last handle if no new one can be identified)
new_handles = [h for h in handles if h not in handles_before]
driver.switch_to.window(new_handles[0] if new_handles else handles[-1])

# Wait for the listing only after switching, so that the check runs against
# the new tab rather than the tab that was focused before.
Data_element = '/html/body/div[1]/div[3]/div/div[2]'
try:
    ok = EC.presence_of_element_located((By.XPATH, Data_element))
    WebDriverWait(driver, 5).until(ok)
    time.sleep(2)
except TimeoutException:
    print("Timed out waiting for page to load")

When downloading files from https://silhouettegarden.com/category/country/ the scraper needs to click on a country icon. This redirects to a new page, where it then presses the download button.
To download additional files it needs to go back to the previous page, but this causes a loss of the data structures (the located page elements) held by Selenium.

In order to overcome this, a recursive approach needs to be used.

In [None]:
#Define the recursive download function
# It keeps downloading and re-locating the page elements until all of them have been visited
def DownloadFiles(driver,idx_row,idx_country):
    """Download the file behind every entry of the current listing page.

    Starting at position (idx_row, idx_country), the function clicks the
    entry, clicks the download button on the page that opens, navigates back
    and calls itself for the next position. The page elements are looked up
    again on every call instead of being reused across navigations.

    Args:
        driver: Selenium WebDriver focused on the listing page.
        idx_row: index of the row to process.
        idx_country: index of the entry to process inside that row.

    Returns:
        True once idx_row has moved past the last row of the page.

    Raises:
        TimeoutException: if the listing or the download button does not
            appear within 1 second.
    """
    listing_xpath = '/html/body/div[1]/div[3]/div/div[2]'
    download_xpath = '/html/body/div[1]/div[3]/div/div[2]/div[2]/div[2]/div[1]/div/div/div/div[1]'

    # Wait for the listing and keep the element returned by the wait.
    Full_images = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.XPATH, listing_xpath)))

    # Re-locate the rows of the listing.
    # NOTE(review): an XPath starting with "//" is evaluated against the whole
    # document, not only inside Full_images; use ".//" if only rows inside the
    # listing are intended — confirm against the page before changing it.
    rows = Full_images.find_elements(By.XPATH, "//div[contains(@class, 'margin-bottom-30')]")

    #if all rows in web page have been visited, stop recursion
    if idx_row >= len(rows):
        return True

    #each row has a certain amount of countries
    countries = rows[idx_row].find_elements(By.TAG_NAME, "div")
    N_countries = len(countries)

    # A row with no entry at this position (e.g. an empty row) is skipped
    # instead of raising IndexError.
    if idx_country >= N_countries:
        return DownloadFiles(driver, idx_row + 1, 0)

    #click on the country to be redirected to downloading page
    countries[idx_country].click()
    time.sleep(0.1)

    #Wait for the download button and click it
    download_button = WebDriverWait(driver, 1).until(
        EC.presence_of_element_located((By.XPATH, download_xpath)))
    download_button.click()

    # go back previous page
    driver.back()
    idx_country = idx_country + 1
    time.sleep(0.1)

    #if all countries in the row have been visited, move to next row
    if idx_country == N_countries:
        idx_row = idx_row + 1
        idx_country = 0

    return DownloadFiles(driver, idx_row, idx_country)

The web page has an element to switch from one page to another. We need to go through each page to download all the files.

In [None]:
#Keep looping through all the webpages up to the final one
NextPage_Available = True
while NextPage_Available:
    #download images
    Downloaded = False
    while not Downloaded:
        # Always start from the first entry of the first row of the page.
        idx_row = 0
        idx_country = 0

        #recursively download files and re-locate the page elements
        Downloaded = DownloadFiles(driver,idx_row, idx_country)
        print("")

    # If the files in the current page have been downloaded, go to next page
    try:
        #element to switch to new page
        Next_page = driver.find_element(By.XPATH, "/html/body/div[1]/div[3]/div/div[2]/div[1]/div/div")
    except NoSuchElementException:
        # No page-switching element at all: treat it as the last page.
        NextPage_Available = False
        break

    s = Next_page.find_elements(By.TAG_NAME, "li") #this is a list of pages
    # The last entry is the "Next Page" button when a further page exists.
    if s and s[-1].text == "Next Page":
        button = s[-1].find_element(By.TAG_NAME, "i")
        button.click()
        time.sleep(0.1)
    else:
        #if the "Next Page" button does not exist, stop the search. We reached the end
        NextPage_Available = False