In [3]:
# pip install beautifulsoup4
# pip install selenium

This project is one of the final projects of the Miuul course on web scraping with Beautiful Soup and Selenium (https://learning.miuul.com/courses/take/web-scraping/texts/51630603-kurs-hakkinda). The goal of the project is to help an imaginary bookshop (A) by analyzing a rival bookshop (B) that allows scraping on its website. Bookshop A is having a difficult time selling non-fiction and travel books. To help them, we need to check bookshop B's pricing for the corresponding genres. The details can be found, in Turkish, in the Instructions file.

In [4]:
from bs4 import BeautifulSoup
from selenium import webdriver

In [5]:
# Chrome launch options: ask the browser to start with a maximized window.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")

In [7]:
# Activating the driver using options. The options object is passed by keyword
# because the first positional parameter of webdriver.Chrome is not `options`
# in every Selenium release (older ones expect the driver executable path there).
driver = webdriver.Chrome(options=options)

TASK 2: Open, Analyze and Scrape the main page using the driver

In [8]:
import time
# Pause, in seconds, applied after a navigation before the page is queried.
SLEEP_TIME = 2
# Open the site's landing page in the browser controlled by the driver.
driver.get("https://books.toscrape.com/")
time.sleep(SLEEP_TIME)

In [9]:
# XPath expression (a string, not elements) selecting every <a> whose text
# contains 'Travel' or 'Nonfiction'.
cat_elements = "//a[contains(text(), 'Travel') or contains(text(), 'Nonfiction')]"

In [10]:
from selenium.webdriver.common.by import By
# Collect all anchors on the current page that match the category XPath.
category_elements = driver.find_elements(By.XPATH, cat_elements)

In [11]:
# Read the link target of every matched category anchor and show the result.
cat_urls = [anchor.get_attribute("href") for anchor in category_elements]
print(cat_urls)

['https://books.toscrape.com/catalogue/category/books/travel_2/index.html', 'https://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html']


TASK 3: Scraping the Genre Pages

In [12]:
# Open the first collected category URL (the Travel link in the list printed above).
driver.get(cat_urls[0])
time.sleep(SLEEP_TIME)

In [13]:
# XPath for every <a> nested inside a div whose class is 'image_container'.
book_elements_XPATH = "//div[@class= 'image_container']//a"
book_elements = driver.find_elements(By.XPATH, book_elements_XPATH)
# One detail-page URL per matched link.
book_urls = [link.get_attribute("href") for link in book_elements]
# Show the collected URLs followed by how many there are, one per line.
print(book_urls, len(book_urls), sep="\n")

['https://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html', 'https://books.toscrape.com/catalogue/full-moon-over-noahs-ark-an-odyssey-to-mount-ararat-and-beyond_811/index.html', 'https://books.toscrape.com/catalogue/see-america-a-celebration-of-our-national-parks-treasured-sites_732/index.html', 'https://books.toscrape.com/catalogue/vagabonding-an-uncommon-guide-to-the-art-of-long-term-world-travel_552/index.html', 'https://books.toscrape.com/catalogue/under-the-tuscan-sun_504/index.html', 'https://books.toscrape.com/catalogue/a-summer-in-europe_458/index.html', 'https://books.toscrape.com/catalogue/the-great-railway-bazaar_446/index.html', 'https://books.toscrape.com/catalogue/a-year-in-provence-provence-1_421/index.html', 'https://books.toscrape.com/catalogue/the-road-to-little-dribbling-adventures-of-an-american-in-britain-notes-from-a-small-island-2_277/index.html', 'https://books.toscrape.com/catalogue/neither-here-nor-there-travels-in-europe_198/index.html', 'h

As explained in the instructions PDF, and as confirmed by our own check, the website does not use the same URL format for single-page categories (index.html) and multi-page categories (page-x.html).
Because of that, we need to handle pagination explicitly.

In [16]:
# Upper bound (inclusive) on how many listing pages of the category are visited.
MAX_PAGINATION = 3
url = cat_urls[1]
book_urls = []
# Pages are numbered from 1, so the range has to end at MAX_PAGINATION + 1;
# otherwise the last allowed page would never be requested.
for i in range(1, MAX_PAGINATION + 1):
    # Page 1 is served from index.html, later pages from page-<i>.html.
    # Only the trailing file name is swapped, so an "index" substring
    # elsewhere in the URL is never rewritten.
    update_url = url if i == 1 else url.rsplit("/", 1)[0] + f"/page-{i}.html"
    driver.get(update_url)
    # Same pause after navigation as the other cells of this notebook.
    time.sleep(SLEEP_TIME)
    book_elements = driver.find_elements(By.XPATH, book_elements_XPATH)

    # No book links on the page: stop requesting further pages.
    if not book_elements:
        break

    temp_urls = [element.get_attribute("href") for element in book_elements]
    book_urls.extend(temp_urls)

print(book_urls)
print(len(book_urls))

['https://books.toscrape.com/catalogue/worlds-elsewhere-journeys-around-shakespeares-globe_972/index.html', 'https://books.toscrape.com/catalogue/the-five-love-languages-how-to-express-heartfelt-commitment-to-your-mate_969/index.html', 'https://books.toscrape.com/catalogue/reasons-to-stay-alive_959/index.html', 'https://books.toscrape.com/catalogue/higherselfie-wake-up-your-life-free-your-soul-find-your-tribe_957/index.html', 'https://books.toscrape.com/catalogue/unseen-city-the-majesty-of-pigeons-the-discreet-charm-of-snails-other-wonders-of-the-urban-wilderness_952/index.html', 'https://books.toscrape.com/catalogue/throwing-rocks-at-the-google-bus-how-growth-became-the-enemy-of-prosperity_948/index.html', 'https://books.toscrape.com/catalogue/the-life-changing-magic-of-tidying-up-the-japanese-art-of-decluttering-and-organizing_936/index.html', 'https://books.toscrape.com/catalogue/the-gutsy-girl-escapades-for-your-life-of-epic-adventure_934/index.html', 'https://books.toscrape.com/ca

TASK 4: Book Detail Page Scraping

In [17]:
# Load the first collected book URL and pause before reading the page.
driver.get(book_urls[0])
time.sleep(SLEEP_TIME)
# find_elements returns a list; it is indexed below, so an empty match raises IndexError.
content_div = driver.find_elements(By.XPATH, "//div[@class = 'content']")

# Parse the inner HTML of the first matched div with BeautifulSoup.
inner_html = content_div[0].get_attribute("innerHTML")
soup = BeautifulSoup(inner_html, "html.parser")


In [19]:
import re

# Book name: text of the first <h1> in the parsed content.
name_elem = soup.find("h1")
book_name = name_elem.text

# Book price: text of the first <p class="price_color">.
price_elem = soup.find("p", attrs={"class": "price_color"})
book_price = price_elem.text

# Book star rating: the rating word is the last CSS class of the
# <p class="star-rating ..."> element.
regex = re.compile('^star-rating ')
star_elem = soup.find("p", attrs={"class": regex})
book_star_count = star_elem["class"][-1]

# Book description: the element right after the #product_description header.
# find() returns None when the header is absent; guard against that so a page
# without a description yields an empty string instead of an AttributeError.
desc_header = soup.find("div", attrs={"id": "product_description"})
desc_elem = desc_header.find_next_sibling() if desc_header is not None else None
book_desc = desc_elem.text if desc_elem is not None else ""

# Table information under product information: one entry per row,
# keyed by the <th> text with the <td> text as value.
table_rows = soup.find("table").find_all("tr")
product_info = {row.find("th").text: row.find("td").text for row in table_rows}





In [20]:
# Display the extracted description as the cell output.
book_desc

'Anti-apartheid activist, Bollywood screenwriter, Nazi pin-up, hero of the Wild West: this is Shakespeare as you have never seen him before.From the sixteenth-century Baltic to the American Revolution, from colonial India to the skyscrapers of modern-day Shanghai, Shakespeare’s plays appear at the most fascinating of times and in the most unexpected of places. No other writ Anti-apartheid activist, Bollywood screenwriter, Nazi pin-up, hero of the Wild West: this is Shakespeare as you have never seen him before.From the sixteenth-century Baltic to the American Revolution, from colonial India to the skyscrapers of modern-day Shanghai, Shakespeare’s plays appear at the most fascinating of times and in the most unexpected of places. No other writer’s work has been performed, translated, adapted and altered in such a remarkable variety of cultures and languages. But what is it about William Shakespeare – a man from Warwickshire who never once set foot outside England – that has made him at 