## DEPENDENCIES FOR THE WHOLE PROJECT

In [1]:
# General purpose
import os
from dotenv import load_dotenv

# Extract (Web Scraping)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Transform & Load
import pandas as pd
import numpy as np
import sqlite3

# API
from flask import Flask, jsonify

# Visualize
import dash
from dash import dcc, html
import plotly.express as px


## SETTING UP CHROME 

In [2]:
# Configure the Chrome session used for scraping
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # run Chrome in headless mode

# Build the driver service first, then launch Chrome with the options above
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)



In [3]:
# Point the browser at the book-scraping sandbox site
driver.get(url="https://books.toscrape.com/")


In [4]:
# Print the title of the current page to double-check that it loaded correctly
page_title = driver.title
print(page_title)


All products | Books to Scrape - Sandbox


In [5]:
# Collect every book on the page (each book is inside an <article> tag with class 'product_pod')
books = driver.find_elements(by=By.CLASS_NAME, value="product_pod")


In [6]:
# Report how many book elements were collected from the page
book_count = len(books)
print("Books found:", book_count)


Books found: 20


## SCRAPING PROCEDURE

In [7]:
# The first book's title is read from the `title` attribute of the link inside its <h3>
first_heading = books[0].find_element(by=By.TAG_NAME, value="h3")
first_link = first_heading.find_element(by=By.TAG_NAME, value="a")
first_title = first_link.get_attribute("title")

# Print the title to confirm the lookup works
print("First book title:", first_title)

First book title: A Light in the Attic


In [None]:
# Read the price text from the first book's 'price_color' element
first_price_element = books[0].find_element(by=By.CLASS_NAME, value="price_color")
first_price = first_price_element.text

# Print the price to confirm the lookup works
print("First book price:", first_price)


First book price: £51.77


In [None]:
# Read the availability text of the first book, with surrounding whitespace stripped
first_availability_element = books[0].find_element(by=By.CLASS_NAME, value="availability")
first_availability = first_availability_element.text.strip()

# Print the availability to confirm the lookup works
print("First book availability:", first_availability)


First book availability: In stock


In [11]:
# The rating (like 'Three', 'Four', etc.) is the last class name on the 'star-rating' element
first_rating_classes = books[0].find_element(by=By.CLASS_NAME, value="star-rating").get_attribute("class")
first_rating = first_rating_classes.split()[-1]

# Print the rating to confirm the lookup works
print("First book rating:", first_rating)

First book rating: Three


In [13]:
# Placeholder sketch of the target structure: a list holding one dict per book,
# each with 'title', 'price', 'availability' and 'rating' keys.
# Note: the bare `...` is Python's Ellipsis object, so this cell evaluates to a real list.
[
  {'title': '...', 'price': '...', 'availability': '...', 'rating': '...'},
  ...
]


[{'title': '...', 'price': '...', 'availability': '...', 'rating': '...'},
 Ellipsis]

In [16]:
# Accumulator for the scraped book records; starts out empty
book_data = list()


In [18]:
# Collect one record per book element found on the page.
# The list is rebuilt from scratch in this cell so that re-running it replaces the
# previous results instead of appending a second copy of every book.
book_data = []
for book in books:
    # The title is read from the `title` attribute of the link inside the <h3>
    title = book.find_element(By.TAG_NAME, "h3").find_element(By.TAG_NAME, "a").get_attribute("title")
    price = book.find_element(By.CLASS_NAME, "price_color").text
    availability = book.find_element(By.CLASS_NAME, "availability").text.strip()
    # The rating is the last class name on the 'star-rating' element
    rating = book.find_element(By.CLASS_NAME, "star-rating").get_attribute("class").split()[-1]
    book_data.append({
        'title': title,
        'price': price,
        'availability': availability,
        'rating': rating
    })


In [19]:
# Show the first 3 books collected to preview the results
print(book_data[:3])


[{'title': 'A Light in the Attic', 'price': '£51.77', 'availability': 'In stock', 'rating': 'Three'}, {'title': 'Tipping the Velvet', 'price': '£53.74', 'availability': 'In stock', 'rating': 'One'}, {'title': 'Soumission', 'price': '£50.10', 'availability': 'In stock', 'rating': 'One'}]
