## Install and Import Libraries

In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [3]:
import csv
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Set the URL and Send a GET Request

In [17]:
# Site root; the page-1 extraction cell reads its book links from this page.
url = "https://books.toscrape.com/"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=10)

In [18]:
# Report whether the landing page was fetched: only an exact HTTP 200 counts as success.
print("Request Successful" if response.status_code == 200 else "Request Failed")

Request Successful


## Parse HTML Content

In [9]:
# Parse the raw bytes (response.content) rather than response.text so that
# BeautifulSoup works out the document encoding itself, matching how the
# book detail pages are parsed in the extraction cells.
# NOTE(review): response.text relies on the encoding requests infers from the
# HTTP headers, which can garble non-ASCII characters such as "£" — confirm
# against the server's Content-Type header.
soup = BeautifulSoup(response.content, "html.parser")

## Extract Book Details from Page 1

In [11]:
# Each <h3> on the listing page wraps the <a> element that links to a book's
# detail page. find_all returns every matching element as a list.
books = soup.find_all('h3')

start_time = time.time()
books_extracted = 0

# Visit each book's detail page and pull out the fields of interest.
for book in books:
    # The href is relative to the listing page; urljoin resolves it against
    # `url` correctly whether or not `url` ends with a slash.
    book_url = urljoin(url, book.find('a')['href'])
    book_response = requests.get(book_url, timeout=10)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    book_response.raise_for_status()
    # A separate soup object holds the parsed detail page for this book.
    book_soup = BeautifulSoup(book_response.content, "html.parser")

    title = book_soup.find('h1').text
    # `class_` has a trailing underscore because `class` is a reserved word in Python.
    # The third breadcrumb link is read as the category.
    category = book_soup.find('ul', class_="breadcrumb").find_all('a')[2].text.strip()
    # The second entry of the element's class list is the rating word (e.g. "Three").
    rating = book_soup.find('p', class_='star-rating')['class'][1]
    price = book_soup.find('p', class_='price_color').text.strip()
    availability = book_soup.find('p', class_='availability').text.strip()

    books_extracted += 1

    print(f'Title: {title}')
    print(f'Category: {category}')
    print(f'Rating: {rating}')
    print(f'Price: {price}')
    print(f'Availability: {availability}')
    print('***********')

# Measure once, after the loop, and report it so the timing is actually visible.
end_time = time.time()
total_time = (end_time - start_time) / 60.0
print(f'Extracted {books_extracted} books in {total_time:.2f} minutes')

Title: A Light in the Attic
Category: Poetry
Rating: Three
Price: £51.77
Availability: In stock (22 available)
***********
Title: Tipping the Velvet
Category: Historical Fiction
Rating: One
Price: £53.74
Availability: In stock (20 available)
***********
Title: Soumission
Category: Fiction
Rating: One
Price: £50.10
Availability: In stock (20 available)
***********
Title: Sharp Objects
Category: Mystery
Rating: Four
Price: £47.82
Availability: In stock (20 available)
***********
Title: Sapiens: A Brief History of Humankind
Category: History
Rating: Five
Price: £54.23
Availability: In stock (20 available)
***********
Title: The Requiem Red
Category: Young Adult
Rating: One
Price: £22.65
Availability: In stock (19 available)
***********
Title: The Dirty Little Secrets of Getting Your Dream Job
Category: Business
Rating: Four
Price: £33.34
Availability: In stock (19 available)
***********
Title: The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull
Category

## Extract Data for All 50 Pages


In [21]:
import requests
from bs4 import BeautifulSoup
import time

def scrape_book_details(session, book_url, timeout=10):
    """Fetch one book detail page and return its fields as a list.

    Parameters
    ----------
    session : requests.Session
        Session used to issue the GET request.
    book_url : str
        Absolute URL of the book's detail page.
    timeout : int or float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of str
        ``[title, category, rating, price, availability]``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    book_response = session.get(book_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    book_response.raise_for_status()
    book_soup = BeautifulSoup(book_response.content, "html.parser")

    title = book_soup.find('h1').text
    # The third breadcrumb link is read as the category.
    category = book_soup.find('ul', class_="breadcrumb").find_all('a')[2].text.strip()
    # The second entry of the element's class list is the rating word (e.g. "Three").
    rating = book_soup.find('p', class_='star-rating')['class'][1]
    price = book_soup.find('p', class_='price_color').text.strip()
    availability = book_soup.find('p', class_='availability').text.strip()

    return [title, category, rating, price, availability]


# Create a list to hold all the book information
books_data = []

start_time = time.time()

# A single Session is shared by every page and book request so the underlying
# connection can be reused instead of being reopened each time.
with requests.Session() as session:
    # Loop through all 50 pages
    for page_num in range(1, 51):
        # Page-specific names are used so the `url`, `response` and `soup`
        # defined by earlier cells are not overwritten by this cell.
        page_url = f'https://books.toscrape.com/catalogue/page-{page_num}.html'
        page_response = session.get(page_url, timeout=10)
        page_response.raise_for_status()
        page_soup = BeautifulSoup(page_response.content, "html.parser")

        # Each <h3> wraps the link to one book's detail page.
        for heading in page_soup.find_all('h3'):
            # The href is relative to the listing page it came from.
            book_url = urljoin(page_url, heading.find('a')['href'])

            books_data.append(scrape_book_details(session, book_url))
            print(books_data[-1])  # Print the latest book data
            print('*******')

end_time = time.time()
total_time = (end_time - start_time) / 60.0

print(f"Total books extracted: {len(books_data)}")
print(f"Elapsed time: {total_time:.2f} minutes")

['A Light in the Attic', 'Poetry', 'Three', '£51.77', 'In stock (22 available)']
*******
['Tipping the Velvet', 'Historical Fiction', 'One', '£53.74', 'In stock (20 available)']
*******
['Soumission', 'Fiction', 'One', '£50.10', 'In stock (20 available)']
*******
['Sharp Objects', 'Mystery', 'Four', '£47.82', 'In stock (20 available)']
*******
['Sapiens: A Brief History of Humankind', 'History', 'Five', '£54.23', 'In stock (20 available)']
*******
['The Requiem Red', 'Young Adult', 'One', '£22.65', 'In stock (19 available)']
*******
['The Dirty Little Secrets of Getting Your Dream Job', 'Business', 'Four', '£33.34', 'In stock (19 available)']
*******
['The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull', 'Default', 'Three', '£17.93', 'In stock (19 available)']
*******
['The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics', 'Default', 'Four', '£22.60', 'In stock (19 available)']
*******
['The Black Maria', '

["The Activist's Tao Te Ching: Ancient Advice for a Modern Revolution", 'Spirituality', 'Five', '£32.24', 'In stock (16 available)']
*******
['Spark Joy: An Illustrated Master Class on the Art of Organizing and Tidying Up', 'Nonfiction', 'Four', '£41.83', 'In stock (16 available)']
*******
['Soul Reader', 'Default', 'Two', '£39.58', 'In stock (16 available)']
*******
['Security', 'Horror', 'Two', '£39.25', 'In stock (16 available)']
*******
['Saga, Volume 6 (Saga (Collected Editions) #6)', 'Fantasy', 'Three', '£25.02', 'In stock (16 available)']
*******
['Saga, Volume 5 (Saga (Collected Editions) #5)', 'Sequential Art', 'Two', '£51.04', 'In stock (16 available)']
*******
['Reskilling America: Learning to Labor in the Twenty-First Century', 'Nonfiction', 'Two', '£19.83', 'In stock (16 available)']
*******
['Rat Queens, Vol. 3: Demons (Rat Queens (Collected Editions) #11-15)', 'Sequential Art', 'Three', '£50.40', 'In stock (16 available)']
*******
['Princess Jellyfish 2-in-1 Omnibus, Vol

['The Secret (The Secret #1)', 'Default', 'Four', '£27.37', 'In stock (15 available)']
*******
['The Regional Office Is Under Attack!', 'Fiction', 'Five', '£51.36', 'In stock (15 available)']
*******
['The Psychopath Test: A Journey Through the Madness Industry', 'Default', 'Two', '£36.00', 'In stock (15 available)']
*******
['The Project', 'Science Fiction', 'One', '£10.65', 'In stock (15 available)']
*******
['The Power of Now: A Guide to Spiritual Enlightenment', 'Nonfiction', 'Two', '£43.54', 'In stock (15 available)']
*******
["The Omnivore's Dilemma: A Natural History of Four Meals", 'Nonfiction', 'Two', '£38.21', 'In stock (15 available)']
*******
['The Nerdy Nummies Cookbook: Sweet Treats for the Geek in All of Us', 'Food and Drink', 'Five', '£37.34', 'In stock (15 available)']
*******
['The Murder of Roger Ackroyd (Hercule Poirot #4)', 'Mystery', 'Four', '£44.10', 'In stock (15 available)']
*******
['The Mistake (Off-Campus #2)', 'New Adult', 'Three', '£43.29', 'In stock (15 a

['You (You #1)', 'Thriller', 'Five', '£43.61', 'In stock (14 available)']
*******
['Wonder Woman: Earth One, Volume One (Wonder Woman: Earth One #1)', 'Sequential Art', 'Four', '£37.34', 'In stock (14 available)']
*******
['Wild Swans', 'Young Adult', 'Two', '£14.36', 'In stock (14 available)']
*******
['Why the Right Went Wrong: Conservatism--From Goldwater to the Tea Party and Beyond', 'Politics', 'Four', '£52.65', 'In stock (14 available)']
*******


KeyboardInterrupt: 

## Export the Data

In [23]:
# Turn the scraped records into a table; the column names follow the order
# in which the fields were appended to each record.
df = pd.DataFrame(
    data=books_data,
    columns=["Title", "Category", "Rating", "Price", "Availability"],
)


In [24]:
# Write the table to a CSV file without the index column, then confirm on stdout
df.to_csv(path_or_buf="books_scraped.csv", index=False)
print("Data Saved to books_scraped.csv")

Data Saved to books_scraped.csv
