In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import time 
import random
from selenium.webdriver.chrome.options import Options


In [37]:
# Chrome options for Selenium: run without a visible window and with GPU
# acceleration disabled.
# NOTE(review): none of the cells shown here reference `options`; confirm it is
# still needed before keeping the Selenium dependency.
options = Options()
for chrome_flag in ("--headless", "--disable-gpu"):
    options.add_argument(chrome_flag)

# Browser-like User-Agent that the scraping functions hand to `requests.get`.
headers = dict([("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")])

In [38]:
# Maps the capitalised rating word (looked up from a book's CSS classes) to
# its numeric value, 1 through 5.
word_to_numbers = {
    rating_word: stars
    for stars, rating_word in enumerate(
        ("One", "Two", "Three", "Four", "Five"), start=1
    )
}

In [None]:
def extract_book_data(book,page_url):
    """Collect listing-page and detail-page fields for a single book.

    Args:
        book: BeautifulSoup tag for one ``<article class="product_pod">``
            element taken from a listing page.
        page_url: URL of the listing page that ``book`` came from; the book's
            relative link is resolved against it.

    Returns:
        dict with the keys ``Title``, ``Price`` (float), ``Rating`` (int from
        ``word_to_numbers``, or 0 when no known rating word is present),
        ``upc``, ``stock`` (digits of the availability text as a string, or the
        raw availability text when it contains no digits), ``Link`` (the href
        exactly as it appears in the listing) and ``description``.

    Raises:
        requests.RequestException: if the detail page cannot be fetched, times
            out, or answers with an HTTP error status.
    """
    # Random 2-3 second pause before every detail-page request.
    time.sleep(random.uniform(2, 3))

    title = book.h3.a['title']
    link = book.h3.a['href']
    price = float(book.find('p', class_='price_color').text.strip('£'))

    # The first <p> is expected to carry the rating as a CSS class, e.g.
    # ['star-rating', 'Three'] (assumption about the page markup). Searching
    # all classes avoids an IndexError when the class list is shorter.
    rating_classes = book.p.get('class', [])
    rating_number = next(
        (word_to_numbers[name] for name in rating_classes if name in word_to_numbers),
        0,
    )

    nested_page = urljoin(page_url, link)

    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the User-Agent in the query
    # string instead of sending it as an HTTP header.
    res = requests.get(nested_page, headers=headers, timeout=30)
    res.raise_for_status()
    nested_soup = BeautifulSoup(res.content, 'html.parser')

    # Cell 0 is read as the UPC and cell 5 as the availability text
    # (assumes the product table keeps that order -- verify if the site changes).
    table = nested_soup.find_all('td')
    upc = table[0].text

    # Keep only the digits of e.g. "In stock (22 available)". str.strip() with
    # a phrase argument removes a *set of characters*, which only worked by
    # coincidence for that exact wording.
    availability = table[5].text.strip()
    stock_count = ''.join(ch for ch in availability if ch.isdigit()) or availability

    # The description text lives in the <p> following the
    # #product_description heading; either element may be missing.
    description = "No description"
    description_tag = nested_soup.find('div', id='product_description')
    if description_tag is not None:
        description_p = description_tag.find_next_sibling('p')
        if description_p is not None:
            description = description_p.text.strip()

    return {
        'Title': title,
        'Price': price,
        'Rating': rating_number,
        'upc' : upc,
        'stock' : stock_count,
        'Link': link,
        'description': description
    }

In [40]:
def scrape_books(base_url):
    """Scrape every listing page reachable from ``base_url`` via "next" links.

    Args:
        base_url: URL of the first listing page.

    Returns:
        list of dicts, one per book, as produced by ``extract_book_data``,
        in the order the books appear across the pages.

    Raises:
        requests.RequestException: if a listing page cannot be fetched, times
            out, or answers with an HTTP error status.
    """
    dataset = []
    current = base_url
    page_num = 0

    while current:
        page_num += 1
        print(f"Scraping Page: {page_num}")

        # Pass `headers` by keyword; positionally it would be treated as
        # query-string `params` and the User-Agent would never be sent.
        response = requests.get(current, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for book in soup.find_all('article', class_='product_pod'):
            dataset.append(extract_book_data(book, current))

        # Follow the pagination link relative to the current page; stop when
        # the page has no <li class="next"> element.
        next_btn = soup.find('li', class_='next')
        current = urljoin(current, next_btn.a['href']) if next_btn else None

    return dataset

In [41]:
if __name__ == "__main__":
    from pathlib import Path

    BASE_URL = "https://books.toscrape.com/"
    # NOTE(review): machine-specific absolute path kept as-is so the output
    # location does not change; consider a path relative to the project.
    OUTPUT_CSV = Path("D:/Projects/web_scrapping/project/output/books_data.csv")

    # Create the output folder *before* the long-running scrape so a missing
    # directory cannot discard the collected data at the very end.
    OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)

    books_data = scrape_books(BASE_URL)

    # Save to CSV (one row per book, no index column)
    df = pd.DataFrame(books_data)
    df.to_csv(OUTPUT_CSV, index=False)
    print("Scraping completed and data saved to 'output/books_data.csv'")


Scraping Page: 1
Scraping Page: 2
Scraping Page: 3
Scraping Page: 4
Scraping Page: 5
Scraping Page: 6
Scraping Page: 7
Scraping Page: 8
Scraping Page: 9
Scraping Page: 10
Scraping Page: 11
Scraping Page: 12
Scraping Page: 13
Scraping Page: 14
Scraping Page: 15
Scraping Page: 16
Scraping Page: 17
Scraping Page: 18
Scraping Page: 19
Scraping Page: 20
Scraping Page: 21
Scraping Page: 22
Scraping Page: 23
Scraping Page: 24
Scraping Page: 25
Scraping Page: 26
Scraping Page: 27
Scraping Page: 28
Scraping Page: 29
Scraping Page: 30
Scraping Page: 31
Scraping Page: 32
Scraping Page: 33
Scraping Page: 34
Scraping Page: 35
Scraping Page: 36
Scraping Page: 37
Scraping Page: 38
Scraping Page: 39
Scraping Page: 40
Scraping Page: 41
Scraping Page: 42
Scraping Page: 43
Scraping Page: 44
Scraping Page: 45
Scraping Page: 46
Scraping Page: 47
Scraping Page: 48
Scraping Page: 49
Scraping Page: 50
Scraping completed and data saved to 'output/books_data.csv'
