<a href="https://colab.research.google.com/github/darvesh-sd/Copy-of-TPSessions.ipynb/blob/main/TP2_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Book Scraper
----------------------------------

This script gathers book titles and cover images from "Books to Scrape".
It is designed to run on Google Colab (the download step uses `google.colab.files`) and includes explanatory comments plus a bar chart of the image file extensions found.

Author: SEWOOGOLUM DARVESH SHARMA
Date: May 14, 2025
"""

# Step 1: Installing necessary libraries
!pip install requests beautifulsoup4 pandas matplotlib

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import random
import time
from urllib.parse import urljoin
from google.colab import files

# Base URL and setup variables
# Root of the paginated catalogue; page file names are appended to it.
BASE_URL = "https://books.toscrape.com/catalogue/"
# Number of catalogue pages fetched by the main loop (pages 1..N).
PAGES_TO_SCRAPE = 5

# Browser-like User-Agent sent with every request (intended to make the
# scraper less likely to be blocked).
HEADERS = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/91.0.4472.124 Safari/537.36",
    )),
}

# Function to build URLs for each page
def build_page_url(page_number):
    """Return the catalogue URL for the given page number.

    Args:
        page_number: Page index to embed in the ``page-<n>.html`` file name.

    Returns:
        ``BASE_URL`` followed by ``page-<page_number>.html``.
    """
    page_file = "page-{}.html".format(page_number)
    return "{}{}".format(BASE_URL, page_file)

# Extracting file extension from URLs
def get_file_extension(url):
    """Extract the image file extension from URL.

    Only the last segment of the URL's path is examined, so dots in the host
    name, dots in directory names, and any query string or fragment do not
    leak into the result.

    Args:
        url: Absolute or relative URL (or a bare file name) as a string.

    Returns:
        The text after the last '.' of the final path segment (without the
        dot, case preserved), or 'unknown' when that segment has no
        extension or the URL cannot be parsed.
    """
    # Local import: the file only imports ``urljoin`` at module level.
    from urllib.parse import urlsplit

    try:
        path = urlsplit(url).path
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. broken IPv6 literals).
        return 'unknown'

    # Look at the file name only, never at directories or the host.
    filename = path.rsplit('/', 1)[-1]
    _, dot, extension = filename.rpartition('.')

    # No dot at all, or a trailing dot with nothing after it.
    return extension if dot and extension else 'unknown'

# Function to scrape a single page
def scrape_books_from_page(page_url):
    """Scrape book titles and cover images from a given URL.

    Sleeps for a random 1-3 seconds, fetches the page, and collects one
    record per ``article.product_pod`` element.

    Args:
        page_url: URL of the listing page to fetch.

    Returns:
        A list of dicts with the keys 'Title', 'Image URL' and 'Extension'.
        The list is empty when the request fails; articles lacking a title
        link or an image source are skipped with a message.
    """
    print(f"Scraping data from: {page_url}")
    books_list = []

    # Adding random delay to respect site's server
    time.sleep(random.uniform(1, 3))

    try:
        # Without a timeout a stalled connection would block forever;
        # timeouts are reported as RequestException subclasses.
        response = requests.get(page_url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        # Raw bytes, not response.text: let BeautifulSoup detect the
        # document encoding instead of trusting the HTTP-header guess,
        # which can garble non-ASCII characters in titles.
        html_bytes = response.content
        # Relative links are resolved against the document's final URL.
        document_url = response.url or page_url
    except requests.RequestException as e:
        print(f"Oops! Couldn't retrieve {page_url}. Error: {e}")
        return books_list

    soup = BeautifulSoup(html_bytes, 'html.parser')

    # Find all book listings on page
    for article in soup.select('article.product_pod'):
        # Extracting the book title (tolerating missing tags/attributes)
        heading = article.h3
        link = heading.a if heading is not None else None
        title = link.get('title') if link is not None else None

        # Extracting the image URL
        img = article.find('img')
        img_relative_url = img.get('src') if img is not None else None

        if not title or not img_relative_url:
            # One malformed listing should not abort the whole page.
            print(f"Skipping a listing on {page_url}: missing title or image.")
            continue

        # Converting to absolute URL
        img_absolute_url = urljoin(document_url, img_relative_url)

        # Storing data
        books_list.append({
            'Title': title,
            'Image URL': img_absolute_url,
            'Extension': get_file_extension(img_absolute_url),
        })

    return books_list

# Function to visualize book cover image extensions
def visualize_extensions(df):
    """Visualizes frequency of image extensions with a bar chart.

    Args:
        df: DataFrame expected to contain an 'Extension' column.

    Returns:
        None. When ``df`` is empty or has no 'Extension' column a message is
        printed and no chart is drawn.
    """
    # An empty scrape yields a DataFrame without columns; plotting it would
    # raise instead of telling the user what happened.
    if df.empty or 'Extension' not in df.columns:
        print("No image extension data available to visualize.")
        return

    print("Visualizing image extension frequency...")
    counts = df['Extension'].value_counts()
    counts.plot(kind='bar', figsize=(10, 6), color='lightcoral')
    plt.title('Book Cover Image Extensions', fontsize=16)
    plt.xlabel('Extension', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    plt.grid(axis='y', linestyle='--')
    plt.show()

# Function to save and download results
def save_and_download(df, filename='books_scraped.csv'):
    """Write the data to a CSV file, preview it, and offer it for download.

    Args:
        df: DataFrame to export.
        filename: Name of the CSV file to create.
    """
    # Persist the rows without the index column.
    df.to_csv(path_or_buf=filename, index=False)

    announcement = f"Data saved successfully as '{filename}'! Here's a quick preview:"
    print(announcement)

    # Show at most the first ten rows in the notebook output.
    first_rows = df.head(10)
    display(first_rows)

    # Hand the saved file to google.colab's download helper.
    files.download(filename)

# Main scraping process
print("Let's start scraping some books!")
all_books = []

# Pages are numbered from 1 on the site.
for page_num in range(1, PAGES_TO_SCRAPE + 1):
    url = build_page_url(page_num)
    page_books = scrape_books_from_page(url)
    print(f"Page {page_num} gave us {len(page_books)} books. Nice!")
    all_books.extend(page_books)

print(f"\nScraping complete! Collected a total of {len(all_books)} books.")

# Creating DataFrame for analysis. The columns are pinned so the frame has
# the same schema even when no page could be scraped.
df_books = pd.DataFrame(all_books, columns=['Title', 'Image URL', 'Extension'])

if df_books.empty:
    # Every request failed (or no listing was found): there is nothing to
    # plot or download, and a success message would be misleading.
    print("No books were collected, so there is nothing to visualize or save.")
else:
    # Visualizing results
    visualize_extensions(df_books)

    # Saving and downloading results
    save_and_download(df_books)

    # Giving the user some interactive tips
    print("\n Congrats! You've successfully scraped data from 'Books to Scrape'.")
    print("You can explore further using these commands:")
    print("- df_books.head() : See the first few rows of your data")
    print("- df_books['Title'].value_counts() : See the frequency of book titles")
    print("- df_books.groupby('Extension').size() : Check counts by image extension")