In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the 50 catalogue listing pages of books.toscrape.com. Title, raw
# price text and rating word are read from each listing entry; the category
# is read from the breadcrumb on the book's own page.
titles, prices, ratings, categories = [], [], [], []

# Seconds to wait on any single request. requests applies no timeout unless
# one is passed, so without this a stalled connection would hang the cell.
REQUEST_TIMEOUT = 10

# One Session keeps the connection open across the listing and detail
# requests instead of reconnecting for every one of them.
with requests.Session() as session:
    # Loop through all 50 pages
    for page in range(1, 51):
        url = f"http://books.toscrape.com/catalogue/page-{page}.html"
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Page {page} request failed: {exc}")
            continue
        if response.status_code != 200:
            print(f"Page {page} not found")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            # Category requires going inside each book page
            book_url = "http://books.toscrape.com/catalogue/" + book.h3.a['href']
            category = None  # stays None when the detail page cannot be read
            try:
                book_response = session.get(book_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(f"Book page {book_url} request failed: {exc}")
            else:
                if book_response.status_code == 200:
                    book_soup = BeautifulSoup(book_response.content, "html.parser")
                    breadcrumb = book_soup.find("ul", class_="breadcrumb")
                    crumbs = breadcrumb.find_all("li") if breadcrumb is not None else []
                    # The third breadcrumb item is taken as the category
                    if len(crumbs) > 2:
                        category = crumbs[2].text.strip()
                else:
                    print(f"Book page {book_url} returned {book_response.status_code}")

            # Append all four values together so the lists always stay the
            # same length, even when the category lookup failed.
            titles.append(book.h3.a['title'])
            prices.append(book.find("p", class_="price_color").text)
            ratings.append(book.p['class'][1])
            categories.append(category)

# Create DataFrame
df = pd.DataFrame({
    "Title": titles,
    "Price": prices,
    "Rating": ratings,
    "Category": categories
})
df.head()


Unnamed: 0,Title,Price,Rating,Category
0,A Light in the Attic,£51.77,Three,Poetry
1,Tipping the Velvet,£53.74,One,Historical Fiction
2,Soumission,£50.10,One,Fiction
3,Sharp Objects,£47.82,Four,Mystery
4,Sapiens: A Brief History of Humankind,£54.23,Five,History


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the catalogue again, converting values while collecting them:
# prices become floats, rating words become integers 1-5, and titles are
# lower-cased and stripped when the DataFrame is built.
base_url = "http://books.toscrape.com/catalogue/page-{}.html"

titles, prices, ratings, categories = [], [], [], []

rating_map = {"One":1, "Two":2, "Three":3, "Four":4, "Five":5}

# Seconds to wait on any single request. requests applies no timeout unless
# one is passed, so without this a stalled connection would hang the cell.
REQUEST_TIMEOUT = 10

# One Session keeps the connection open across the listing and detail
# requests instead of reconnecting for every one of them.
with requests.Session() as session:
    for page in range(1, 51):  # 50 pages
        try:
            response = session.get(base_url.format(page), timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"Page {page} request failed: {exc}")
            break
        if response.status_code != 200:
            break  # stop if page not found

        soup = BeautifulSoup(response.content, "html.parser")
        books = soup.find_all("article", class_="product_pod")

        for book in books:
            # Extract category from breadcrumb
            book_url = "http://books.toscrape.com/catalogue/" + book.h3.a['href']
            category = None  # stays None when the detail page cannot be read
            try:
                book_response = session.get(book_url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print(f"Book page {book_url} request failed: {exc}")
            else:
                if book_response.status_code == 200:
                    cat_soup = BeautifulSoup(book_response.content, "html.parser")
                    crumbs = cat_soup.select("ul.breadcrumb li")
                    # The third breadcrumb item is taken as the category
                    if len(crumbs) > 2:
                        category = crumbs[2].get_text(strip=True)
                else:
                    print(f"Book page {book_url} returned {book_response.status_code}")

            # Append all four values together so the lists always stay the
            # same length, even when the category lookup failed.
            titles.append(book.h3.a['title'])
            prices.append(float(book.find("p", class_="price_color").text[1:]))  # strip £
            ratings.append(rating_map[book.p['class'][1]])  # convert rating word → number
            categories.append(category)

# Build DataFrame
df = pd.DataFrame({
    "Title": [t.lower().strip() for t in titles],  # normalized
    "Price": prices,
    "Rating": ratings,
    "Category": categories
})

df.head()


In [None]:
import numpy as np

# Clean the scraped frame. Each conversion accepts both raw scraped values
# ("£51.77", "Three") and values that are already numeric, so the cell can be
# re-run and also works on a frame whose prices and ratings were converted
# while scraping. Previously `.str` raised on a float Price column, and
# mapping numeric ratings produced NaN that `dropna` then removed.
rating_map = {"One":1, "Two":2, "Three":3, "Four":4, "Five":5}

df = (
    df.assign(
        # Clean Price: go through str so float prices pass through unchanged;
        # anything that is not a number becomes NaN and is dropped below.
        Price=pd.to_numeric(
            df['Price'].astype(str).str.replace("£", "", regex=False),
            errors="coerce",
        ),
        # Normalize Title
        Title=df['Title'].str.lower().str.strip(),
        # Convert Ratings to numbers: translate the rating words and leave
        # values that are not keys of rating_map as they are.
        Rating=pd.to_numeric(
            df['Rating'].map(lambda rating: rating_map.get(rating, rating)),
            errors="coerce",
        ),
    )
    # Remove duplicates / missing
    .drop_duplicates()
    .dropna()
)

df.head()


In [None]:
# Summary statistics for the Price column
price = df['Price']
price_summary = price.describe()
print("Descriptive Statistics:\n", price_summary)

median_price = np.median(price)
print("Median Price:", median_price)

# mode() can return several values; the first one is reported
most_common_price = price.mode()[0]
print("Mode Price:", most_common_price)

# Correlation matrix of price against rating
price_rating_corr = df[['Price','Rating']].corr()
print("Correlation:\n", price_rating_corr)

# The ten categories containing the most books
top_categories = df['Category'].value_counts().head(10)
print("Most popular categories:\n", top_categories)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Each figure is drawn on its own explicitly created axes.

# 1. Price Distribution: 30-bin histogram with a KDE curve
fig, ax = plt.subplots()
sns.histplot(df['Price'], bins=30, kde=True, ax=ax)
ax.set_title("Book Price Distribution")
plt.show()

# 2. Boxplot of prices
fig, ax = plt.subplots()
sns.boxplot(x=df['Price'], ax=ax)
ax.set_title("Price Spread")
plt.show()

# 3. Scatter: Price vs Rating
fig, ax = plt.subplots()
sns.scatterplot(x='Rating', y='Price', data=df, ax=ax)
ax.set_title("Price vs Rating")
plt.show()

# 4. Category Popularity (Top 10): bar chart of book counts per category
fig, ax = plt.subplots()
category_counts = df['Category'].value_counts().head(10)
category_counts.plot(kind='bar', ax=ax)
ax.set_title("Top 10 Book Categories by Count")
plt.show()
