## Web Scraping 
   1. Parsing using BeautifulSoup
   2. Saving the scraped data to a CSV file

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

In [3]:
# The rating is given as a word rather than a digit, so translate it to a number.
def get_star_rating(star_class):
    """Translate a star-rating class value into its numeric rating.

    Args:
        star_class: Value tested with ``in`` against each rating word. The
            caller in this notebook passes the ``class`` attribute of a
            ``p.star-rating`` element.

    Returns:
        The integer 1-5 belonging to the first rating word (checked in the
        order One, Two, Three, Four, Five) found in ``star_class``, or
        ``None`` when none of the words is present.
    """
    # Position in this tuple (1-based) is the numeric rating.
    words = ("One", "Two", "Three", "Four", "Five")
    return next(
        (score for score, word in enumerate(words, start=1) if word in star_class),
        None,
    )

# Scrape all catalogue pages and collect one record (dict) per book.
base_url = "http://books.toscrape.com/catalogue/page-{}.html"
all_books = []

# A single Session reuses the connection across the sequential page requests.
with requests.Session() as session:
    for page in range(1, 51):  # 50 pages total
        url = base_url.format(page)
        # Without a timeout, a stalled connection would hang this cell indefinitely.
        response = session.get(url, timeout=10)
        # Fail loudly on 4xx/5xx: parsing an error page would silently yield
        # zero books for that page and leave the CSV incomplete.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Each book on a listing page is an <article class="product_pod">.
        for book in soup.select("article.product_pod"):
            title = book.h3.a["title"]
            price = book.select_one(".price_color").text.strip()
            availability = book.select_one(".availability").text.strip()
            # The "class" attribute is passed on as-is; get_star_rating
            # searches it for the rating word.
            rating_class = book.select_one("p.star-rating")["class"]
            rating = get_star_rating(rating_class)

            all_books.append({
                "title": title,
                "price": price,
                "availability": availability,
                "rating": rating
            })

# Save raw scraped data to a CSV (price is still the raw text at this point).
with open("books.csv", "w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "price", "availability", "rating"])
    writer.writeheader()
    writer.writerows(all_books)

In [5]:
import pandas as pd

# Round-trip check: load the CSV written by the scraper and print its first rows.
df = pd.read_csv(filepath_or_buffer="books.csv")
print(df.head(n=5))

                                   title   price availability  rating
0                   A Light in the Attic  £51.77     In stock       3
1                     Tipping the Velvet  £53.74     In stock       1
2                             Soumission  £50.10     In stock       1
3                          Sharp Objects  £47.82     In stock       4
4  Sapiens: A Brief History of Humankind  £54.23     In stock       5


In [7]:
# Clean the raw scrape and write the result to a new CSV file.
# The raw file is re-read here, so this cell can be re-run on its own.
df = (
    pd.read_csv("books.csv", encoding="utf-8")
    .assign(
        # Keep only digits and dots (drops the currency symbol), then parse as float.
        price=lambda frame: (
            frame['price']
            .astype(str)
            .str.replace(r'[^0-9.]', '', regex=True)
            .astype(float)
        ),
        # Trim whitespace and remove any trailing parenthesised text.
        availability=lambda frame: (
            frame['availability']
            .str.strip()
            .str.replace(r'\s*\(.*\)', '', regex=True)
        ),
    )
)

df.to_csv("books_cleaned.csv", index=False)

print(f"Cleaned data saved to books_cleaned.csv with {len(df)} records.")

Cleaned data saved to books_cleaned.csv with 1000 records.


In [9]:
# Show the first rows of the cleaned frame as the cell's rich output.
df.head(n=5)

Unnamed: 0,title,price,availability,rating
0,A Light in the Attic,51.77,In stock,3
1,Tipping the Velvet,53.74,In stock,1
2,Soumission,50.1,In stock,1
3,Sharp Objects,47.82,In stock,4
4,Sapiens: A Brief History of Humankind,54.23,In stock,5
