In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
url = "http://books.toscrape.com/"

response = requests.get(url)
html_string = response.text

document = BeautifulSoup(html_string, "html.parser")

In [8]:
book_articles = document.find_all("article", attrs={"class": "product_pod"})
len(book_articles)

20

In [9]:
first_book = book_articles[0]
first_book

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [10]:
first_book = book_articles[0]
title = first_book.find("h3").find("a")["title"]
price = first_book.find("p", attrs={"class": "price_color"}).text.strip()
availability = first_book.find("p", attrs={"class": "instock availability"}).text.strip()
book_relative_url = first_book.find("h3").find("a")["href"]

title, price, availability, book_relative_url


('A Light in the Attic',
 'Â£51.77',
 'In stock',
 'catalogue/a-light-in-the-attic_1000/index.html')

In [11]:
from urllib.parse import urljoin

book_url = urljoin(url, book_relative_url)
book_url

'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'

In [12]:
response_book = requests.get(book_url)
html_book = response_book.text

book_document = BeautifulSoup(html_book, "html.parser")


description_header = book_document.find("div", attrs={"id": "product_description"})

if description_header:
    
    description_tag = description_header.find_next_sibling("p")
    if description_tag:
        description = description_tag.text.strip()
    else:
        description = ""
else:
    description = ""

description


"It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love th It's hard to imagine a world without A Light in the Attic. This now-classic collection of poetry and drawings from Shel Silverstein celebrates its 20th anniversary with this special edition. Silverstein's humorous and creative verse can amuse the dowdiest of readers. Lemon-faced adults and fidgety kids sit still and read these rhythmic words and laugh and smile and love that Silverstein. Need proof of his genius? RockabyeRockabye baby, in the treetopDon't you know a treetopIs no safe place to rock?And who put you up there,And your cradle, too?Baby, I think someone down here'sGot it in for you. Shel, you never sounde

In [14]:
from urllib.parse import urljoin

contents = []
titles = []
categories = []
prices = []
availabilities = []
ratings = []
urls = []

for book in book_articles:
    title = book.find("h3").find("a")["title"]
    price = book.find("p", attrs={"class": "price_color"}).text.strip()
    availability = book.find("p", attrs={"class": "instock availability"}).text.strip()
    rating_tag = book.find("p", attrs={"class": "star-rating"})
    rating = None
    if rating_tag:
        classes = rating_tag.get("class", [])
        if len(classes) > 1:
            rating = classes[1]
    
    relative_url = book.find("h3").find("a")["href"]
    full_url = urljoin(url, relative_url)
    
    response_book = requests.get(full_url)
    html_book = response_book.text
    book_document = BeautifulSoup(html_book, "html.parser")
    
    description_header = book_document.find("div", attrs={"id": "product_description"})
    if description_header:
        description_tag = description_header.find_next_sibling("p")
        if description_tag:
            description = description_tag.text.strip()
        else:
            description = ""
    else:
        description = ""
    breadcrumb = book_document.find("ul", attrs={"class": "breadcrumb"})
    category = None
    if breadcrumb:
        li_tags = breadcrumb.find_all("li")
        if len(li_tags) >= 3:
            category = li_tags[2].text.strip()
    content = description if description != "" else title
    
    contents.append(content)
    titles.append(title)
    categories.append(category)
    prices.append(price)
    availabilities.append(availability)
    ratings.append(rating)
    urls.append(full_url)


In [15]:
data = {
    "content": contents,
    "title": titles,
    "category": categories,
    "price": prices,
    "availability": availabilities,
    "rating": ratings,
    "url": urls
}

df = pd.DataFrame(data)
df.head()


Unnamed: 0,content,title,category,price,availability,rating,url
0,It's hard to imagine a world without A Light i...,A Light in the Attic,Poetry,Â£51.77,In stock,Three,http://books.toscrape.com/catalogue/a-light-in...
1,"""Erotic and absorbing...Written with starling ...",Tipping the Velvet,Historical Fiction,Â£53.74,In stock,One,http://books.toscrape.com/catalogue/tipping-th...
2,"Dans une France assez proche de la nÃ´tre, un ...",Soumission,Fiction,Â£50.10,In stock,One,http://books.toscrape.com/catalogue/soumission...
3,"WICKED above her hipbone, GIRL across her hear...",Sharp Objects,Mystery,Â£47.82,In stock,Four,http://books.toscrape.com/catalogue/sharp-obje...
4,From a renowned historian comes a groundbreaki...,Sapiens: A Brief History of Humankind,History,Â£54.23,In stock,Five,http://books.toscrape.com/catalogue/sapiens-a-...


In [16]:
df.to_csv("books_corpus.csv", index=False, encoding="utf-8")
