In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
def fetch_response(url, session=requests.Session()):
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        print(f"fetched: {url}")
        return response
    except requests.exceptions.HTTPError as errh:
        print("HTTP error occurred:\n", errh)
    except requests.exceptions.RequestException as err:
        print("Unexpected request error:\n", err)
    except Exception as e:
        print(f"Unhandled exception:\n{type(e).__name__}: {e}")

In [3]:
def make_soup(response):
    if not (response and response.status_code == 200):
        print("couldn't make soup with:", response)
        return
    html = response.text
    soup = BeautifulSoup(html, "lxml")
    print("soup's ready!")
    return soup

In [4]:
def extract_all_books_data(soup):
    if not soup or not (books := soup.select("article.product_pod")):
        print("soup had books data")
        return
        
    extracted_data = []
    # lambda (selector, attr): (_.get(attr) if (_:= book.select_one(selector)) else "") # to avoid NoneType has no attr x
    for book in books:
        data = {
            #image element data
            "image_href": book.select_one(".image_container a").get("href"),  # better to fail fast on no container w/ a
            "image_alt": book.select_one(".image_container a").img.get("alt"), # reach inside <a> to find <img>
            "image_src": book.select_one(".image_container a").img.get("src"),
            
            # star rating
            "star_rating": [i for i in book.select_one(".star-rating").get("class") if i != "star-rating"][0],    # from list of classes on attr filter out star-rating assume index 0 is the rating 

            # title element data
            "title_href": book.select_one("h3 a").get("href"),
            "title_text": book.select_one("h3 a").get("title"),    #.get_text()?
            
            # price and availability
            "price": book.select_one(".product_price").select_one(".price_color").get_text(strip=True),
            "availability": book.select_one(".product_price").select_one(".availability").get_text(strip=True),
        }
        extracted_data.append(data)
    print(f"got {len(extracted_data)} books")
    return extracted_data

In [5]:
__ = lambda _: [i for i in dir(_) if not i.startswith("_")]

In [6]:
# if __name__ == "__main__":

In [7]:
url = "https://books.toscrape.com"

In [8]:
response = fetch_response(url)

fetched: https://books.toscrape.com


In [9]:
html = """
<article class="product_pod">
  <div class="image_container">
    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" /></a>
  </div>
  <p class="star-rating Three">
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
  </p>
  <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
  <div class="product_price">
    <p class="price_color">Â£51.77</p>
    <p class="instock availability">
      <i class="icon-ok"></i>
      
       In stock
       
    </p>
    <form>
      <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
    </form>
  </div>
</article>
"""
soup = BeautifulSoup(html, "lxml")    # for offline

In [10]:
books = extract_all_books_data(soup)
books

got 1 books


[{'image_href': 'catalogue/a-light-in-the-attic_1000/index.html',
  'image_alt': 'A Light in the Attic',
  'image_src': 'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg',
  'star_rating': 'Three',
  'title_href': 'catalogue/a-light-in-the-attic_1000/index.html',
  'title_text': 'A Light in the Attic',
  'price': 'Â£51.77',
  'availability': 'In stock'}]

In [11]:
soup = make_soup(response)

soup's ready!


In [12]:
books = extract_all_books_data(soup)
books

got 20 books


[{'image_href': 'catalogue/a-light-in-the-attic_1000/index.html',
  'image_alt': 'A Light in the Attic',
  'image_src': 'media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg',
  'star_rating': 'Three',
  'title_href': 'catalogue/a-light-in-the-attic_1000/index.html',
  'title_text': 'A Light in the Attic',
  'price': 'Â£51.77',
  'availability': 'In stock'},
 {'image_href': 'catalogue/tipping-the-velvet_999/index.html',
  'image_alt': 'Tipping the Velvet',
  'image_src': 'media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg',
  'star_rating': 'One',
  'title_href': 'catalogue/tipping-the-velvet_999/index.html',
  'title_text': 'Tipping the Velvet',
  'price': 'Â£53.74',
  'availability': 'In stock'},
 {'image_href': 'catalogue/soumission_998/index.html',
  'image_alt': 'Soumission',
  'image_src': 'media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg',
  'star_rating': 'One',
  'title_href': 'catalogue/soumission_998/index.html',
  'title_text': 'Soumission',
  'price': 'Â£50.10'