In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

import logging
import os

from concurrent.futures import ThreadPoolExecutor

In [None]:
# Module-level cache for the single logger instance handed to every caller.
_logger = None


def get_logger(name="books_scraper"):
    """Return the shared project logger, building it on the first call.

    The first call creates the ``logs`` directory, sets the logger called
    ``name`` to INFO and, if that logger has no handlers yet, attaches a file
    handler writing to ``logs/scraper.log`` followed by a console handler.
    Every later call returns the cached logger and ignores ``name``.
    """
    global _logger
    if not _logger:
        os.makedirs("logs", exist_ok=True)

        shared = logging.getLogger(name)
        shared.setLevel(logging.INFO)

        if len(shared.handlers) == 0:
            layout = logging.Formatter(
                "[%(levelname)s] %(filename)s:%(lineno)d %(funcName)s() %(message)s %(asctime)s"
            )
            # Factories (not instances) so each handler is created, formatted
            # and attached before the next one is built: file first, console second.
            sink_factories = (
                lambda: logging.FileHandler("logs/scraper.log"),
                logging.StreamHandler,
            )
            for make_sink in sink_factories:
                sink = make_sink()
                sink.setFormatter(layout)
                shared.addHandler(sink)

        _logger = shared
    return _logger


# Build the shared logger up front. If that fails for any reason, fall back to
# a print-based stand-in so the rest of the notebook can still report progress;
# ``logger_fallback`` records which of the two is in use.
try:
    logger = get_logger()
except Exception as exc:
    print(f"[Logger fallback] Logger not available: {type(exc).__name__} - {exc}")

    class DummyLogger:
        """Print-based replacement exposing the four logging methods used here."""

        def info(self, msg):
            print(f"[INFO] {msg}")

        def warning(self, msg):
            print(f"[WARN] {msg}")

        def error(self, msg):
            print(f"[ERROR] {msg}")

        def exception(self, msg):
            print(f"[EXCEPTION] {msg}")

    logger = DummyLogger()
    logger_fallback = True
else:
    logger_fallback = False

In [None]:
def fetch_response(url, session=requests.Session()):
    """GET ``url`` through ``session`` and return the response on success.

    The default ``session`` is created once, when the function is defined, so
    calls that omit it all share that one ``requests.Session``. The request
    uses a 10 second timeout. Any failure (an ``HTTPError`` from
    ``raise_for_status``, another ``requests`` exception, or any other
    exception) is logged as a warning and ``None`` is returned.
    """
    try:
        reply = session.get(url, timeout=10)
        reply.raise_for_status()
        logger.info(f"fetched: {url}")
    except requests.exceptions.HTTPError as http_error:
        logger.warning(f"HTTP error occurred:\n{http_error}")
    except requests.exceptions.RequestException as request_error:
        logger.warning(f"{type(request_error).__name__}\n{request_error}")
    except Exception as unexpected:
        logger.warning(f"Unhandled exception:\n{type(unexpected).__name__}: {unexpected}")
    else:
        return reply
    return None

In [None]:
def make_soup(response):
    """Parse a successful response into a BeautifulSoup tree.

    Returns the soup built from ``response.text`` with the ``lxml`` parser.
    When ``response`` is falsy or its ``status_code`` is not 200, a warning is
    logged and ``None`` is returned instead.
    """
    usable = bool(response) and response.status_code == 200
    if usable:
        parsed = BeautifulSoup(response.text, "lxml")
        logger.info("soup's ready!")
        return parsed

    logger.warning(f"couldn't make soup with: {response}")
    return None

In [None]:
def fetch_next_page(base_url):
    """Fetch ``base_url`` and resolve the page its "next" pager link points to.

    Args:
        base_url: URL of the page to download and inspect.

    Returns:
        tuple: ``(next_url, soup)`` when the page contains a
        ``.pager .next a`` element with a non-empty ``href``: ``next_url`` is
        that href resolved against ``base_url`` and ``soup`` is the parsed
        page. ``(None, None)`` when the page could not be fetched/parsed or
        has no such link, so callers can stop paginating on a falsy
        ``next_url``.
    """
    soup = make_soup(fetch_response(base_url))
    if not soup:
        return None, None

    anchor = soup.select_one(".pager .next a")
    href = anchor.get("href") if anchor else None
    if not href:
        # Never hand a missing href to urljoin: urljoin(base_url, None) gives
        # back base_url itself, which would make this page look like its own
        # "next" page and send paginating callers into an endless loop.
        return None, None

    return urljoin(base_url, href), soup

In [None]:
def extract_all_books_data(soup):
    """Extract one record per ``article.product_pod`` element found in ``soup``.

    Args:
        soup: Parsed page exposing ``select`` (e.g. the result of
            ``make_soup``), or a falsy value such as ``None``.

    Returns:
        list[dict] | None: One dict per book with the keys ``image_href``,
        ``image_alt``, ``image_src``, ``star_rating``, ``title_href``,
        ``title_text``, ``price`` and ``availability``. ``None`` (after an
        info log) when ``soup`` is falsy or contains no book articles.

    Raises:
        AttributeError: If a book lacks one of the expected elements. This is
            deliberate: fail fast instead of emitting partial records.
        IndexError: If the ``.star-rating`` element carries no class other
            than ``star-rating``.
    """
    if not soup or not (books := soup.select("article.product_pod")):
        logger.info("soup had no books data")
        return None

    extracted_data = []
    for book in books:
        # Look each element up once and reuse it for every field read from it.
        image_link = book.select_one(".image_container a")
        image = image_link.img  # the <img> nested inside the <a>
        title_link = book.select_one("h3 a")
        price_box = book.select_one(".product_price")
        rating_classes = book.select_one(".star-rating").get("class")

        extracted_data.append({
            # image element data
            "image_href": image_link.get("href"),
            "image_alt": image.get("alt"),
            "image_src": image.get("src"),

            # star rating: whichever class is not the "star-rating" marker;
            # the first such class is assumed to be the rating word
            "star_rating": [cls for cls in rating_classes if cls != "star-rating"][0],

            # title element data ("title" attribute, not the link text)
            "title_href": title_link.get("href"),
            "title_text": title_link.get("title"),

            # price and availability
            "price": price_box.select_one(".price_color").get_text(strip=True),
            "availability": price_box.select_one(".availability").get_text(strip=True),
        })
    return extracted_data

In [None]:
def pager(base):
    """Collect the URLs of every page reachable from ``base`` via "next" links.

    Calls ``fetch_next_page`` repeatedly, starting at ``base`` and moving to
    each reported next page in turn.

    Args:
        base: URL of the first page.

    Returns:
        list: The next-page URLs in visiting order; ``base`` itself is not
        included. Empty when ``base`` has no next page.
    """
    links = []
    visited = {base}
    current = base
    while True:
        result = fetch_next_page(current)
        next_url = result[0] if result else None
        # Stop when no next page is reported (falsy URL) and also when the
        # reported URL was already visited: a page may be reported as linking
        # back to itself or to an earlier page, and following it would never end.
        if not next_url or next_url in visited:
            break
        links.append(next_url)
        visited.add(next_url)
        current = next_url
    return links

In [None]:
def complete_links(books: list[dict], base_url):
    """Resolve every link-like field of each book against ``base_url``.

    A field counts as link-like when its key contains ``"href"`` or ``"src"``.
    Each entry of ``books`` is replaced, in place, by a copy whose link-like
    values have been passed through ``urljoin(base_url, value)``; the dicts
    originally held in the list are left unmodified. Returns the same
    ``books`` list.
    """
    for index in range(len(books)):
        original = books[index]
        resolved = original.copy()
        for field in original:
            if "href" in field or "src" in field:
                resolved[field] = urljoin(base_url, original[field])
        books[index] = resolved

    return books

In [None]:
# Root URL of the site the scraping helpers in this notebook are pointed at.
url = "https://books.toscrape.com"
# Offline fixture: the markup of a single ``article.product_pod`` book card,
# with relative links. It is served as the body of ``DummyResponse`` below.
# NOTE(review): the price reads "Â£51.77" -- this looks like a UTF-8 "£"
# decoded as Latin-1; presumably it mirrors what the live page yields, confirm.
html = """
<article class="product_pod">
  <div class="image_container">
    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" /></a>
  </div>
  <p class="star-rating Three">
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
  </p>
  <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
  <div class="product_price">
    <p class="price_color">Â£51.77</p>
    <p class="instock availability">
      <i class="icon-ok"></i>
      
       In stock
       
    </p>
    <form>
      <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
    </form>
  </div>
</article>
"""
class DummyResponse:
    """Offline stand-in for a successful HTTP response.

    Carries only ``text`` and ``status_code``, the two attributes that
    ``make_soup`` reads from a response.
    """

    # Body served in place of a real download: the sample markup in ``html``.
    text = html
    # Status code that ``make_soup`` accepts as success.
    status_code = 200

I've decided to take advantage of Python modularization
to make this notebook the testing ground for all the tasks I set out to showcase in web scraping, using the site https://books.toscrape.com

the project consists of the files 🗃️

In [None]:
%%sh
# List the files in the current working directory (the project's files).
ls

we'll mostly be working from this notebook 📓

with all of the inner code abstracted away and tucked into tiny modular .py scripts

exposing only the function calls in hopes of being able to reuse the same functions but with different arguments to scrape different sites

In [None]:
%%html
<header>
    <link rel="stylesheet" href="css/styles.css" />
</header>

<body id="default" class="default">
    <header class="header container-fluid">
        <div class="page_inner">
            <div class="row">
                <div class="col-sm-8 h1"><a href="../index.html">Books to Scrape</a><small> We love being scraped!</small>
</div>
            </div>
        </div>
    </header>
<div class="container-fluid page">
    <div class="page_inner">
    <ul class="breadcrumb">
        <li>
            <a href="../index.html">Home</a>
        </li>
        <li class="active">All products</li>
    </ul>
        <div class="row">
            <aside class="sidebar col-sm-4 col-md-3">
                <div id="promotions_left">
                </div>
        <div class="side_categories">
            <ul class="nav nav-list">
                    <li>
                        <a href="https://books.toscrape.com/category/books_1/index.html">Books</a>
                        <ul>
                    <li>
                        <a href="https://books.toscrape.com/category/books/travel_2/index.html">Travel</a>
                        </li>
                    <li>
                        <a href="https://books.toscrape.com/category/books/sequential-art_5/index.html">Sequential Art</a>
                        </li>        
                    <li>
                        <a href="https://books.toscrape.com/category/books/crime_51/index.html">Crime</a>
                        </li>
                            </ul></li>
            </ul>
        </div>
            </aside>

            <div class="col-sm-8 col-md-9">
                <div class="page-header action">
                    <h1>All products</h1>
                </div>
<div id="messages"></div>
                <div id="promotions"></div>         
    <form method="get" class="form-horizontal">
        <div style="display:none"> 
        </div>
                    <strong>1000</strong> results - showing <strong>981</strong> to <strong>1000</strong>. 
    </form>
        <section>
<div>
                <ol class="row">
                        <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html"><img src="https://books.toscrape.com/media/cache/09/7c/097cb5ecc6fb3fbe1690cf0cbdea4ac5.jpg" alt="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)" class="thumbnail"></a>
            </div>
                <p class="star-rating Four">
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                </p>
            <h3><a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html" title="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)">Ajin: Demi-Human, Volume 1 ...</a></h3>
            <div class="product_price">
        <p class="price_color">£57.06</p>
<p class="instock availability">
    <i class="icon-ok"></i>
        In stock
</p>
    <form>
        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
    </form> 
            </div>
    </article>
</li>
                        <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
    <article class="product_pod">
            <div class="image_container">
                    <a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html"><img src="https://books.toscrape.com/media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg" alt="1,000 Places to See Before You Die" class="thumbnail"></a>      
            </div>
                <p class="star-rating Five">
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                    <i class="icon-star"></i>
                </p>
            
            <h3><a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html" title="1,000 Places to See Before You Die">1,000 Places to See ...</a></h3>
            <div class="product_price">
        <p class="price_color">£26.08</p>
<p class="instock availability">
    <i class="icon-ok"></i>
        In stock
</p>
    <form>
        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
    </form>
            </div>
    </article>
</li>
                </ol>
    <div>
        <ul class="pager">
                <li class="previous"><a href="page-49.html">previous</a></li>
            <li class="current">
                Page 50 of 50
            </li>
        </ul>
    </div>
            </div>
        </section>
            </div>
        </div>
    </div>
</div>


    
<footer class="footer container-fluid"></footer>
</body>

Above is a minimal version of the Books to Scrape website,
shown for demonstration. We'll be working to scrape its components for information using requests and BeautifulSoup.

The first target component will be the aside nav list, to get the list of book categories.

In [None]:
# urls = ["https://books.toscrape.com/catalogue/page-1.html", "https://books.toscrape.com/catalogue/page-2.html", "https://books.toscrape.com/catalogue/page-3.html", "https://books.toscrape.com/catalogue/page-4.html"]

# with ThreadPoolExecutor(max_workers=5) as executor:
#     results = executor.map(fetch_response, urls)

# for i in results:
#     print(i)

In [None]:
# response = fetch_response(url) or DummyResponse()
# soup = make_soup(response)
# books = extract_all_books_data(soup)
# book = books[0]

In [None]:
# block for retrieving the list of categories
# nav = soup.select_one(".nav-list")