In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import shutil

import logging
import os

from concurrent.futures import ThreadPoolExecutor

from functools import partial
from IPython.display import display, HTML

In [None]:
_logger = None

def get_logger(name="books_scraper"):
    """Return the shared project logger, building it on first use.

    On first creation the ``logs`` directory is deleted and recreated so each
    run starts with an empty ``logs/scraping.log``. Later calls return the
    cached logger and ignore ``name``.

    Handlers already attached to the named logger are closed and detached
    before the directory is removed. This matters when the defining cell is
    re-executed: ``_logger`` is reset to ``None`` but the ``logging`` registry
    still holds the previous logger with its handlers. Without the cleanup the
    old file handler would keep writing to the file that was just deleted, and
    no replacement handlers would be attached.

    :param name: logger name passed to ``logging.getLogger`` on first creation
    :return: ``logging.Logger`` at INFO level with a file and a stream handler
    """
    global _logger
    if _logger:
        return _logger

    logger = logging.getLogger(name)

    # Release stale handlers before touching the directory so no open file
    # handle points into "logs" while it is being removed.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    if os.path.exists("logs"):
        shutil.rmtree("logs")

    os.makedirs("logs", exist_ok=True)

    logger.setLevel(logging.INFO)

    formatter = logging.Formatter(
        "[%(levelname)s] %(filename)s:%(lineno)d %(funcName)s() %(message)s %(asctime)s"
    )

    # mode="w" truncates the log file; the directory was recreated just above.
    file_handler = logging.FileHandler("logs/scraping.log", mode="w")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    _logger = logger
    return logger

class DummyLogger:
    """Print-based stand-in used when the real logger cannot be created.

    Each method accepts the same call shape as the matching ``logging.Logger``
    method (message, optional ``%``-style arguments, optional keyword
    arguments), so code written against the real logger keeps working on the
    fallback. Keyword arguments such as ``exc_info`` are accepted and ignored.
    """

    def _emit(self, tag, msg, args):
        # Apply %-style arguments; if they do not fit the message, append them
        # instead of raising from inside a logging call.
        if args:
            try:
                msg = str(msg) % args
            except (TypeError, ValueError):
                msg = f"{msg} {args}"
        print(f"[{tag}] {msg}")

    def debug(self, msg, *args, **kwargs): self._emit("DEBUG", msg, args)
    def info(self, msg, *args, **kwargs): self._emit("INFO", msg, args)
    def warning(self, msg, *args, **kwargs): self._emit("WARN", msg, args)
    def error(self, msg, *args, **kwargs): self._emit("ERROR", msg, args)
    def critical(self, msg, *args, **kwargs): self._emit("CRITICAL", msg, args)
    def exception(self, msg, *args, **kwargs): self._emit("EXCEPTION", msg, args)


# Prefer the real logger; on any failure fall back to printing so the rest of
# the notebook can still run. `logger_fallback` records which one is active.
try:
    logger = get_logger()
    logger_fallback = False
except Exception as e:
    print(f"[Logger fallback] Logger not available: {type(e).__name__} - {e}")
    logger = DummyLogger()
    logger_fallback = True

In [None]:
def fetch_response(url, session=None):
    """GET ``url`` and return the response, or ``None`` when the request fails.

    Failures (HTTP error status, connection problems, timeouts, invalid URLs
    and any other exception) are logged as warnings and never raised, so
    callers must check the result for ``None``.

    :param url: address to request
    :param session: object with a ``get(url, timeout=...)`` method, normally a
        ``requests.Session``. When omitted or ``None``, one shared session is
        created on first use and reused by later calls.
    :return: the response on success, otherwise ``None``
    """
    if session is None:
        # Build the shared session lazily instead of as a default argument,
        # so nothing is created at definition time and an explicit
        # ``session=None`` is handled.
        session = getattr(fetch_response, "_shared_session", None)
        if session is None:
            session = requests.Session()
            fetch_response._shared_session = session

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        logger.info(f"fetched: {url}")
        return response
    except requests.exceptions.HTTPError as errh:
        logger.warning(f"HTTP error occurred:\n{errh}")
    except requests.exceptions.RequestException as err:
        logger.warning(f"{type(err).__name__}\n{err}")
    except Exception as e:
        # Deliberate best-effort: a failed page must not abort the whole crawl.
        logger.warning(f"Unhandled exception:\n{type(e).__name__}: {e}")
    return None

# Trimmed-down sample of a books.toscrape.com catalogue page, used for offline
# experiments. The category list is nested inside the "Books" item of
# `.nav-list`, and every <ul>/<li> is closed exactly once.
my_html = """
<head>
    <link rel="stylesheet" href="styles/styles.css" />
</head>

<body id="default" class="default">
  <header class="header container-fluid">
    <div class="page_inner">
      <div class="row">
        <div class="col-sm-8 h1"><a href="https://books.toscrape.com/index.html">Books to Scrape</a><small> We love being scraped!</small>
        </div>
      </div>
    </div>
  </header>
  <div class="container-fluid page">
    <div class="page_inner">
      <ul class="breadcrumb">
        <li>
          <a href="https://books.toscrape.com/index.html">Home</a>
        </li>
        <li class="active">All products</li>
      </ul>
      <div class="row">
        <aside class="sidebar col-sm-4 col-md-3">
          <div id="promotions_left">
          </div>
          <div class="side_categories">
            <ul class="nav nav-list">
              <li>
                <a href="https://books.toscrape.com/category/books_1/index.html">Books</a>
                <ul>
                  <li>
                    <a href="https://books.toscrape.com/category/books/travel_2/index.html">Travel</a>
                  </li>
                  <li>
                    <a href="https://books.toscrape.com/category/books/sequential-art_5/index.html">Sequential Art</a>
                  </li>
                  <li>
                    <a href="https://books.toscrape.com/category/books/crime_51/index.html">Crime</a>
                  </li>
                </ul>
              </li>
            </ul>
          </div>
        </aside>
        
        <div class="col-sm-8 col-md-9">
          <div class="page-header action">
            <h1>All products</h1>
          </div>
          <div id="messages"></div>
          <div id="promotions"></div>
          <form method="get" class="form-horizontal">
            <div style="display:none"></div>
            <strong>1000</strong> results - showing <strong>981</strong> to <strong>1000</strong>.
          </form>
          <section>
            <div>
              <ol class="row">
                <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
                  <article class="product_pod">
                    <div class="image_container">
                      <a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html"><img src="https://books.toscrape.com/media/cache/09/7c/097cb5ecc6fb3fbe1690cf0cbdea4ac5.jpg" alt="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)" class="thumbnail"></a>
                    </div>
                    <p class="star-rating three">
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                    </p>
                    <h3><a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html" title="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)">Ajin: Demi-Human, Volume 1 ...</a></h3>
                    <div class="product_price">
                      <p class="price_color">£57.06</p>
                      <p class="instock availability">
                        <i class="icon-ok"></i>
                        In stock
                      </p>
                      <form>
                        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
                      </form>
                    </div>
                  </article>
                </li>
                <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
                  <article class="product_pod">
                    <div class="image_container">
                      <a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html"><img src="https://books.toscrape.com/media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg" alt="1,000 Places to See Before You Die" class="thumbnail"></a>
                    </div>
                    <p class="star-rating Five">
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                    </p>
                    
                    <h3><a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html" title="1,000 Places to See Before You Die">1,000 Places to See ...</a></h3>
                    <div class="product_price">
                      <p class="price_color">£26.08</p>
                      <p class="instock availability">
                        <i class="icon-ok"></i>
                        In stock
                      </p>
                      <form>
                        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
                      </form>
                    </div>
                  </article>
                </li>
              </ol>
              <div>
                <ul class="pager">
                  <li class="previous"><a href="https://books.toscrape.com/catalogue/page-49.html">previous</a></li>
                  <li class="current">
                    Page 50 of 50
                  </li>
                </ul>
              </div>
            </div>
          </section>
        </div>
      </div>
    </div>
  </div>
  <footer class="footer container-fluid"></footer>
</body>
"""
class DummyResponse:
    """Minimal response-like object exposing ``text`` and ``status_code``.

    Carries the sample markup in ``my_html`` together with a 200 status, so it
    can be handed to code that only reads those two attributes.
    """

    status_code = 200
    text = my_html


# Render the sample markup in the notebook output.
display(HTML(my_html))

In [None]:
def make_soup(response=None, url=None):
    """Parse a response body into a ``BeautifulSoup`` tree.

    If ``url`` is a string it is fetched with ``fetch_response`` and the result
    replaces ``response``. The response, whether passed in or just fetched, is
    then validated. A missing response or a status other than 200 logs a
    warning and returns ``None`` instead of failing on ``response.text``.

    :param response: object with ``status_code`` and ``text`` attributes;
        ignored when ``url`` is a string
    :param url: optional address to fetch instead of using ``response``
    :return: ``BeautifulSoup`` built with the "lxml" parser, or ``None``
    """
    if isinstance(url, str):
        response = fetch_response(url)

    # Validate in both cases: a failed fetch yields None, which must not reach
    # the parser.
    if not (response and response.status_code == 200):
        logger.warning(f"Couldn't make soup with: {response}")
        return None

    soup = BeautifulSoup(response.text, "lxml")
    logger.info("Soup's ready!")
    return soup

In [None]:
def extract_blocks(soup, block_selector, fields):
    """
    Extract structured data blocks from soup.

    :param soup: BeautifulSoup object
    :param block_selector: CSS selector to identify each block
    :param fields: dict of {field_name: (selector, attr)}, relative to the block
    :return: List of dicts with extracted data
    """
    if not soup or not (blocks := soup.select(block_selector)):
        logger.info(f"No blocks found with selector '{block_selector}'. soup: {soup}")
        return []

    extracted_data = []
    for block in blocks:
        block_data = {}
        for field_name, (selector, attr) in fields.items():
            elements = block.select(selector)

            values = []
            for el in elements:
                match attr:
                    case "text":
                        values.append(el.get_text(strip=True, separator=" "))
                    case "html":
                        values.append(str(el))
                    case "inner_html":
                        values.append("".join(str(c) for c in el.contents))
                    case _:
                        values.append(el.get(attr, ""))
            
            block_data[field_name] = values if values else None
        extracted_data.append(block_data)

    logger.info(f"Extracted {len(extracted_data)} blocks using '{block_selector}'")
    return extracted_data


In [None]:
def pager(base, max_pages, fetcher=None, callback=None):
    """
    Walk a chain of paginated pages starting from `base`.

    :param base: URL of the first page
    :param max_pages: upper bound on the number of pages collected; no page
        beyond this limit is fetched
    :param fetcher: callable taking a URL and returning ``(next_url, soup)``
        for that URL, or a falsy value to stop. Without a fetcher nothing is
        fetched.
    :param callback: optional callable invoked with each page's soup; its
        return value is stored next to the soup
    :return: dict mapping each fetched URL to ``(soup, callback_result)``,
        where ``callback_result`` is ``None`` if no callback was given

    The walk also stops when the fetcher reports no next URL or points back at
    a URL that was already fetched. A page whose soup is ``None`` is not
    recorded.
    """
    pages = {}
    if not fetcher:
        return pages

    visited = set()
    current = base
    # Check the limit before fetching so no request is made for a page that
    # would be thrown away.
    while current is not None and current not in visited and len(pages) < max_pages:
        visited.add(current)

        fetched_page = fetcher(current)
        if not fetched_page:
            break

        next_url, page_soup = fetched_page
        if page_soup is not None:
            # Key by the URL the soup came from, not by the link it points to.
            pages[current] = (page_soup, callback(page_soup) if callback else None)

        # A missing or empty next link ends the walk; a repeated one is caught
        # by the `visited` check above.
        current = next_url or None
    return pages

In [None]:
def fetch_link_in_page(base_url, selector=None, attr="href"):
    """Fetch ``base_url`` and resolve the link found by ``selector`` in it.

    :param base_url: page to download and parse
    :param selector: CSS selector for the element that carries the link
    :param attr: attribute of that element holding the link (default "href")
    :return: ``None`` when the page could not be turned into a soup or no
        selector was given. Otherwise ``(link_url, soup)``, where ``link_url``
        is the link made absolute against ``base_url``, or ``None`` when the
        page has no matching element or the attribute is empty.
    """
    res = fetch_response(base_url)
    soup = make_soup(res)

    if not all([soup, selector]):
        return None

    el = soup.select_one(selector)
    link = el.get(attr) if el is not None else None

    # Always hand back the parsed page. Only the URL is None when there is
    # nothing to follow, so the caller never gets base_url back as the "next"
    # link.
    next_url = urljoin(base_url, link) if link else None
    return next_url, soup

I've decided to take advantage of Python modularization
to make this notebook the testing ground for all the tasks I set out to showcase in web scraping, using the site https://books.toscrape.com

the project consists of the files 🗃️


we'll mostly be working from this notebook 📓

with all of the inner code abstracted away and tucked into tiny modular .py scripts

exposing only the function calls in hopes of being able to reuse the same functions but with different arguments to scrape different sites



Above is a minimal version of the Books to Scrape website,
shown for demonstration. We'll be working to scrape its components for information using requests and BeautifulSoup.

The first target component will be the aside nav list, to get the list of book categories.

In [None]:
# Field specifications for extract_blocks: each entry maps a field name to
# (CSS selector relative to the block, attribute or pseudo-attribute to read).
book_fields = dict(
    # thumbnail link and image
    img_href=(".image_container a", "href"),
    img_alt=(".image_container a img", "alt"),
    img_src=(".image_container a img", "src"),
    # star rating (read from the element's class attribute)
    rating=(".star-rating", "class"),
    # title link
    book_href=("h3 a", "href"),
    book_text=("h3 a", "text"),
    # price and stock status
    price=(".price_color", "text"),
    availability=(".availability", "text"),
)
# Earlier per-field idea, kept for reference (guards against a missing element):
# lambda (selector, attr): (_.get(attr) if (_:= book.select_one(selector)) else "")

# Link and label of a category entry.
cartegory_fields = dict(
    cartegory_href=("a", "href"),
    cartegory_text=("a", "text"),
)

# Example calls:
# block = extract_blocks(soup, ".nav-list ul", cartegory_fields)
# block[0]

# block = extract_blocks(soup, ".product_pod", book_fields)

In [None]:
# Start of the crawl.
url = "https://books.toscrape.com"

# Fetcher: downloads a page and resolves the pager's "next" link in it.
fetch_next = partial(fetch_link_in_page, selector=".pager .next a")

# Callback: pulls the book fields out of every product card on a page.
extract_books = partial(extract_blocks, fields=book_fields, block_selector=".product_pod")

# Walk the catalogue from `url`; max_pages bounds how far the walk goes.
pages = pager(base=url, max_pages=3, fetcher=fetch_next, callback=extract_books)

In [None]:
# Each stored value is a (soup, callback result) pair; show only the extracted
# data, leaving blank lines between pages.
for _page_soup, extracted in pages.values():
    print(extracted, end="\n\n\n")