In [None]:
from urllib.parse import urljoin

from functools import partial
from IPython.display import display, HTML

from main.utils import get_logger
from main.scraper import (fetch_response, make_soup)

In [None]:
try:
    logger = get_logger()
    logger_fallback = False
except Exception as e:
    # get_logger() is project code; if it fails, keep the notebook usable by
    # falling back to a print-based logger instead of aborting the run.
    print(f"[Logger fallback] Logger not available: {type(e).__name__} - {e}")

    class DummyLogger:
        """
        Print-based stand-in for the project logger.

        Every method accepts the same call shape as the standard
        ``logging.Logger`` methods (``msg, *args, **kwargs``) so that calls
        such as ``logger.info("page %s", n)`` or ``logger.error(msg, exc_info=True)``
        do not raise when the fallback is active. Keyword arguments are
        accepted and ignored.
        """

        def _emit(self, level, msg, *args, **kwargs):
            """Format ``msg`` (applying %-style ``args`` if given) and print it with a level tag."""
            text = str(msg)
            if args:
                try:
                    text = text % args
                except (TypeError, ValueError):
                    # Malformed format string: still show everything rather than raise.
                    text = f"{text} {args}"
            print(f"[{level}] {text}")

        def debug(self, msg, *args, **kwargs):
            self._emit("DEBUG", msg, *args, **kwargs)

        def info(self, msg, *args, **kwargs):
            self._emit("INFO", msg, *args, **kwargs)

        def warning(self, msg, *args, **kwargs):
            self._emit("WARN", msg, *args, **kwargs)

        def error(self, msg, *args, **kwargs):
            self._emit("ERROR", msg, *args, **kwargs)

        def exception(self, msg, *args, **kwargs):
            self._emit("EXCEPTION", msg, *args, **kwargs)

        def critical(self, msg, *args, **kwargs):
            self._emit("CRITICAL", msg, *args, **kwargs)

    logger = DummyLogger()
    logger_fallback = True

In [None]:
# Trimmed-down, offline HTML fixture modelled on a books.toscrape.com listing
# page: breadcrumb, sidebar category list (nested under "Books"), two product
# cards and the pager. Useful for trying selectors without hitting the network.
my_html = """
<head>
    <link rel="stylesheet" href="styles/styles.css" />
</head>

<body id="default" class="default">
  <header class="header container-fluid">
    <div class="page_inner">
      <div class="row">
        <div class="col-sm-8 h1"><a href="https://books.toscrape.com/index.html">Books to Scrape</a><small> We love being scraped!</small>
        </div>
      </div>
    </div>
  </header>
  <div class="container-fluid page">
    <div class="page_inner">
      <ul class="breadcrumb">
        <li>
          <a href="https://books.toscrape.com/index.html">Home</a>
        </li>
        <li class="active">All products</li>
      </ul>
      <div class="row">
        <aside class="sidebar col-sm-4 col-md-3">
          <div id="promotions_left">
          </div>
          <div class="side_categories">
            <ul class="nav nav-list">
              <li>
                <a href="https://books.toscrape.com/category/books_1/index.html">Books</a>
                <ul>
                  <li>
                    <a href="https://books.toscrape.com/category/books/travel_2/index.html">Travel</a>
                  </li>
                  <li>
                    <a href="https://books.toscrape.com/category/books/sequential-art_5/index.html">Sequential Art</a>
                  </li>
                  <li>
                    <a href="https://books.toscrape.com/category/books/crime_51/index.html">Crime</a>
                  </li>
                </ul>
              </li>
            </ul>
          </div>
        </aside>

        <div class="col-sm-8 col-md-9">
          <div class="page-header action">
            <h1>All products</h1>
          </div>
          <div id="messages"></div>
          <div id="promotions"></div>
          <form method="get" class="form-horizontal">
            <div style="display:none"></div>
            <strong>1000</strong> results - showing <strong>981</strong> to <strong>1000</strong>.
          </form>
          <section>
            <div>
              <ol class="row">
                <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
                  <article class="product_pod">
                    <div class="image_container">
                      <a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html"><img src="https://books.toscrape.com/media/cache/09/7c/097cb5ecc6fb3fbe1690cf0cbdea4ac5.jpg" alt="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)" class="thumbnail"></a>
                    </div>
                    <p class="star-rating three">
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                    </p>
                    <h3><a href="https://books.toscrape.com/ajin-demi-human-volume-1-ajin-demi-human-1_4/index.html" title="Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)">Ajin: Demi-Human, Volume 1 ...</a></h3>
                    <div class="product_price">
                      <p class="price_color">£57.06</p>
                      <p class="instock availability">
                        <i class="icon-ok"></i>
                        In stock
                      </p>
                      <form>
                        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
                      </form>
                    </div>
                  </article>
                </li>
                <li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
                  <article class="product_pod">
                    <div class="image_container">
                      <a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html"><img src="https://books.toscrape.com/media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg" alt="1,000 Places to See Before You Die" class="thumbnail"></a>
                    </div>
                    <p class="star-rating Five">
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                      <i class="icon-star"></i>
                    </p>

                    <h3><a href="https://books.toscrape.com/1000-places-to-see-before-you-die_1/index.html" title="1,000 Places to See Before You Die">1,000 Places to See ...</a></h3>
                    <div class="product_price">
                      <p class="price_color">£26.08</p>
                      <p class="instock availability">
                        <i class="icon-ok"></i>
                        In stock
                      </p>
                      <form>
                        <button type="submit" class="btn btn-primary btn-block" data-loading-text="Adding...">Add to basket</button>
                      </form>
                    </div>
                  </article>
                </li>
              </ol>
              <div>
                <ul class="pager">
                  <li class="previous"><a href="https://books.toscrape.com/catalogue/page-49.html">previous</a></li>
                  <li class="current">
                    Page 50 of 50
                  </li>
                </ul>
              </div>
            </div>
          </section>
        </div>
      </div>
    </div>
  </div>
  <footer class="footer container-fluid"></footer>
</body>
"""

# Offline HTML fixture modelled on the webscraper.io e-commerce test site
# ("Computers / Laptops" listing): navbar, sidebar categories, three product
# cards, pagination and footer. Links are site-relative.
other_html = """
<body>
  
  <header class="navbar fixed-top navbar-expand-lg navbar-dark navbar-static svg-background" id="navbar-top" role="banner">
    <div class="container">
      <div class="navbar-header">
        <a data-bs-target=".side-collapse" data-bs-target-2=".side-collapse-container" data-bs-toggle="collapse-side">
          <button aria-controls="navbar" aria-expanded="false" class="navbar-toggler float-end collapsed" data-bs-target="#navbar" data-bs-target-2=".side-collapse-container" data-bs-target-3=".side-collapse" data-bs-toggle="collapse" type="button">
            <span class="visually-hidden">Toggle navigation</span>
            <span class="icon-bar top-bar"></span>
            <span class="icon-bar middle-bar"></span>
            <span class="icon-bar bottom-bar"></span>
            <span class="icon-bar extra-bottom-bar"></span>
          </button>
        </a>
        <div class="navbar-brand">
          <a href="/"><img alt="Web Scraper" src="/img/logo_white.svg" /></a>
        </div>
      </div>
      <div class="side-collapse in">
        <nav class="navbar-collapse collapse" id="navbar" role="navigation">
          <ul class="nav navbar-nav navbar-right">
            <li class="nav-item">
              <a class="nav-link menuitm" href="/">
                <p>Web Scraper</p>
                <div class="crta"></div>
              </a>
            </li>
            <li class="nav-item dropdown">
              <button aria-expanded="false" aria-haspopup="true" class="menuitm nav-link dropdown-toggle" data-bs-toggle="dropdown" id="dropdownMenuLink" role="button">
                <p>
                  Learn
                  <i class="ws-icon ws-icon-down"></i>
                </p>
                <span class="crta"></span>
              </button>
              <ul aria-labelledby="dropdownMenuLink" class="dropdown-menu">
                <li>
                  <a class="dropdown-item" href="/documentation">Documentation</a>
                </li>
              </ul>
            </li>
          </ul>
        </nav>
      </div>
    </div>
  </header>
  
  
  
  
  <div class="wrapper">
    <div class="formenu-here container-fluid">
    </div>
    
    <div class="container-fluid blog-hero">
      <div class="container">
        <div class="row">
          <div class="col-lg-12">
            <h1>Test Sites</h1>
          </div>
        </div>
      </div>
    </div>
    
    <div class="container test-site">
      <div class="row">
        <div class="col-lg-3 sidebar">
          <div class="navbar-light sidebar" role="navigation">
            <div class="sidebar-nav navbar-collapse">
              <ul class="nav flex-column" id="side-menu" itemscope="" itemtype="https://schema.org/SiteNavigationElement">
                <li class="nav-item">
                  <a class="nav-link" href="/test-sites/e-commerce/static" itemprop="url"><span itemprop="name">Home</span></a>
                </li>
                <li class="nav-item active">
                  <a aria-label="Navigation category" class="category-link nav-link" href="/test-sites/e-commerce/static/computers" itemprop="url">
                    <span itemprop="name">Computers</span>
                    <i aria-hidden="true" class="ws-icon ws-icon-right"></i>
                  </a>
                  <ul class="nav nav-second-level" itemscope="" itemtype="https://schema.org/SiteNavigationElement">
                    <li class="nav-item">
                      <a aria-label="Navigation subcategory" class="nav-link subcategory-link active" href="/test-sites/e-commerce/static/computers/laptops" itemprop="url">
                        <span itemprop="name">Laptops</span>
                      </a>
                    </li>
                    <li class="nav-item">
                      <a aria-label="Navigation subcategory" class="nav-link subcategory-link" href="/test-sites/e-commerce/static/computers/tablets" itemprop="url">
                        <span itemprop="name">Tablets</span>
                      </a>
                    </li>
                  </ul>
                </li>
                <li class="nav-item">
                  <a aria-label="Navigation category" class="category-link nav-link" href="/test-sites/e-commerce/static/phones" itemprop="url">
                    <span itemprop="name">Phones</span>
                    <i aria-hidden="true" class="ws-icon ws-icon-right"></i>
                  </a>
                </li>
              </ul>
            </div>
          </div>
        </div>
        <div class="col-lg-9">
          <h1 class="page-header">Computers / Laptops</h1>
          
          
          
          <div class="row">
            <div class="col-md-4 col-xl-4 col-lg-4">
              <div class="card thumbnail" itemscope="" itemtype="https://schema.org/Product">
                <div class="product-wrapper card-body">
                  <img alt="item" class="img-fluid card-img-top image img-responsive" itemprop="image" src="/images/test-sites/e-commerce/items/cart2.png" />
                  <div class="caption">
                    <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
                      <span itemprop="price">$416.99</span>
                      <meta content="USD" itemprop="priceCurrency" />
                    </h4>
                    <h4>
                      <a class="title" href="/test-sites/e-commerce/static/product/31" itemprop="name" title="Packard 255 G2">
                        Packard 255 G2
                      </a>
                    </h4>
                    <p class="description card-text" itemprop="description">15.6", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows 8.1</p>
                  </div>
                  <div class="ratings" itemprop="aggregateRating" itemscope="" itemtype="https://schema.org/AggregateRating">
                    <p class="review-count float-end">
                      <span itemprop="reviewCount">2</span> reviews
                    </p>
                    <p data-rating="2">
                      <span class="ws-icon ws-icon-star"></span>
                      <span class="ws-icon ws-icon-star"></span>
                    </p>
                  </div>
                </div>
              </div>
            </div>
            <div class="col-md-4 col-xl-4 col-lg-4">
              <div class="card thumbnail" itemscope="" itemtype="https://schema.org/Product">
                <div class="product-wrapper card-body">
                  <img alt="item" class="img-fluid card-img-top image img-responsive" itemprop="image" src="/images/test-sites/e-commerce/items/cart2.png" />
                  <div class="caption">
                    <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
                      <span itemprop="price">$306.99</span>
                      <meta content="USD" itemprop="priceCurrency" />
                    </h4>
                    <h4>
                      <a class="title" href="/test-sites/e-commerce/static/product/32" itemprop="name" title="Aspire E1-510">
                        Aspire E1-510
                      </a>
                    </h4>
                    <p class="description card-text" itemprop="description">15.6", Pentium N3520 2.16GHz, 4GB, 500GB, Linux</p>
                  </div>
                  <div class="ratings" itemprop="aggregateRating" itemscope="" itemtype="https://schema.org/AggregateRating">
                    <p class="review-count float-end">
                      <span itemprop="reviewCount">2</span> reviews
                    </p>
                    <p data-rating="3">
                      <span class="ws-icon ws-icon-star"></span>
                      <span class="ws-icon ws-icon-star"></span>
                      <span class="ws-icon ws-icon-star"></span>
                    </p>
                  </div>
                </div>
              </div>
            </div>
            <div class="col-md-4 col-xl-4 col-lg-4">
              <div class="card thumbnail" itemscope="" itemtype="https://schema.org/Product">
                <div class="product-wrapper card-body">
                  <img alt="item" class="img-fluid card-img-top image img-responsive" itemprop="image" src="/images/test-sites/e-commerce/items/cart2.png" />
                  <div class="caption">
                    <h4 class="price float-end card-title pull-right" itemprop="offers" itemscope="" itemtype="https://schema.org/Offer">
                      <span itemprop="price">$1178.99</span>
                      <meta content="USD" itemprop="priceCurrency" />
                    </h4>
                    <h4>
                      <a class="title" href="/test-sites/e-commerce/static/product/33" itemprop="name" title="ThinkPad T540p">
                        ThinkPad T540p
                      </a>
                    </h4>
                    <p class="description card-text" itemprop="description">15.6", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit</p>
                  </div>
                  <div class="ratings" itemprop="aggregateRating" itemscope="" itemtype="https://schema.org/AggregateRating">
                    <p class="review-count float-end">
                      <span itemprop="reviewCount">2</span> reviews
                    </p>
                    <p data-rating="1">
                      <span class="ws-icon ws-icon-star"></span>
                    </p>
                  </div>
                </div>
              </div>
            </div>
          </div>
          
          
          
          
          
          <div id="static-pagination">
            <nav>
              <ul class="pagination">
                <li aria-disabled="true" aria-label="« Previous" class="page-item disabled">
                  <span aria-hidden="true" class="page-link">‹</span>
                </li>
                <li aria-current="page" class="page-item active"><span class="page-link">1</span></li>
                <li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/computers/laptops?page=2">2</a></li>
                <li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/computers/laptops?page=3">3</a></li>
                <li aria-disabled="true" class="page-item disabled"><span class="page-link">...</span></li>
                <li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/computers/laptops?page=19">19</a></li>
                <li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/computers/laptops?page=20">20</a></li>
                
                
                
                <li class="page-item">
                  <a aria-label="Next »" class="page-link next" href="/test-sites/e-commerce/static/computers/laptops?page=2" rel="next">›</a>
                </li>
              </ul>
            </nav>
          </div>
          
          
        </div>
      </div>
    </div>
    
    
    <div class="clearfix"></div>
    <div class="push"></div>
  </div>
  
  
  
  
  <div class="container-fluid footer" id="layout-footer">
    <div class="container">
      <div class="row">
        <div class="col-lg-3">
          <ul>
            <li>
              <p>Company</p>
            </li>
            <li>
              <a href="/about-us">About us</a>
            </li>
            <li>
              <a href="/extension-privacy-policy">Browser Extension Privacy Policy</a>
            </li>
            <li>
              <a href="https://webscraper.io/downloads/Web_Scraper_Media_Kit.zip">Media kit</a>
            </li>
            <li><a href="/jobs">Jobs</a></li>
          </ul>
        </div>
        <div class="col-lg-3">
          <ul>
            <li>
              <p>Resources</p>
            </li>
            <li><a href="/blog">Blog</a></li>
            <li>
              <a href="/documentation">Documentation</a>
            </li>
            <li>
              <a href="/tutorials">Video Tutorials</a>
            </li>
            <li>
              <a href="/screenshots">Screenshots</a>
            </li>
          </ul>
        </div>
      </div>
      <div class="row">
        <div class="col-lg-12">
          <p class="copyright">Copyright &copy; 2025
            <b>Web Scraper</b> | All rights reserved
          </p>
        </div>
      </div>
    </div>
  </div>
</body>
"""
#display(HTML(my_html))
#display(HTML(other_html))

In [None]:
def extract_blocks(soup, block_selector, fields):
    """
    Extract structured data blocks from soup.

    :param soup: BeautifulSoup object
    :param block_selector: CSS selector to identify each block
    :param fields: dict of {field_name: (selector, attr)}, relative to the block
    :return: List of dicts with extracted data
    """
    if not soup or not (blocks := soup.select(block_selector)):
        logger.info(f"No blocks found with selector '{block_selector}'. soup: {soup}")
        return []

    extracted_data = []
    for block in blocks:
        block_data = {}
        for field_name, (selector, attr) in fields.items():
            elements = block.select(selector)

            values = []
            for el in elements:
                match attr:
                    case "text":
                        values.append(el.get_text(strip=True, separator=" "))
                    case "html":
                        values.append(str(el))
                    case "inner_html":
                        values.append("".join(str(c) for c in el.contents))
                    case _:
                        values.append(el.get(attr, ""))
            
            block_data[field_name] = values if values else None
        extracted_data.append(block_data)

    logger.info(f"Extracted {len(extracted_data)} blocks using '{block_selector}'")
    return extracted_data


In [None]:
def _next_page_link(page_soup, block_selector, element_selector, block_index, item_index):
    """Return the href of the "next page" link found in `page_soup`, or None if there is none."""
    blocks = extract_blocks(page_soup, block_selector, {element_selector: (element_selector, "href")})
    try:
        hrefs = blocks[block_index].get(element_selector) or []
        return hrefs[item_index] or None
    except IndexError:
        # No matching block, or the block holds no matching link (e.g. last page).
        return None


def pager(base, max_pages, block_selector, element_selector, fetcher, callback=None, block_index=0, item_index=0):
    """
    Iterate through paginated links starting from `base`.

    Each step fetches the current URL, records the page, then follows the
    "next" link found on it. The walk stops when `max_pages` pages have been
    collected, when `fetcher` returns a falsy value, when a page has no next
    link (the last page is still recorded), or when the next link points at a
    URL that was already visited.

    :param base: URL of the first page
    :param max_pages: maximum number of pages to fetch
    :param block_selector: CSS selector of the block(s) containing the next-page link
    :param element_selector: CSS selector, relative to a block, of the link whose
        `href` is followed
    :param fetcher: page-fetching callable, invoked as `fetcher(url=...)`
    :param callback: optional callable invoked with each fetched page; its result
        is stored next to the page
    :param block_index: index of the matched block to read the link from
    :param item_index: index of the matched link inside that block
    :return: dict {page_url: (page, callback_result or None)} in fetch order,
        keyed by the URL the page was fetched from
    """
    pages = {}
    current_url = base
    while len(pages) < max_pages and current_url not in pages:
        page_soup = fetcher(url=current_url)
        if not page_soup:
            break
        pages[current_url] = (page_soup, callback(page_soup) if callback else None)

        link = _next_page_link(page_soup, block_selector, element_selector, block_index, item_index)
        if not link:
            break
        # Links may be relative; resolve them against the page they came from.
        current_url = urljoin(current_url, link)
    return pages

I've decided to take advantage of Python modularization
to make this notebook the testing ground for all the tasks I set out to showcase in web scraping, using the site https://books.toscrape.com

the project consists of the files 🗃️


we'll mostly be working from this notebook 📓

with all of the inner code abstracted away and tucked into tiny modular .py scripts

exposing only the function calls in hopes of being able to reuse the same functions but with different arguments to scrape different sites



Above is a minimal version of the Books to Scrape website,
shown for demonstration. We'll be working to scrape its components for information using requests and BeautifulSoup.

The first target component will be the aside nav list, to get the list of book categories.

In [None]:
# Field maps for extract_blocks(): {output_field: (css_selector, attr)}.
# Selectors are relative to each matched block; `attr` is "text", "html",
# "inner_html" or the name of an HTML attribute to read.

# One `.product_pod` card on books.toscrape.com.
book_fields = {
    # image element data
    "img_href": (".image_container a", "href"),
    "img_alt": (".image_container a img", "alt"),
    "img_src": (".image_container a img", "src"),
    # star rating
    "rating": (".star-rating", "class"),
    # title element data
    "book_href": ("h3 a", "href"),
    "book_text": ("h3 a", "text"),
    # price and availability
    "price": (".price_color", "text"),
    "availability": (".availability", "text"),
}

# Category links in the books.toscrape.com sidebar.
# (Identifier and key spelling kept as-is because other cells may refer to them.)
cartegory_fields = {
    "cartegory_href": ("a", "href"),
    "cartegory_text": ("a", "text"),
}

# One `.product-wrapper.card-body` card on the webscraper.io test site.
product_fields = {
    "img_alt": ("img", "alt"),
    "img_src": ("img", "src"),
    "price": (".caption span", "text"),
    "currency": (".caption meta", "content"),
    "title": (".caption h4 a", "title"),
    "description": (".caption p.description", "text"),
    "review-count": (".ratings p.review-count span", "text"),
    # Only the <p> that actually carries data-rating; a bare ".ratings p" also
    # matches p.review-count and would add an empty string to the result.
    "rating": (".ratings p[data-rating]", "data-rating"),
}

# Example usage:
#   extract_blocks(soup, ".nav-list ul", cartegory_fields)
#   extract_blocks(soup, ".product_pod", book_fields)

In [None]:
#url = "https://books.toscrape.com"

#extract_books = partial(extract_blocks, block_selector=".product_pod", fields=book_fields)

#pages = pager(url, 3, ".pager .next", "a", fetcher=make_soup, callback=extract_books)

In [None]:
# Listing page used for the product-extraction demo; `url` is reused by the
# pagination cell further down.
url = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops" 

# make_soup comes from main.scraper — presumably it fetches the URL and returns
# a parsed document (TODO confirm against the module).
soup = make_soup(url=url)
# Last expression of the cell: shows one dict per product card on this page.
extract_blocks(soup, ".product-wrapper.card-body", product_fields)

In [None]:
# Pre-bind the selector and field map so the result can be called with just a
# soup — that is the shape pager() expects of its `callback`.
extract_products = partial(extract_blocks, block_selector=".product-wrapper.card-body", fields=product_fields)

# Follow the "next" link for up to 3 pages. block_index=-1 reads the link from
# the last `.page-item` block on each page.
pages = pager(url, 3, ".page-item", "a.next", fetcher=make_soup, callback=extract_products, block_index=-1)

In [None]:
# Each stored entry is (page_soup, callback_result); show only the extracted
# data, separated by blank lines.
for _page_soup, extracted in pages.values():
    print(extracted, end="\n\n\n")

# Number of pages collected.
len(pages)