TODO: find a way to avoid requesting the next page only to get the URL of the page after it, when all the page links (next link + page 3 link) are already available on the first page.

- imports

In [1]:
from functools import partial
import pandas as pd
from pathlib import Path

# from main.utils import get_logger
# from main.scraper import (fetch_response, make_soup, extract_blocks, pager)

- modules ship

In [2]:
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib3.util.retry import Retry
# import pandas as pd



def fetch_response(url, session=None, timeout=10):
    """
    GET ``url`` and return the response, or ``None`` if anything goes wrong.

    :param url: Absolute URL; must be a string starting with ``http://`` or
                ``https://`` (the only schemes the retry adapter is mounted for).
    :param session: Optional ``requests.Session`` to reuse. A retry adapter is
                    mounted on it the first time it is seen. When omitted, a
                    temporary session is created and closed before returning.
    :param timeout: Timeout in seconds handed to ``session.get``.
    :return: The response with its encoding forced to UTF-8, or ``None`` when
             the URL is rejected or the request fails. Request failures are
             logged as warnings rather than raised.
    """
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        logger.warning(f"Invalid URL: {url}")
        return None

    # Remember whether the session is ours so we can release its connections.
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    # Mount the retry adapter only once per session; the marker attribute
    # prevents re-mounting when the caller reuses the same session.
    if not hasattr(session, "_retry_configured"):
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
            backoff_factor=1,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        session._retry_configured = True

    headers = {
        "User-Agent": "Mozilla/5.0",
        'Accept-Charset': 'utf-8',
    }
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = 'utf-8'
        logger.info(f"fetched: {url} [{response.status_code}]")
        return response
    except requests.exceptions.HTTPError as errh:
        logger.warning(f"HTTP error occurred:\n{errh}")
    except requests.exceptions.RequestException as err:
        logger.warning(f"{type(err).__name__}\n{err}")
    except Exception as e:
        logger.warning(f"Unhandled exception:\n{type(e).__name__}: {e}")
    finally:
        # The body has already been read (no streaming), so closing a
        # session we created ourselves does not invalidate the response.
        if owns_session:
            session.close()

    return None





def make_soup(response=None, url=None):
    """
    Parse a page into a BeautifulSoup tree using the ``lxml`` parser.

    :param response: An already fetched response; ignored when ``url`` is a string.
    :param url: When a string, the page is fetched with ``fetch_response`` first.
    :return: The parsed soup, or ``None`` (after logging a warning) when there
             is no usable response or its status code is not 200.
    """
    if isinstance(url, str):
        response = fetch_response(url)

    if response and response.status_code == 200:
        return BeautifulSoup(response.text, "lxml")

    logger.warning(f"Couldn't make soup with: {response}")
    return None






def safe_soup_select(soup, selector):
    """
    Run ``soup.select(selector)`` without letting errors escape.

    :param soup: Any object exposing a ``select`` method.
    :param selector: CSS selector passed to ``select``.
    :return: Whatever ``select`` returns, or ``None`` if it raised; the error
             is logged in that case.
    """
    matches = None
    try:
        matches = soup.select(selector)
    except AttributeError as err:
        # Raised when ``soup`` has no ``select`` attribute (e.g. it is None).
        logger.error(f"{selector} {soup} returning {None}: {err}")
    except Exception as e:
        logger.error(f"Invalid CSS selector '{selector}', returning None: {e}")
    return matches

def _parse_field_def(field_def):
    """Normalise a field definition to a ``(selector, attr, default)`` triple."""
    if isinstance(field_def, str):
        # A bare string is treated as the selector; unpacking it directly
        # would split it into single characters.
        field_def = (field_def,)
    selector, *rest = field_def
    attr = (rest[0] if len(rest) > 0 else None) or "text"
    field_default = rest[1] if len(rest) > 1 else None
    return selector, attr, field_default


def _element_value(el, attr):
    """Read ``attr`` from one element: text, outer HTML, inner HTML or an attribute."""
    if attr == "text":
        return el.get_text(strip=True, separator=" ")
    if attr == "html":
        return str(el)
    if attr == "inner_html":
        return "".join(str(c) for c in el.contents)
    return el.get(attr)


def extract_blocks(soup, block_selector, fields):
    """
    Extract structured data blocks from soup.

    :param soup: BeautifulSoup object
    :param block_selector: CSS selector to identify each block
    :param fields: dict of {field_name: (selector, attr, default)}
                   - attr defaults to "text"; "html" and "inner_html" return
                     markup, any other value is read as an element attribute
                   - default defaults to None
                   - a bare selector string is accepted as shorthand for
                     ``(selector,)``
                   - ``None`` is treated as an empty mapping
    :return: List of dicts (one per matched block). Each field holds the list
             of non-empty values found, the field default when nothing was
             found, or None when the field definition itself was unusable.
    """
    blocks = safe_soup_select(soup, block_selector)
    if not blocks:
        logger.info(f"No blocks found with selector '{block_selector}'.")
        return []

    fields = fields or {}

    extracted_data = []
    for i, block in enumerate(blocks, start=1):
        block_data = {}
        for field_name, field_def in fields.items():
            try:
                selector, attr, field_default = _parse_field_def(field_def)

                elements = safe_soup_select(block, selector)
                if not elements:
                    logger.info(f"Invalid selector '{selector}' for field '{field_name}' or No such elements found block '{block}'")
                    elements = []

                values = []
                for el in elements:
                    try:
                        value = _element_value(el, attr)
                        # Empty strings / missing attributes are dropped.
                        if value:
                            values.append(value)
                    except Exception as e:
                        logger.error(f"Error extracting attr '{attr}' for field '{field_name}': {e}")

                block_data[field_name] = values if values else field_default

            except Exception as e:
                logger.error(f"Unexpected error in field '{field_name}' (block {i}): {e}")
                block_data[field_name] = None

        extracted_data.append(block_data)
    return extracted_data





def page_processor(soup, extraction_jobs):
    """Run every extraction job against one page's soup and return the result."""
    page_data = extract_multiple_blocks(soup, extraction_jobs)
    return page_data

    


def _find_next_url(page_soup, current_url, next_page_selectors):
    """Return the absolute URL of the first next-page link found, or None."""
    for block_sel, elem_sel in next_page_selectors:
        try:
            link_data = extract_blocks(page_soup, block_sel, {elem_sel: (elem_sel, "href")})
            # The block selector may match several blocks (e.g. every
            # pagination <li>); the link can live in any of them, so scan
            # them all instead of looking at the first one only.
            for block_links in link_data:
                hrefs = block_links.get(elem_sel)
                if hrefs:
                    return urljoin(current_url, hrefs[0])
        except Exception as e:
            logger.warning(f"Failed to find next page link with selectors '{block_sel}' and '{elem_sel}': {e}")
    return None


def pager(base, next_page_selectors, max_pages=3, fetcher=make_soup, callback=page_processor, **kwargs):
    """
    Iterate through paginated links starting from `base`.

    :param base: URL of the first page.
    :param next_page_selectors: Iterable of ``(block_selector, element_selector)``
                                pairs, tried in order; the first ``href`` found
                                is resolved against the current URL.
    :param max_pages: Maximum number of pages to fetch.
    :param fetcher: Callable invoked as ``fetcher(url=...)``; a falsy result
                    stops the iteration.
    :param callback: A function that can process a single soup object and return
                     a dictionary of multiple data sets.
    :param kwargs: Extra keyword arguments forwarded to ``callback``.
    :return: Dict mapping each fetched page URL to the callback's result.
    """
    pages = {}
    counter = 1
    current_url = base

    while counter <= max_pages:
        page_soup = fetcher(url=current_url)
        if not page_soup:
            break

        # The callback returns a dict of all the data from the page
        pages[current_url] = callback(page_soup, **kwargs)

        next_url = _find_next_url(page_soup, current_url, next_page_selectors)
        if not next_url:
            logger.info("No next page link found. Stopping.")
            break
        if next_url in pages:
            # Covers both "link points at the current page" and longer cycles.
            logger.info(f"Next page link '{next_url}' was already visited. Stopping.")
            break

        current_url = next_url
        counter += 1

    return pages





def extract_multiple_blocks(soup, extraction_jobs):
    """
    Run several extraction jobs against one soup object.

    :param soup: BeautifulSoup object
    :param extraction_jobs: Dict mapping a job name to a config dict holding
                            'block_selector' and 'fields'.
    :return: Dict keyed by job name (e.g. 'books', 'categories'); each value is
             the list of record dicts produced by ``extract_blocks`` for that
             job. An empty dict is returned (and an error logged) when ``soup``
             is falsy.
    """
    if soup:
        # One extract_blocks call per job, in the order the jobs are declared.
        return {
            job_name: extract_blocks(
                soup,
                job_config.get("block_selector"),
                job_config.get("fields"),
            )
            for job_name, job_config in extraction_jobs.items()
        }

    logger.error("extract_multiple_blocks() called with no soup object.")
    return {}

    




def transformer(data):
    """
    Flatten scraped records into a DataFrame.

    Every value that is a non-empty list is replaced by its first element;
    any other value (empty list, None, plain scalar) becomes None.

    :param data: List of record dicts.
    :return: ``pd.DataFrame`` with one row per record, or None when ``data``
             is empty or None.
    """
    if not data:
        return None

    flattened = []
    for record in data:
        row = {}
        for key, val in record.items():
            if isinstance(val, list) and val:
                row[key] = val[0]
            else:
                row[key] = None
        flattened.append(row)
    return pd.DataFrame(flattened)



def join_page_records(scraped_data, job_name):
    """
    Concatenate one job's records across every scraped page.

    :param scraped_data: Dict mapping page URL to that page's job results.
    :param job_name: Key of the job to collect.
    :return: Flat list of records, in page order; pages that lack the job or
             hold an empty/None value for it are skipped.
    """
    return [
        record
        for page_results in scraped_data.values()
        if job_name in page_results and page_results[job_name]
        for record in page_results[job_name]
    ]




# --------------------------------------------------------

import os
import logging
import shutil

_logger = None
def get_logger(name="books_scraper"):
    """Singleton logger to be used across the project.

    On the first configuration in a process the ``logs`` directory and its
    contents are deleted and recreated, so each program run starts with a
    clean ``logs/scraping.log``. Handlers (file + console, INFO level) are
    attached only once.

    If the named logger already has handlers (for example because the
    notebook cell defining this function was re-run, which resets the module
    level cache), the directory is left alone: wiping it would delete the file
    the existing handler is still writing to, and it would not be recreated.

    :param name: Name passed to ``logging.getLogger``; only used on the first
                 call, later calls return the cached logger.
    :return: The configured ``logging.Logger``.
    """
    global _logger
    if _logger:
        return _logger

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        # Clean start: only safe while no handler holds the log file open.
        if os.path.exists("logs"):
            shutil.rmtree("logs")

        os.makedirs("logs", exist_ok=True)

        formatter = logging.Formatter(
            "[%(levelname)s] %(filename)s:%(lineno)d %(funcName)s() %(message)s %(asctime)s"
        )

        file_handler = logging.FileHandler("logs/scraping.log", mode="w")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        console = logging.StreamHandler()
        console.setFormatter(formatter)
        logger.addHandler(console)

    _logger = logger
    return logger

logger = get_logger()

- extraction fields / miscellaneous(extras)

In [3]:
# Extraction jobs for the product listing pages.
# Each field maps an output column to a (selector, attr) pair that is
# evaluated inside every block matched by "block_selector".
_product_card_fields = {
    "img_alt": ("img", "alt"),
    "img_src": ("img", "src"),
    "price": (".caption span", "text"),
    "currency": (".caption meta", "content"),
    "title": (".caption h4 a", "title"),
    "description": (".caption p.description", "text"),
    "review-count": (".ratings p.review-count span", "text"),
    "rating": (".ratings p", "data-rating"),
}

allinone_extractions = {
    "laptops": {
        "block_selector": ".product-wrapper.card-body",
        "fields": _product_card_fields,
    },
}




# Extraction jobs for the book listing pages: one job for the book cards
# and one for the category links in the side navigation list.
_book_fields = {
    "img_href": (".image_container a", "href"),
    "img_alt": (".image_container a img", "alt"),
    "img_src": (".image_container a img", "src"),
    "rating": (".star-rating", "class"),
    "book_href": ("h3 a", "href"),
    "book_text": ("h3 a", "text"),
    "price": (".price_color", "text"),
    "availability": (".availability", "text"),
}

_category_fields = {
    "category_href": ("a", "href"),
    "category_text": ("a", "text"),
}

all_book_extractions = {
    "books": {
        "block_selector": ".product_pod",
        "fields": _book_fields,
    },
    "categories": {
        "block_selector": ".nav-list ul",
        "fields": _category_fields,
    },
}

# Lower-case number words mapped to their numeric rating (1-5).
rating_map = {
    word: score
    for score, word in enumerate(("one", "two", "three", "four", "five"), start=1)
}


# Directory (relative to the working directory) that receives the CSV exports;
# created together with any missing parents.
output_dir = Path("src", "main", "data")
output_dir.mkdir(exist_ok=True, parents=True)

# lambda (selector, attr): (_.get(attr) if (_:= book.select_one(selector)) else "") # to avoid NoneType has no attr x

---

#### all-in-one / books-to-scrape

- requests
- httpx

- bs4.BeautifulSoup  .select_one("css")
- selectolax.parser HTMLParser   .css_first("css")

- pagination - soup fetch
- common pager component types
- - with next page button
  - without next button
  - - url-based (https://example.com/products?page=)
  - infinite scroll
  - load more button 

In [4]:
# Crawl the laptops listing. No max_pages is passed, so pager's default applies.
# Each ".page-item" block is searched for a ".next" element whose href is
# taken as the next page link.
scraped_data = pager(
    base="https://webscraper.io/test-sites/e-commerce/static/computers/laptops",
    next_page_selectors=[(".page-item", ".next")],
    extraction_jobs=allinone_extractions
)

[INFO] 3652031430.py:36 fetch_response() fetched: https://webscraper.io/test-sites/e-commerce/static/computers/laptops [200] 2025-08-18 15:50:37,571
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li aria-disabled="true" aria-label="« Previous" class="page-item disabled">
<span aria-hidden="true" class="page-link">‹</span>
</li>' 2025-08-18 15:50:37,822
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li aria-current="page" class="page-item active"><span class="page-link">1</span></li>' 2025-08-18 15:50:37,832
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/computers/laptops?page=2">2</a></li>' 2025-08-18 15:50:37,842
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next'

No next page link found or link is the same as current page. Stopping.


In [5]:
# Crawl the phones listing with the same job definition as the laptops run,
# so the phone records are stored under the "laptops" job key.
scraped_data_1_phones = pager(
    base="https://webscraper.io/test-sites/e-commerce/static/phones/touch",
    next_page_selectors=[(".page-item", ".next")],
    extraction_jobs=allinone_extractions
)

[INFO] 3652031430.py:36 fetch_response() fetched: https://webscraper.io/test-sites/e-commerce/static/phones/touch [200] 2025-08-18 15:50:41,433
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li aria-disabled="true" aria-label="« Previous" class="page-item disabled">
<span aria-hidden="true" class="page-link">‹</span>
</li>' 2025-08-18 15:50:41,656
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li aria-current="page" class="page-item active"><span class="page-link">1</span></li>' 2025-08-18 15:50:41,662
[INFO] 3652031430.py:102 extract_blocks() Invalid selector '.next' for field '.next' or No such elements found block '<li class="page-item"><a class="page-link" href="/test-sites/e-commerce/static/phones/touch?page=2">2</a></li>' 2025-08-18 15:50:41,676


No next page link found or link is the same as current page. Stopping.


---

In [6]:
# Crawl the book listing, following the <a> inside ".pager .next".
# Each page yields both the "books" and the "categories" job results.
scraped_data_2 = pager(
    base="https://books.toscrape.com",
    next_page_selectors=[(".pager .next", "a")],
    extraction_jobs=all_book_extractions
)

[INFO] 3652031430.py:36 fetch_response() fetched: https://books.toscrape.com [200] 2025-08-18 15:50:52,844
HTTPSConnectionPool(host='books.toscrape.com', port=443): Max retries exceeded with url: /catalogue/page-2.html (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7946c96490>, 'Connection to books.toscrape.com timed out. (connect timeout=10)')) 2025-08-18 15:51:54,014


- preprocess / organise data
- - join page data
  - make data frame

In [16]:
scraped_data_1_phones

{'https://webscraper.io/test-sites/e-commerce/static/phones/touch': {'laptops': [{'img_alt': ['item'],
    'img_src': ['/images/test-sites/e-commerce/items/cart2.png'],
    'price': ['$24.99'],
    'currency': ['USD'],
    'title': ['Nokia 123'],
    'description': ['7 day battery'],
    'review-count': ['11'],
    'rating': ['3']},
   {'img_alt': ['item'],
    'img_src': ['/images/test-sites/e-commerce/items/cart2.png'],
    'price': ['$57.99'],
    'currency': ['USD'],
    'title': ['LG Optimus'],
    'description': ['3.2" screen'],
    'review-count': ['11'],
    'rating': ['3']},
   {'img_alt': ['item'],
    'img_src': ['/images/test-sites/e-commerce/items/cart2.png'],
    'price': ['$93.99'],
    'currency': ['USD'],
    'title': ['Samsung Galaxy'],
    'description': ['5 mpx. Android 5.0'],
    'review-count': ['3'],
    'rating': ['3']},
   {'img_alt': ['item'],
    'img_src': ['/images/test-sites/e-commerce/items/cart2.png'],
    'price': ['$109.99'],
    'currency': ['USD'],
 

In [7]:
# Use a named intermediate instead of `_`: IPython reserves `_` for the
# previous cell output, and assigning to it stops that from updating.
laptop_records = join_page_records(scraped_data, 'laptops')
df = transformer(laptop_records)

In [18]:
# Phone records live under the "laptops" job key because the phones crawl
# reused the laptops job definition. A named intermediate replaces `_`,
# which IPython reserves for the previous cell output.
phone_records = join_page_records(scraped_data_1_phones, 'laptops')
df_phones = transformer(phone_records)

---

In [8]:
# Named intermediate instead of `_`, which IPython reserves for the
# previous cell output.
book_records = join_page_records(scraped_data_2, 'books')
df_2 = transformer(book_records)

In [9]:
# Categories are read from the first page only. The job matches a single
# block whose fields are parallel lists (hrefs / texts), so the first record
# converts directly to a two-column frame. A named intermediate replaces `_`,
# which IPython reserves for the previous cell output.
category_blocks = scraped_data_2['https://books.toscrape.com']['categories']
categories_df = pd.DataFrame(category_blocks[0])

- transform
- preview

In [19]:
df_phones.head()

Unnamed: 0,img_alt,img_src,price,currency,title,description,review-count,rating
0,item,/images/test-sites/e-commerce/items/cart2.png,$24.99,USD,Nokia 123,7 day battery,11,3
1,item,/images/test-sites/e-commerce/items/cart2.png,$57.99,USD,LG Optimus,"3.2"" screen",11,3
2,item,/images/test-sites/e-commerce/items/cart2.png,$93.99,USD,Samsung Galaxy,5 mpx. Android 5.0,3,3
3,item,/images/test-sites/e-commerce/items/cart2.png,$109.99,USD,Nokia X,"Andoid, Jolla dualboot",4,4
4,item,/images/test-sites/e-commerce/items/cart2.png,$118.99,USD,Sony Xperia,"GPS, waterproof",6,1


In [10]:
# Drop every character that is not a digit or a dot, then cast to float.
laptop_price_text = df["price"].replace(r"[^\d\.]", "", regex=True)
df["price"] = laptop_price_text.astype(float)
df.head()

Unnamed: 0,img_alt,img_src,price,currency,title,description,review-count,rating
0,item,/images/test-sites/e-commerce/items/cart2.png,416.99,USD,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",2,2
1,item,/images/test-sites/e-commerce/items/cart2.png,306.99,USD,Aspire E1-510,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",2,3
2,item,/images/test-sites/e-commerce/items/cart2.png,1178.99,USD,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",2,1
3,item,/images/test-sites/e-commerce/items/cart2.png,739.99,USD,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",8,4
4,item,/images/test-sites/e-commerce/items/cart2.png,1311.99,USD,ThinkPad X240,"12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro...",12,3


---

In [11]:
def _rating_from_classes(tags):
    """
    Decode a CSS class list into a numeric rating via ``rating_map``.

    :param tags: List of class names; the first one whose lower-cased form is
                 a key of ``rating_map`` decides the rating.
    :return: The mapped rating, None when no class matches, or ``tags``
             unchanged when it is not a list/tuple (missing rating, or a value
             that was already converted).
    """
    if not isinstance(tags, (list, tuple)):
        return tags
    for tag in tags:
        key = str(tag).lower()
        if key in rating_map:
            return rating_map[key]
    return None


df_2["rating"] = df_2["rating"].apply(_rating_from_classes)

# The currency symbol is the leading non-digit character of the price text;
# it must be captured before the price column is reduced to digits and dots.
df_2["currency"] = df_2["price"].str.extract(r"(^\D)")
df_2["price"] = df_2["price"].replace(r"[^\d\.]", "", regex=True).astype(float)

df_2.head()

Unnamed: 0,img_href,img_alt,img_src,rating,book_href,book_text,price,availability,currency
0,catalogue/a-light-in-the-attic_1000/index.html,A Light in the Attic,media/cache/2c/da/2cdad67c44b002e7ead0cc35693c...,3,catalogue/a-light-in-the-attic_1000/index.html,A Light in the ...,51.77,In stock,£
1,catalogue/tipping-the-velvet_999/index.html,Tipping the Velvet,media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f...,1,catalogue/tipping-the-velvet_999/index.html,Tipping the Velvet,53.74,In stock,£
2,catalogue/soumission_998/index.html,Soumission,media/cache/3e/ef/3eef99c9d9adef34639f51066202...,1,catalogue/soumission_998/index.html,Soumission,50.1,In stock,£
3,catalogue/sharp-objects_997/index.html,Sharp Objects,media/cache/32/51/3251cf3a3412f53f339e42cac213...,4,catalogue/sharp-objects_997/index.html,Sharp Objects,47.82,In stock,£
4,catalogue/sapiens-a-brief-history-of-humankind...,Sapiens: A Brief History of Humankind,media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c...,5,catalogue/sapiens-a-brief-history-of-humankind...,Sapiens: A Brief History ...,54.23,In stock,£


In [12]:
categories_df.head()

Unnamed: 0,category_href,category_text
0,catalogue/category/books/travel_2/index.html,Travel
1,catalogue/category/books/mystery_3/index.html,Mystery
2,catalogue/category/books/historical-fiction_4/...,Historical Fiction
3,catalogue/category/books/sequential-art_5/inde...,Sequential Art
4,catalogue/category/books/classics_6/index.html,Classics


- save

In [21]:
# Write the phones table to output_dir as UTF-8 CSV, without the index column.
df_phones.to_csv(output_dir/"phones_allinone.csv", index=False, encoding="utf-8")

In [13]:
# Write the laptops table to output_dir as UTF-8 CSV, without the index column.
df.to_csv(output_dir/"laptops_allinone.csv", index=False, encoding="utf-8")

---

In [14]:
# Write the books table to output_dir as UTF-8 CSV, without the index column.
df_2.to_csv(output_dir/"books_bookstoscrape.csv", index=False, encoding="utf-8")

In [15]:
# Write the categories table as UTF-8 CSV without the index column.
# NOTE: this export uses ';' as the separator; the other exports use the default ','.
categories_df.to_csv(output_dir/"categories_bookstoscrape.csv", sep=";", index=False, encoding="utf-8")