--- imports ---

In [None]:
from functools import partial
from IPython.display import display, HTML

import pandas as pd

--- modular definitions ---

In [None]:
# from main.utils import get_logger
# from main.scraper import (fetch_response, make_soup, extract_blocks, pager)

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

import os
import logging
import shutil


def fetch_response(url, session=None, timeout=10):
    """Fetch ``url`` with a GET request and return the response, or ``None``.

    A retry policy (3 retries with backoff on 429/500/502/503/504 for GET and
    HEAD) is mounted on the session once; the ``_retry_configured`` marker
    attribute prevents re-mounting adapters on a session that is reused.

    :param url: Absolute ``http://`` or ``https://`` URL.
    :param session: Optional ``requests.Session`` to reuse. When omitted, a
                    temporary session is created and closed before returning.
    :param timeout: Timeout in seconds passed to ``session.get``.
    :return: The ``requests.Response`` on success, or ``None`` when the URL is
             invalid or the request fails (failures are logged, not raised).
    """
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        logger.warning(f"Invalid URL: {url}")
        return None

    # Only a session created here is ours to close; a caller-supplied session
    # is left open so it can keep pooling connections across calls.
    owns_session = not session
    if owns_session:
        session = requests.Session()

    if not hasattr(session, "_retry_configured"):
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
            backoff_factor=1,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        session._retry_configured = True

    headers = {
        "User-Agent": "Mozilla/5.0",
        'Accept-Charset': 'utf-8',
    }
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        logger.info(f"fetched: {url} [{response.status_code}]")
        return response
    except requests.exceptions.HTTPError as errh:
        logger.warning(f"HTTP error occurred:\n{errh}")
    except requests.exceptions.RequestException as err:
        logger.warning(f"{type(err).__name__}\n{err}")
    except Exception as e:
        # Best-effort boundary: a failed fetch must not abort the whole crawl.
        logger.warning(f"Unhandled exception:\n{type(e).__name__}: {e}")
    finally:
        # The body has already been read (the request is not streamed), so the
        # response stays usable after its temporary session is closed.
        if owns_session:
            session.close()
    return None





def make_soup(response=None, url=None):
    """Build a BeautifulSoup tree from a response or from a URL.

    :param response: An already fetched response object; ignored when ``url``
                     is a string, because the URL is then fetched instead.
    :param url: Optional URL to download via ``fetch_response``.
    :return: The parsed soup (``lxml`` parser), or ``None`` when there is no
             usable response with status code 200.
    """
    if isinstance(url, str):
        response = fetch_response(url)

    if not response or response.status_code != 200:
        logger.warning(f"Couldn't make soup with: {response}")
        return None

    # Decode the body as UTF-8 regardless of what the server declared.
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, "lxml")



def _parse_field_def(field_def):
    """Normalize a field definition to a ``(selector, attr, default)`` triple."""
    if isinstance(field_def, str):
        # A bare string is taken as the selector; iterating it would otherwise
        # split the selector into single characters.
        return field_def, "text", None

    parts = list(field_def)
    selector = parts[0]
    # A missing or empty attr means "text"; a missing default means None.
    attr = (parts[1] if len(parts) > 1 else None) or "text"
    default = parts[2] if len(parts) > 2 else None
    return selector, attr, default


def _element_value(el, attr):
    """Return the text, outer HTML, inner HTML or attribute ``attr`` of ``el``."""
    if attr == "text":
        return el.get_text(strip=True, separator=" ")
    if attr == "html":
        return str(el)
    if attr == "inner_html":
        return "".join(str(c) for c in el.contents)
    return el.get(attr)


def extract_blocks(soup, block_selector, fields):
    """
    Extract structured data blocks from soup.

    :param soup: BeautifulSoup object
    :param block_selector: CSS selector to identify each block
    :param fields: dict of {field_name: (selector, attr, default)}
                   - attr defaults to "text"; "html" and "inner_html" return
                     markup, any other value is read as an element attribute
                   - default defaults to None
                   - a plain string is accepted as shorthand for (selector,)
    :return: List of dicts (one per matched block). Each field maps to the list
             of non-empty values found in that block, or to the field's default
             when nothing was found. Errors are logged and never raised.
    """
    if not soup:
        logger.error("extract_blocks() called with no soup object.")
        return []

    try:
        blocks = soup.select(block_selector)
    except Exception as e:
        logger.error(f"Invalid block selector '{block_selector}': {e}")
        return []

    if not blocks:
        logger.info(f"No blocks found with selector '{block_selector}'.")
        return []

    # Tolerate a job without field definitions: each block yields an empty dict.
    fields = fields or {}

    extracted_data = []
    for i, block in enumerate(blocks, start=1):
        block_data = {}
        for field_name, field_def in fields.items():
            try:
                selector, attr, field_default = _parse_field_def(field_def)

                try:
                    elements = block.select(selector)
                except Exception as e:
                    logger.error(f"Invalid selector '{selector}' for field '{field_name}': {e}")
                    elements = []

                values = []
                for el in elements:
                    try:
                        value = _element_value(el, attr)
                    except Exception as e:
                        logger.error(f"Error extracting attr '{attr}' for field '{field_name}': {e}")
                        continue
                    # Empty strings and absent attributes are dropped.
                    if value:
                        values.append(value)

                block_data[field_name] = values if values else field_default

            except Exception as e:
                # A malformed field definition only costs that one field.
                logger.error(f"Unexpected error in field '{field_name}' (block {i}): {e}")
                block_data[field_name] = None

        extracted_data.append(block_data)
    return extracted_data






def pager(base, max_pages, next_page_selectors, fetcher, callback, **kwargs):
    """
    Iterate through paginated links starting from `base`.

    :param base: URL of the first page.
    :param max_pages: Maximum number of pages to fetch.
    :param next_page_selectors: Iterable of (block_selector, link_selector)
                                pairs, tried in order; the first ``href`` found
                                is resolved against the current URL.
    :param fetcher: Callable invoked as ``fetcher(url=...)``; a falsy result
                    stops the crawl.
    :param callback: A function that can process a single soup object and return
                     a dictionary of multiple data sets.
    :param kwargs: Extra keyword arguments forwarded to ``callback``.
    :return: Dict mapping each visited URL to what ``callback`` returned for it.
    """
    pages = {}
    page_number = 1
    current_url = base

    while page_number <= max_pages:
        page_soup = fetcher(url=current_url)
        if not page_soup:
            break

        # The callback returns a dict of all the data from the page
        pages[current_url] = callback(page_soup, **kwargs)

        next_url = None
        for block_sel, elem_sel in next_page_selectors:
            try:
                # An explicit None default keeps "no link" distinguishable from a value.
                link_data = extract_blocks(
                    page_soup, block_sel, {elem_sel: (elem_sel, "href", None)}
                )
                # The block selector may match several elements (e.g. every
                # pagination item); use the first one that actually has a link.
                hrefs = next(
                    (
                        found
                        for found in (block.get(elem_sel) for block in link_data)
                        if isinstance(found, list) and found
                    ),
                    None,
                )
                if hrefs:
                    next_url = urljoin(current_url, hrefs[0])
                    break
            except Exception as e:
                logger.warning(
                    f"Failed to find next page link with selectors '{block_sel}' and '{elem_sel}': {e}"
                )

        # Stop on a missing link or on any page already visited (this includes
        # the current one), so a pagination cycle cannot re-fetch pages.
        if not next_url or next_url in pages:
            logger.info("No next page link found or link points to an already visited page. Stopping.")
            break

        current_url = next_url
        page_number += 1

    return pages





def extract_multiple_blocks(soup, extraction_jobs):
    """
    Run several named extraction jobs against one soup object.

    :param soup: BeautifulSoup object
    :param extraction_jobs: Dict mapping a job name to a dict holding the keys
                            'block_selector' and 'fields'.
    :return: Dict mapping each job name to the list returned by
             extract_blocks() for that job; an empty dict when soup is falsy.
    """
    if not soup:
        logger.error("extract_multiple_blocks() called with no soup object.")
        return {}

    # Each job delegates to extract_blocks with its own selector and field map.
    return {
        job_name: extract_blocks(
            soup,
            spec.get("block_selector"),
            spec.get("fields"),
        )
        for job_name, spec in extraction_jobs.items()
    }

    




def page_processor(soup, extraction_jobs):
    """Process one page's soup by running all ``extraction_jobs`` on it.

    Thin adapter around extract_multiple_blocks(), used as the ``callback``
    argument of pager().
    """
    page_data = extract_multiple_blocks(soup, extraction_jobs)
    return page_data






    
_logger = None


def get_logger(name="books_scraper"):
    """Singleton logger to be used across the project.

    The first call deletes the ``logs`` directory and its contents and
    recreates it, so every program run starts with a clean
    ``logs/scraping.log``. Later calls return the same logger; ``name`` is only
    used by the call that creates it.

    :param name: Name passed to ``logging.getLogger`` on first use.
    :return: Logger at INFO level with one file handler and one console handler.
    """
    global _logger
    if _logger is not None:
        return _logger

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # logging.getLogger() returns the same object even after this module (or
    # notebook cell) is re-executed and ``_logger`` is reset. Handlers left from
    # the previous run still point at the file about to be deleted, so close
    # and drop them instead of silently keeping them.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    if os.path.exists("logs"):
        shutil.rmtree("logs")

    os.makedirs("logs", exist_ok=True)

    formatter = logging.Formatter(
        "[%(levelname)s] %(filename)s:%(lineno)d %(funcName)s() %(message)s %(asctime)s"
    )

    # Explicit UTF-8 so non-ASCII text in log messages does not depend on the
    # platform's default encoding.
    file_handler = logging.FileHandler("logs/scraping.log", mode="w", encoding="utf-8")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    _logger = logger
    return logger

# Module-level logger; the scraper helpers in this cell log through this name.
logger = get_logger()

--- extraction fields ---

In [None]:
# Extraction jobs for the laptops listing. Each job names the CSS selector of a
# product card and, per output field, a (selector, attribute) pair in the form
# extract_blocks() accepts; "text" means the element's text content.
allinone_extractions = {
    "laptops": dict(
        block_selector=".product-wrapper.card-body",
        fields={
            # product image
            "img_alt": ("img", "alt"),
            "img_src": ("img", "src"),
            # caption: price, currency, title and description
            "price": (".caption span", "text"),
            "currency": (".caption meta", "content"),
            "title": (".caption h4 a", "title"),
            "description": (".caption p.description", "text"),
            # ratings section
            "review-count": (".ratings p.review-count span", "text"),
            "rating": (".ratings p", "data-rating"),
        },
    ),
}




# Extraction jobs for the book listing pages: one job for the book cards and
# one for the category navigation. Each field is a (selector, attribute) pair
# in the form extract_blocks() accepts; "text" means the element's text content.
all_book_extractions = {
    "books": dict(
        block_selector=".product_pod",
        fields={
            # cover image and its link
            "img_href": (".image_container a", "href"),
            "img_alt": (".image_container a img", "alt"),
            "img_src": (".image_container a img", "src"),
            # the rating is read from the element's class attribute
            "rating": (".star-rating", "class"),
            # title link
            "book_href": ("h3 a", "href"),
            "book_text": ("h3 a", "text"),
            "price": (".price_color", "text"),
            "availability": (".availability", "text"),
        },
    ),
    "categories": dict(
        block_selector=".nav-list ul",
        fields={
            "category_href": ("a", "href"),
            "category_text": ("a", "text"),
        },
    ),
}

# Lower-case number words mapped to their integer value (1 through 5); used to
# turn the rating class token into a number.
allowed_map = dict(zip(("one", "two", "three", "four", "five"), range(1, 6)))

# lambda (selector, attr): (_.get(attr) if (_:= book.select_one(selector)) else "") # to avoid NoneType has no attr x

--- books to scrape - site pagination ---

In [None]:
# Crawl up to three listing pages of the book site, starting at its root.
url = "https://books.toscrape.com"

# (block selector, link selector) pairs pager() tries to find the "next" link.
next_page_selectors = [(".pager .next", "a")]

# Result: {page URL: {"books": [...], "categories": [...]}}
scraped_data = pager(
    url,
    3,
    next_page_selectors,
    make_soup,
    page_processor,
    extraction_jobs=all_book_extractions,
)

--- categories scraped data - data framing ---

In [None]:
# Categories are read from the result stored under the base URL.
first_page = scraped_data['https://books.toscrape.com']
categories_data = first_page['categories']

# Only the first matched block is framed; its fields become the columns.
categories_df = pd.DataFrame(categories_data[0])
categories_df.head()

--- join all scraped pages book records ---

In [None]:
# Flatten the per-page 'books' lists into one master list of records; pages
# without a 'books' key, or with an empty list, contribute nothing.
all_data = [
    record
    for page_result in scraped_data.values()
    if 'books' in page_result and page_result['books']
    for record in page_result['books']
]

In [None]:
def transformer(data):
    """Turn a list of extracted records into a DataFrame.

    Every value that is a non-empty list is replaced by its first item; any
    other value (empty list, None, non-list) becomes None.

    :param data: List of dicts as produced by the extraction step.
    :return: A DataFrame with one row per record, or None when ``data`` is empty.
    """
    if not data:
        return None

    def first_item(val):
        return val[0] if isinstance(val, list) and val else None

    rows = [
        {key: first_item(val) for key, val in record.items()}
        for record in data
    ]
    return pd.DataFrame(rows)

# Create the final DataFrame from the combined list of all books
df = transformer(all_data)
if df is None:
    # transformer() returns None for an empty record list; fail with a clear
    # message instead of a TypeError on the first column access.
    raise ValueError("No book records were scraped; nothing to build a DataFrame from.")


def _stars_to_int(class_tokens):
    """Return the star count encoded in a list of class tokens, or None.

    The first token found in ``allowed_map`` (case-insensitive) wins. A missing
    rating (None or any non-list value) yields None instead of raising.
    """
    if not isinstance(class_tokens, (list, tuple)):
        return None
    for token in class_tokens:
        stars = allowed_map.get(token.lower())
        if stars is not None:
            return stars
    return None


df["rating"] = df["rating"].apply(_stars_to_int)

# The leading non-digit character of the price text is kept as the currency;
# expand=False returns a Series so a single column is assigned.
df["currency"] = df["price"].str.extract(r"(^\D)", expand=False)

# Strip everything except digits and the decimal point; values that still
# cannot be parsed become NaN instead of aborting the cell.
df["price"] = pd.to_numeric(
    df["price"].str.replace(r"[^\d\.]", "", regex=True),
    errors="coerce",
).astype(float)

df.head()

--- aggregation ---

--- all in one - pagination ---

In [None]:
# Crawl up to three pages of the laptops listing on the static test site.
url = "https://webscraper.io/test-sites/e-commerce/static/computers/laptops"

# (block selector, link selector) pairs pager() tries to find the "next" link.
next_page_selectors = [(".page-item", "a.next")]

# Result: {page URL: {"laptops": [...]}}
scraped_data = pager(
    url,
    3,
    next_page_selectors,
    make_soup,
    page_processor,
    extraction_jobs=allinone_extractions,
)

In [None]:
# Flatten the per-page 'laptops' lists into one list of records; pages without
# a 'laptops' key, or with an empty list, contribute nothing.
all_data = [
    record
    for page_result in scraped_data.values()
    if 'laptops' in page_result and page_result['laptops']
    for record in page_result['laptops']
]

df = transformer(all_data)
if df is None:
    # transformer() returns None for an empty record list; fail with a clear
    # message instead of a TypeError on the first column access.
    raise ValueError("No laptop records were scraped; nothing to build a DataFrame from.")

# Strip everything except digits and the decimal point, then convert once.
# A single pass is enough: after conversion the column is float and there is
# nothing left for a regex to remove. Unparseable values become NaN.
df["price"] = pd.to_numeric(
    df["price"].str.replace(r"[^\d\.]", "", regex=True),
    errors="coerce",
).astype(float)
df.head()