
- imports

In [None]:
from functools import partial
import pandas as pd
from pathlib import Path

# from main.utils import get_logger
# from main.scraper import (fetch_response, make_soup, extract_blocks, pager)


- modules ship

In [None]:
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib3.util.retry import Retry
# import pandas as pd



def fetch_response(url, session=None, timeout=10):
    """
    Fetch `url` with a GET request and return the response, or None on failure.

    :param url: Absolute URL; must be a string starting with "http".
    :param session: Optional requests.Session to reuse across calls. When it is
                    omitted, a temporary session is created for this call and
                    closed before returning.
    :param timeout: Timeout in seconds passed to `session.get`.
    :return: The response (encoding forced to UTF-8) on success, otherwise
             None. Failures are logged as warnings and never raised.
    """
    if not isinstance(url, str) or not url.startswith("http"):
        logger.warning(f"Invalid URL: {url}")
        return None

    # Remember whether the session is ours so that we only close what we opened.
    owns_session = not session
    if owns_session:
        session = requests.Session()

    # Mount the retrying adapter only once per session; the marker attribute
    # lets a caller-supplied session be reused without re-mounting every call.
    if not hasattr(session, "_retry_configured"):
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "HEAD"],
            backoff_factor=1,
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        session._retry_configured = True

    headers = {
        "User-Agent": "Mozilla/5.0",
        'Accept-Charset': 'utf-8',
    }
    try:
        response = session.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        response.encoding = 'utf-8'
        logger.info(f"fetched: {url} [{response.status_code}]")
        return response
    except requests.exceptions.HTTPError as errh:
        logger.warning(f"HTTP error occurred:\n{errh}")
    except requests.exceptions.RequestException as err:
        logger.warning(f"{type(err).__name__}\n{err}")
    except Exception as e:
        logger.warning(f"Unhandled exception:\n{type(e).__name__}: {e}")
    finally:
        # The request is not streamed, so the body has already been read and
        # the response stays usable after the temporary session is closed.
        if owns_session:
            session.close()

    return None





def make_soup(response=None, url=None):
    """
    Build a BeautifulSoup tree from a response or from a URL.

    :param response: An already fetched response object (optional).
    :param url: When given as a string, it is fetched with `fetch_response`
                and the result replaces any `response` argument.
    :return: BeautifulSoup parsed with the "lxml" parser, or None when there is
             no usable response with status code 200.
    """
    if isinstance(url, str):
        response = fetch_response(url)

    if response and response.status_code == 200:
        return BeautifulSoup(response.text, "lxml")

    logger.warning(f"Couldn't make soup with: {response}")
    return None






def safe_soup_select(soup, selector):
    """
    Run `soup.select(selector)` without letting errors propagate.

    :param soup: Object exposing a `select` method (e.g. a soup or a tag).
    :param selector: CSS selector string.
    :return: Whatever `soup.select` returns, or None when the call fails
             (the failure is logged as an error).
    """
    try:
        matches = soup.select(selector)
    except AttributeError as err:
        # Typically `soup` is None or otherwise has no `select` method.
        logger.error(f"{selector} {soup} returning {None}: {err}")
        return None
    except Exception as e:
        logger.error(f"Invalid CSS selector '{selector}', returning None: {e}")
        return None
    else:
        return matches


        

def extract_blocks(soup, block_selector, fields):
    """
    Pull one record per matching block out of `soup`.

    :param soup: BeautifulSoup object (or tag) to search in
    :param block_selector: CSS selector matching each repeating block
    :param fields: dict of {field_name: (selector, attr, default)}
                   - attr may be "text", "html", "inner_html" or an attribute
                     name; it falls back to "text" when missing or empty
                   - default is used when nothing was extracted (None if omitted)
    :return: List of dicts, one per block; each value is the list of non-empty
             extracted values, the field default, or None after an
             unexpected error
    """
    # Special attr names and how to read them from an element; any other attr
    # is looked up as a plain HTML attribute.
    readers = {
        "text": lambda node: node.get_text(strip=True, separator=" "),
        "html": str,
        "inner_html": lambda node: "".join(str(c) for c in node.contents),
    }

    blocks = safe_soup_select(soup, block_selector)
    if not blocks:
        logger.info(f"No blocks found with selector '{block_selector}'.")
        return []

    records = []
    for i, block in enumerate(blocks, start=1):
        record = {}
        for field_name, field_def in fields.items():
            try:
                selector, *rest = field_def
                attr = (rest[0] if rest else "text") or "text"
                field_default = rest[1] if len(rest) > 1 else None

                elements = safe_soup_select(block, selector)
                if not elements:
                    logger.info(f"Invalid selector '{selector}' for field '{field_name}' or No such elements found block '{block}'")
                    elements = []

                values = []
                for el in elements:
                    try:
                        reader = readers.get(attr)
                        value = reader(el) if reader is not None else el.get(attr)
                        # Empty strings / missing attributes are not recorded.
                        if value:
                            values.append(value)
                    except Exception as e:
                        logger.error(f"Error extracting attr '{attr}' for field '{field_name}': {e}")

                record[field_name] = values or field_default

            except Exception as e:
                logger.error(f"Unexpected error in field '{field_name}' (block {i}): {e}")
                record[field_name] = None

        records.append(record)
    return records






def extract_multiple_blocks(soup, extraction_jobs):
    """
    Run several `extract_blocks` jobs against the same soup object.

    :param soup: BeautifulSoup object
    :param extraction_jobs: dict mapping a job name to a config dict with the
                            keys 'block_selector' and 'fields'
    :return: dict mapping each job name to the list of records produced by
             `extract_blocks`; an empty dict when `soup` is falsy
    """
    if not soup:
        logger.error("extract_multiple_blocks() called with no soup object.")
        return {}

    # One extract_blocks call per job, keyed by the job's name.
    return {
        job_name: extract_blocks(
            soup,
            job_config.get("block_selector"),
            job_config.get("fields"),
        )
        for job_name, job_config in extraction_jobs.items()
    }


    


def _next_page_href(extracted_data):
    """Return the first 'next_href' of the "links" job in a page result, or None."""
    if not isinstance(extracted_data, dict):
        return None

    link_blocks = extracted_data.get("links")
    if not link_blocks:
        # No "links" job configured, or the page has no pagination block
        # (for example the last page of a listing).
        return None

    first_block = link_blocks[0]
    if not isinstance(first_block, dict):
        return None

    next_hrefs = first_block.get("next_href")
    if not next_hrefs:
        return None
    # extract_blocks yields a list of values; also accept a bare string.
    return next_hrefs if isinstance(next_hrefs, str) else next_hrefs[0]


def pager(base, max_pages=3, fetcher=make_soup, callback=extract_multiple_blocks, **kwargs):
    """
    Iterate through paginated links starting from `base`.

    Each page is fetched, handed to `callback`, and the result is stored under
    the page URL. The next page is taken from the first block of the "links"
    job, field "next_href", resolved against the current URL.

    :param base: URL of the first page.
    :param max_pages: Maximum number of pages to fetch.
    :param fetcher: Callable invoked as `fetcher(url=...)`; a falsy result
                    stops the iteration.
    :param callback: A function that can process a single soup object and return
                     a dictionary of multiple data sets.
    :param kwargs: Extra keyword arguments forwarded to `callback`.
    :return: dict mapping each visited URL to its callback result. Iteration
             ends quietly when a page has no "links" data or no next link.
    """
    pages = {}
    counter = 1
    current_url = base

    while counter <= max_pages:
        page_soup = fetcher(url=current_url)
        if not page_soup:
            break

        extracted_data = callback(page_soup, **kwargs)
        pages[current_url] = extracted_data

        next_href = _next_page_href(extracted_data)
        if not next_href:
            break

        current_url = urljoin(current_url, next_href)
        counter += 1

    return pages





    




def transformer(data):
    """
    Turn a list of extracted records into a DataFrame with one value per cell.

    List values are reduced to their first element (None for an empty list).
    Non-list values such as a plain URL string are kept as they are, so that
    fields added outside `extract_blocks` are not lost.

    :param data: list of dicts (records); may be empty or None
    :return: pandas DataFrame with one row per record; an empty DataFrame when
             `data` is falsy
    """
    if not data:
        return pd.DataFrame(data)

    def first_or_value(val):
        # Lists come from extract_blocks, which collects every match per field.
        if isinstance(val, list):
            return val[0] if val else None
        return val

    return pd.DataFrame([
        {key: first_or_value(val) for key, val in record.items()}
        for record in data
    ])



def join_page_records(scraped_data, job_name):
    """
    Concatenate the records of one job across all scraped pages.

    :param scraped_data: mapping of page URL -> {job name: list of records}
    :param job_name: job whose records should be collected
    :return: a single list with the records of `job_name` from every page, in
             page order; pages lacking the job or holding no records are skipped
    """
    combined = []
    for page_url in scraped_data:
        page = scraped_data[page_url]
        if job_name not in page:
            continue
        records = page[job_name]
        if records:
            combined.extend(records)
    return combined

# --------------------------------------------------------

import os
import logging
import shutil

_logger = None


def get_logger(name="books_scraper"):
    """Singleton logger to be used across the project.

    On first configuration the logs directory and its contents are deleted
    and recreated, so every program run starts with a clean log file. The
    logger writes to logs/scraping.log and to the console.

    :param name: Name passed to `logging.getLogger`; only used on the first
                 call, later calls return the cached logger.
    :return: The configured `logging.Logger`.
    """
    global _logger
    if _logger is not None:
        return _logger

    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # logging.getLogger() returns the same object for a given name, so when
    # this module-level code is executed again (e.g. a notebook cell re-run)
    # the logger still carries handlers from the previous run. Close them
    # before wiping the directory; otherwise the old file handler keeps
    # pointing at the deleted file and no new log file is ever created.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    if os.path.exists("logs"):
        shutil.rmtree("logs")

    os.makedirs("logs", exist_ok=True)

    formatter = logging.Formatter(
        "[%(levelname)s] %(filename)s:%(lineno)d %(funcName)s() %(message)s"
    )

    file_handler = logging.FileHandler("logs/scraping.log", mode="w")
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    console = logging.StreamHandler()
    console.setFormatter(formatter)
    logger.addHandler(console)

    _logger = logger
    return logger


logger = get_logger()


- extraction fields / miscellaneous(extras)

In [None]:
# Extraction jobs in the shape expected by extract_multiple_blocks():
# {job name: {"block_selector": CSS selector of each repeating block,
#             "fields": {field name: (selector inside the block, attr)}}}.
# The "links" job supplies the "next_href" value that pager() follows.
allinone_extractions = {
    "products": {
        "block_selector": ".product-wrapper.card-body",
        "fields": {
            "img_alt": ("img", "alt"),
            "img_src": ("img", "src"),
            "price": (".caption span", "text"),
            "currency": (".caption meta", "content"),
            "title": (".caption h4 a", "title"),
            "description": (".caption p.description", "text"),
            "review-count": (".ratings p.review-count span", "text"),
            "rating": (".ratings p", "data-rating"),
        }
    },
    "links": {
        "block_selector": ".pagination",
        "fields": {
            "next_href": ("a.next", "href"),
        }
    }
}





# Extraction jobs with the same structure as the other *_extractions dicts:
# a "products" job for the item data and a "links" job whose "next_href"
# field is what pager() reads to find the next page.
# The commented-out entries are optional fields that are currently disabled.
scrapingcourse_extractions = {
    "products": {
        "block_selector": ".product-item",
        "fields": {
            "name": (".product-name", "text"),
            
            "price": (".product-price", "text"),
            
            "image-alt": (".product-image", "alt"),
            # "image-text": (".product-image", "text"),            
            # "image-sizes": (".product-image", "sizes"),
            # "image-hieght": (".product-image", "hieght"),
            "image-width": (".product-image", "width"),
            # "image-srcset": (".product-image", "srcset"),
            
            # "id": (".button", "data-product_id"),
            # "sku": (".button", "data-product_sku"),
            # "quantity": (".button", "data-quantity"),
            # "aria-describedby": (".button", "aria-describedby"),
        }
    },
    "links": {
        "block_selector": "#pagination-container",
        "fields": {
            "next_href": (".next-page", "href"),
        }
    }
}


# Extraction jobs used for the books.toscrape.com crawl below:
# - "books": one record per ".product_pod" block; "rating" reads the element's
#   "class" attribute and is mapped to a number in a later cell.
# - "categories": links found inside ".nav-list ul".
# - "links": the "next_href" value that pager() follows to the next page.
all_book_extractions = {
    "books": {
        "block_selector": ".product_pod",
        "fields": {
            "img_href": (".image_container a", "href"),
            "img_alt": (".image_container a img", "alt"),
            "img_src": (".image_container a img", "src"),
            "rating": (".star-rating", "class"),
            "book_href": ("h3 a", "href"),
            "book_text": ("h3 a", "text"),
            "price": (".price_color", "text"),
            "availability": (".availability", "text"),
        }
    },
    "categories": {
        "block_selector": ".nav-list ul",
        "fields": {
            "category_href": ("a", "href"),
            "category_text": ("a", "text"),
        }
    },
    "links": {
        "block_selector": ".pager .next",
        "fields": {
            "next_href": ("a", "href"),
        }
    }
}


# Directory used by the (currently commented-out) to_csv calls in the save
# section; created up front, including parents, so that writing cannot fail
# on a missing folder.
output_dir = Path("src/main/data")
output_dir.mkdir(parents=True, exist_ok=True)

# lambda (selector, attr): (_.get(attr) if (_:= book.select_one(selector)) else "") # to avoid NoneType has no attr x

---



#### all-in-one / books-to-scrape
-
- - requests
- - httpx

-
- - bs4.BeautifulSoup  .select_one("css")
- - selectolax.parser HTMLParser   .css_first("css")

-
- - pagination - soup fetch
- - common pager component types
- - - with next page button
    - - without next button
    - - - url-based (https://example.com/products?page=)
    - - - infinite scroll
    - - - load more button

In [None]:
# scraped_data = pager(
#     base="https://webscraper.io/test-sites/e-commerce/static/computers/laptops",
#     extraction_jobs=allinone_extractions
# ) or {}

In [None]:
# scraped_data_1_phones = pager(
#     base="https://webscraper.io/test-sites/e-commerce/static/phones/touch",
#     extraction_jobs=allinone_extractions
# ) or {}

In [None]:
# Crawl books.toscrape.com starting at the landing page. pager() uses its
# default max_pages (3) and forwards extraction_jobs to its callback, so the
# result maps each visited page URL to {job name: list of records}.
scraped_data_2 = pager(
    base="https://books.toscrape.com",
    extraction_jobs=all_book_extractions
) or {}

In [None]:
# scraped_data_cloths = pager(
#     base="https://www.scrapingcourse.com/pagination",
#     extraction_jobs=scrapingcourse_extractions
# ) or {}

---


- preprocess / organise data
- - join page data
  - make data frame

In [None]:
# _ =join_page_records(scraped_data, 'products')
# df = transformer(_)

In [None]:
# _ =join_page_records(scraped_data_1_phones, 'products')
# df_phones = transformer(_)

In [None]:
# Merge the "books" records of every scraped page into one list, then build a
# DataFrame from it. `_` is only a throwaway name for the intermediate list.
_ = join_page_records(scraped_data_2, 'books')
df_2 = transformer(_)

In [None]:
# _ = scraped_data_2.get('https://books.toscrape.com', {}).get('categories', {})
# categories_df = pd.DataFrame(_[0] if _ else {})

In [None]:
# _ =join_page_records(scraped_data_cloths, 'products')
# df_cloths = transformer(_)

---


- transform
- preview

In [None]:
# df_phones.head(n=2)

In [None]:
# df["price"] = df["price"].replace(r"[^\d\.]", "", regex=True).astype(float)
# df.head(n=2)

In [None]:
# Split the scraped price text into a currency column (the leading non-digit
# character) and a numeric price column (everything except digits and dots
# removed). expand=False makes str.extract return a Series for the single
# capture group instead of a one-column DataFrame.
df_2["currency"] = df_2["price"].str.extract(r"(^\D)", expand=False)
df_2["price"] = df_2["price"].replace(r"[^\d\.]", "", regex=True).astype(float)

ratings = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5}


def _rating_from_classes(tags):
    """Return the number named by the first rating word in `tags`, or None.

    `tags` is the list of CSS classes scraped for the rating element. Rows
    without any scraped classes hold None and map to None instead of raising.
    """
    if not isinstance(tags, (list, tuple)):
        return None
    for tag in tags:
        value = ratings.get(str(tag).lower())
        if value is not None:
            return value
    return None


df_2["rating"] = df_2["rating"].apply(_rating_from_classes)

df_2.head(n=2)

In [None]:
# categories_df.head(n=2)

In [None]:
# df_cloths["currency"] = df_cloths["price"].str.extract(r"(^\D)")
# df_cloths["price"] = df_cloths["price"].replace(r"[^\d\.]", "", regex=True).astype(float)

# df_cloths.head(n=2)

---


- save

In [None]:
# df_phones.to_csv(output_dir/"phones_allinone.csv", index=False, encoding="utf-8")

In [None]:
# df.to_csv(output_dir/"laptops_allinone.csv", index=False, encoding="utf-8")

In [None]:
# df_2.to_csv(output_dir/"books_bookstoscrape.csv", index=False, encoding="utf-8")

In [None]:
# categories_df.to_csv(output_dir/"categories_bookstoscrape.csv", sep=";", index=False, encoding="utf-8")

In [None]:
# df_cloths.to_csv(output_dir/"cloths_scrapingcourse.csv", index=False, encoding="utf-8")

---

In [None]:
def scrape_item(df, url_column, base_url, block_selector, fields, max_items=5):
    """
    Scrapes detailed information from a list of URLs and returns a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame containing the URLs.
        url_column (str): The name of the column with the relative URLs.
        base_url (str): The base URL to join with the relative URLs. Each href
            is joined as "../" + href, i.e. resolved one level above base_url.
        block_selector (str): CSS selector for the data block on the detail page.
        fields (dict): Dictionary of fields and their selectors.
        max_items (int): Maximum number of detail pages to scrape. Only pages
            that were fetched successfully count towards this limit.

    Returns:
        pd.DataFrame: A DataFrame with the scraped detailed item data, plus a
        'source_url' column holding the page each record came from.
    """
    all_item_data = []
    scraped_count = 0

    for href in df[url_column]:
        if scraped_count >= max_items:
            break
        if not isinstance(href, str) or not href:
            # Missing links (None / NaN) cannot be joined into a URL.
            logger.warning(f"Skipping row without a usable URL: {href!r}")
            continue

        link = urljoin(base_url, "../" + href)
        logger.info(f"Scraping detail page: {link}")

        item_soup = make_soup(url=link)
        if not item_soup:
            continue  # Skip if the page couldn't be fetched or parsed

        # The result of extract_blocks is a list of dictionaries
        scraped_records = extract_blocks(item_soup, block_selector, fields)

        # Add the original URL to every record for merging later. It is
        # wrapped in a list like the extracted field values, because
        # transformer() keeps the first element of list values.
        for record in scraped_records:
            record['source_url'] = [link]
        all_item_data.extend(scraped_records)

        scraped_count += 1

    # Convert the list of dictionaries to a DataFrame
    return transformer(all_item_data)

In [None]:
# Selectors for a single book detail page. Unlike the *_extractions dicts
# above, this is one job (block_selector + fields) that is passed directly
# to scrape_item() / extract_blocks().
# The nth-of-type(k) selectors pick the k-th row of the ".table.table-striped"
# table; the field names state which value each row is expected to hold.
book_detail_extractions = {
    'block_selector': '.product_page',
    'fields': {
        'product_description': ('#product_description ~ p', 'text'),
        'upc': ('.table.table-striped tr:nth-of-type(1) td', 'text'),
        'product_type': ('.table.table-striped tr:nth-of-type(2) td', 'text'),
        'price_excl_tax': ('.table.table-striped tr:nth-of-type(3) td', 'text'),
        'price_incl_tax': ('.table.table-striped tr:nth-of-type(4) td', 'text'),
        'tax': ('.table.table-striped tr:nth-of-type(5) td', 'text'),
        'availability': ('.table.table-striped tr:nth-of-type(6) td', 'text'),
        'number_of_reviews': ('.table.table-striped tr:nth-of-type(7) td', 'text'),
        'image_url': ('#product_gallery .thumbnail img', 'src'),
        'book_text': ('.product_main h1', 'text')
        
    }
}

# Scrape the detailed information for each book
# (scrape_item stops after its default max_items of 5 pages).
detailed_books_df = scrape_item(
    df=df_2,
    url_column='book_href',  # This is the column with the relative URLs
    base_url='https://books.toscrape.com/catalogue/', # base url needed for this site.
    block_selector=book_detail_extractions['block_selector'],
    fields=book_detail_extractions['fields']
)

In [None]:
# Join the listing data with the detail-page data on the book title. Both
# frames also have an "availability" column, which pandas will suffix.
# NOTE(review): this only keeps rows whose "book_text" is identical in both
# frames - confirm the listing title and the detail-page <h1> really match.
pd.merge(df_2, detailed_books_df, on="book_text")