In [1]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin

In [None]:

def demo_urljoin_cases(cases=None):
    """Print how ``urljoin`` resolves a series of (base, href) pairs.

    Args:
        cases: Optional iterable of ``(base, href)`` string pairs to resolve.
            When omitted, a built-in catalogue of common and edge-case
            combinations is used.

    Returns:
        None. For every pair the base, the href and the joined URL are
        printed on consecutive lines, followed by blank lines.
    """
    if cases is None:
        cases = [
            # Relative file name: replaces the last path segment of the base
            ("https://example.com/articles/page-1.html", "page-2.html"),

            # "../" goes up one level from the base's directory
            ("https://example.com/articles/page-1.html", "../page-2.html"),
            # "./" stays in the base's directory (same result as a bare file name)
            ("https://example.com/articles/page-1.html", "./page-2.html"),

            # Absolute path (starts from the site root)
            ("https://example.com/articles/page-1.html", "/page-2.html"),

            # Base ends with a slash (last segment is treated as a folder)
            ("https://example.com/articles/", "page-2.html"),

            # Base without a trailing slash (last segment is treated as a file and replaced)
            ("https://example.com/articles", "page-2.html"),

            # Base is the domain only
            ("https://example.com", "page-2.html"),

            # href is an absolute URL (overrides the base entirely)
            ("https://example.com/articles/page-1.html", "https://other.com/page-2.html"),

            # href starts with ./ (same folder)
            ("https://example.com/articles/", "./page-2.html"),

            # href with a nested folder
            ("https://example.com/articles/", "2025/page-2.html"),

            # href goes up two levels
            ("https://example.com/a/b/c/index.html", "../../other.html"),

            # href starts with // (scheme-relative: keeps the base's scheme, swaps the host)
            ("https://example.com/articles/", "//cdn.example.com/lib.js"),

            # href is an empty string (the base is returned unchanged)
            ("https://example.com/articles/page-1.html", ""),

            # href is a fragment only (appended to the base)
            ("https://example.com/articles/page-1.html", "#section2"),

            # href is a query only (appended to the base path)
            ("https://example.com/articles/page-1.html", "?page=2"),

            # href repeats the current folder name: it IS nested,
            # giving /articles/articles/page-2.html
            ("https://example.com/articles/", "articles/page-2.html"),

            # Same as above with an explicit ./ prefix (identical result)
            ("https://example.com/articles/", "./articles/page-2.html"),

            # Base with trailing slash, href with leading slash (href wins from the root)
            ("https://example.com/articles/", "/page-2.html"),

            # Base is a file, href mixes ./ and ..
            ("https://example.com/blog/posts/post-1.html", "./../other/post-2.html"),

            # Base is a "file", href carries both a query and a fragment
            ("https://example.com/page.html", "next.html?x=1#top"),

            # Another fragment-only href
            ("https://example.com/articles/page-1.html", "#top"),
        ]

    for base, href in cases:
        result = urljoin(base, href)
        print(f"{base:<45}\n{href:<35}\n{result}\n\n\n")

# Run the demo: prints the base, the href and the urljoin result for every case
demo_urljoin_cases()


In [2]:
# Session shared by calls that do not pass their own. It is created on first
# use instead of as a default argument, so nothing is built at definition time.
_SHARED_SESSION = None


def fetch_response(url, session=None):
    """GET ``url`` and return the response, or ``None`` when the request fails.

    Args:
        url: Address to fetch.
        session: Object exposing ``get(url, timeout=...)``, normally a
            ``requests.Session``. When omitted, one module-wide session is
            created on first use and reused by later calls.

    Returns:
        The response when the request completes without an error status;
        ``None`` otherwise. Errors are printed rather than raised.
    """
    global _SHARED_SESSION
    if session is None:
        if _SHARED_SESSION is None:
            _SHARED_SESSION = requests.Session()
        session = _SHARED_SESSION

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()  # turn 4xx/5xx statuses into HTTPError
    except requests.exceptions.HTTPError as errh:
        print("HTTP error occurred:\n", errh)
    except requests.exceptions.RequestException as err:
        print("Unexpected request error:\n", err)
    except Exception as e:
        # Deliberate best-effort: report anything else and keep the notebook running.
        print(f"Unhandled exception:\n{type(e).__name__}: {e}")
    else:
        print(f"fetched: {url}")
        return response
    return None

In [3]:
def make_soup(response):
    """Parse a successful HTTP response into a BeautifulSoup tree.

    Args:
        response: Response object exposing ``status_code`` and ``content``
            (e.g. the value returned by ``fetch_response``), or ``None``.

    Returns:
        The parsed soup, or ``None`` when ``response`` is missing/falsy or its
        status code is not 200.
    """
    if not (response and response.status_code == 200):
        print("couldn't make soup with:", response)
        return None
    # Hand BeautifulSoup the raw bytes instead of ``response.text``. ``text`` is
    # decoded with the encoding requests guesses from the HTTP headers, and a
    # wrong guess turns a UTF-8 "£" into "Â£". From bytes, BeautifulSoup can
    # detect the document's own declared encoding.
    soup = BeautifulSoup(response.content, "lxml")
    print("soup's ready!")
    return soup

In [4]:
def fetch_next_page(base_url):
    """Fetch ``base_url`` and return the absolute URL of its "next" pager link.

    The page is downloaded with ``fetch_response`` and parsed with
    ``make_soup``. The href of the first ``.pager .next a`` element is then
    resolved against ``base_url``. The page URL and the raw href are printed.

    Returns:
        The absolute URL of the next page, or ``None`` when the page could
        not be fetched/parsed or carries no usable "next" link.
    """
    soup = make_soup(fetch_response(base_url))

    link = None
    if soup:
        anchor = soup.select_one(".pager .next a")
        if anchor:
            link = anchor.get("href")

    print(base_url, link, sep="\n")
    if not link:
        return None
    return urljoin(base_url, link)

In [None]:
def extract_all_books_data(soup):
    """Extract one record per ``article.product_pod`` element found in ``soup``.

    Args:
        soup: Parsed page (BeautifulSoup tree), or ``None``.

    Returns:
        A list of dicts with the keys ``image_href``, ``image_alt``,
        ``image_src``, ``star_rating``, ``title_href``, ``title_text``,
        ``price`` and ``availability``; ``None`` when ``soup`` is missing or
        contains no book elements.

    Raises:
        AttributeError / IndexError: when a book element lacks the expected
            markup. This is deliberate: fail fast rather than emit partial rows.
    """
    books = soup.select("article.product_pod") if soup else None
    if not books:
        print("soup had no books data")
        return None

    extracted_data = []
    for book in books:
        # Look each element up once; a missing element fails on first use.
        image_link = book.select_one(".image_container a")
        image = image_link.img  # reach inside <a> to find <img>
        title_link = book.select_one("h3 a")
        price_box = book.select_one(".product_price")
        # The rating is the class that sits next to "star-rating", e.g. "Three".
        rating_classes = book.select_one(".star-rating").get("class")

        extracted_data.append({
            # image element data
            "image_href": image_link.get("href"),
            "image_alt": image.get("alt"),
            "image_src": image.get("src"),

            # star rating
            "star_rating": [c for c in rating_classes if c != "star-rating"][0],

            # title element data (the title attribute holds the untruncated title)
            "title_href": title_link.get("href"),
            "title_text": title_link.get("title"),

            # price and availability
            "price": price_box.select_one(".price_color").get_text(strip=True),
            "availability": price_box.select_one(".availability").get_text(strip=True),
        })
    print(f"got {len(extracted_data)} books")
    return extracted_data

In [None]:
# Exploration helper: list the attribute names of an object that do not start
# with an underscore (i.e. its public API as reported by dir()).
# NOTE(review): in IPython `__` normally holds the second-to-last cell output;
# this assignment shadows it. Consider a descriptive name if that matters.
__ = lambda _: [i for i in dir(_) if not i.startswith("_")]

In [None]:
# if __name__ == "__main__":

In [None]:
# Page used for the single-page extraction demo (same site as the pagination crawl).
url = "https://books.toscrape.com"
response = fetch_response(url)

In [None]:
# Offline sample: one product card copied from the catalogue page, so the
# extraction code can be exercised without a network request.
html = """
<article class="product_pod">
  <div class="image_container">
    <a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg" /></a>
  </div>
  <p class="star-rating Three">
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
    <i class="icon-star"></i>
  </p>
  <h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
  <div class="product_price">
    <p class="price_color">£51.77</p>
    <p class="instock availability">
      <i class="icon-ok"></i>
      
       In stock
       
    </p>
    <form>
      <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
    </form>
  </div>
</article>
"""
soup = BeautifulSoup(html, "lxml")    # for offline

In [None]:
# Extract the book records from the current `soup`; the bare name on the last
# line makes the notebook display the result.
books = extract_all_books_data(soup)
books

In [None]:
# Parse the fetched `response`; this rebinds `soup`, replacing any earlier value.
soup = make_soup(response)

In [None]:
# Run the same extraction on whatever `soup` currently holds and display it.
# Note that `books` is rebound here, so an earlier result is overwritten.
books = extract_all_books_data(soup)
books

### Pagination

In [5]:
base = "https://books.toscrape.com"
links = []

# Follow the "next" pager link from page to page until a page has none.
# The loop variable is `next_url` so the builtin `next` is not shadowed.
while (next_url := fetch_next_page(base)):
    if next_url in links:
        # A page pointing back to one already visited would loop forever.
        break
    links.append(next_url)
    base = next_url

fetched: https://books.toscrape.com
soup's ready!
https://books.toscrape.com
catalogue/page-2.html
fetched: https://books.toscrape.com/catalogue/page-2.html
soup's ready!
https://books.toscrape.com/catalogue/page-2.html
page-3.html
fetched: https://books.toscrape.com/catalogue/page-3.html
soup's ready!
https://books.toscrape.com/catalogue/page-3.html
page-4.html
fetched: https://books.toscrape.com/catalogue/page-4.html
soup's ready!
https://books.toscrape.com/catalogue/page-4.html
page-5.html
fetched: https://books.toscrape.com/catalogue/page-5.html
soup's ready!
https://books.toscrape.com/catalogue/page-5.html
page-6.html
fetched: https://books.toscrape.com/catalogue/page-6.html
soup's ready!
https://books.toscrape.com/catalogue/page-6.html
page-7.html
fetched: https://books.toscrape.com/catalogue/page-7.html
soup's ready!
https://books.toscrape.com/catalogue/page-7.html
page-8.html
fetched: https://books.toscrape.com/catalogue/page-8.html
soup's ready!
https://books.toscrape.com/cata

In [6]:
# Display the page URLs collected by the pagination loop.
links

['https://books.toscrape.com/catalogue/page-2.html',
 'https://books.toscrape.com/catalogue/page-3.html',
 'https://books.toscrape.com/catalogue/page-4.html',
 'https://books.toscrape.com/catalogue/page-5.html',
 'https://books.toscrape.com/catalogue/page-6.html',
 'https://books.toscrape.com/catalogue/page-7.html',
 'https://books.toscrape.com/catalogue/page-8.html',
 'https://books.toscrape.com/catalogue/page-9.html',
 'https://books.toscrape.com/catalogue/page-10.html',
 'https://books.toscrape.com/catalogue/page-11.html',
 'https://books.toscrape.com/catalogue/page-12.html',
 'https://books.toscrape.com/catalogue/page-13.html',
 'https://books.toscrape.com/catalogue/page-14.html',
 'https://books.toscrape.com/catalogue/page-15.html',
 'https://books.toscrape.com/catalogue/page-16.html',
 'https://books.toscrape.com/catalogue/page-17.html',
 'https://books.toscrape.com/catalogue/page-18.html',
 'https://books.toscrape.com/catalogue/page-19.html',
 'https://books.toscrape.com/catalog