In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium_stealth import stealth
from bs4 import BeautifulSoup
import time

In [2]:
def make_driver():
    """Create a stealth Chrome driver."""
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    driver = webdriver.Chrome(options=chrome_options)

    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True)

    return driver


def scroll_to_load(driver, pause=1.5, max_scrolls=10):
    """Scroll down to trigger lazy-loaded content."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    # Scroll back to top
    driver.execute_script("window.scrollTo(0, 0);")


def wait_for_page(driver, timeout=15):
    """Wait for page to be fully loaded (DOM + JS)."""
    # Wait for document.readyState == 'complete'
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    # Wait for no active network requests (jQuery / fetch idle heuristic)
    try:
        WebDriverWait(driver, 5).until(
            lambda d: d.execute_script(
                "return (typeof jQuery === 'undefined' || jQuery.active === 0)"
            )
        )
    except Exception:
        pass  # Not all sites use jQuery — that's fine


def get_clean_body(url, scroll=True, max_scrolls=10, timeout=15):
    """
    Scrape a JS-rendered page and return clean <body> HTML.
    Strips scripts, styles, svgs, and all attributes.

    Args:
        url: Target URL
        scroll: Whether to scroll down to trigger lazy loading
        max_scrolls: Max scroll iterations
        timeout: Max seconds to wait for page load
    """
    driver = make_driver()
    try:
        driver.get(url)
        wait_for_page(driver, timeout)
        if scroll:
            scroll_to_load(driver, max_scrolls=max_scrolls)
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if body is None:
        return ""

    for tag in body(["script", "style", "svg", "link", "iframe", "noscript"]):
        tag.decompose()

    for tag in body.find_all(True):
        tag.attrs = {}

    return str(body)

In [3]:
# Full clean body (with lazy-load scrolling)
# get_clean_body('https://www.netflix.com/ca/', scroll=True, max_scrolls=5)

In [4]:
def extract_meaningful_content(url, scroll=True, max_scrolls=10, timeout=15):
    """
    Scrape a JS-rendered page and return only meaningful text content.
    Extracts headings, paragraphs, lists, links, and buttons.

    Args:
        url: Target URL
        scroll: Whether to scroll down to trigger lazy loading
        max_scrolls: Max scroll iterations
        timeout: Max seconds to wait for page load
    """
    driver = make_driver()
    try:
        driver.get(url)
        wait_for_page(driver, timeout)
        if scroll:
            scroll_to_load(driver, max_scrolls=max_scrolls)
        html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if body is None:
        return ""

    meaningful_tags = ["h1", "h2", "h3", "h4", "h5", "h6",
                       "p", "ul", "ol", "li", "a", "button", "span", "td", "th"]

    clean_soup = BeautifulSoup("<body></body>", "html.parser")
    new_body = clean_soup.body

    seen_texts = set()
    for tag in body.find_all(meaningful_tags):
        text = tag.get_text(strip=True)
        if not text or text in seen_texts:
            continue
        seen_texts.add(text)
        new_tag = clean_soup.new_tag(tag.name)
        new_tag.string = text
        new_body.append(new_tag)

    return str(clean_soup)

In [5]:
# Meaningful content only (with lazy-load scrolling)
extract_meaningful_content('https://www.netflix.com/ca/', scroll=True, max_scrolls=5)

"<body><span>Netflix</span><span>Select LanguageEnglishFrançais</span><a>Sign In</a><h1>Unlimited movies, TV shows, and more</h1><p>Starts at $7.99. Cancel anytime.</p><h3>Ready to watch? Enter your email to create or restart your membership.</h3><button>Get Started</button><h3>The Netflix you love for just $7.99.</h3><p>Get our most affordable, ad-supported plan.</p><button>Learn More</button><h2>Trending Now</h2><ul>The Good Doctor11Bridgerton22Love Is Blind33Reality Check: Inside America's Next Top Model44The Lincoln Lawyer55Stranger Things66The Rip77The Night Agent88HIS &amp; HERS99Dhurandhar1010</ul><li>The Good Doctor11</li><span>11</span><span>1</span><li>Bridgerton22</li><span>22</span><span>2</span><li>Love Is Blind33</li><span>33</span><span>3</span><li>Reality Check: Inside America's Next Top Model44</li><span>44</span><span>4</span><li>The Lincoln Lawyer55</li><span>55</span><span>5</span><li>Stranger Things66</li><span>66</span><span>6</span><li>The Rip77</li><span>77</spa