In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from bs4 import BeautifulSoup
import time

In [2]:
def scrape_js_page(url, wait_seconds=5):
    """
    Scrape a JavaScript-rendered page using Selenium + Stealth.

    Starts a headless Chrome session, applies selenium-stealth to it, loads
    ``url``, sleeps for ``wait_seconds`` so client-side scripts can run, and
    returns the rendered DOM serialized as HTML. The browser is always shut
    down, even when loading the page raises.

    Args:
        url (str): URL of the page to scrape
        wait_seconds (int | float): seconds to wait for JS to load

    Returns:
        str: Rendered HTML content

    Raises:
        selenium.common.exceptions.WebDriverException: propagated unchanged
            if Chrome cannot be started or the page cannot be loaded.
    """
    chrome_options = Options()
    # `Options.headless = True` was deprecated in Selenium 4.8 and removed in
    # later 4.x releases, where the assignment silently does nothing. Passing
    # the Chrome flag directly works regardless of the Selenium version.
    for flag in (
        "--headless=new",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-blink-features=AutomationControlled",
        "--disable-gpu",
        "--window-size=1920,1080",
    ):
        chrome_options.add_argument(flag)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        # Apply stealth mode
        stealth(driver,
                languages=["en-US", "en"],
                vendor="Google Inc.",
                platform="Win32",
                webgl_vendor="Intel Inc.",
                renderer="Intel Iris OpenGL Engine",
                fix_hairline=True
        )

        # Open page
        driver.get(url)

        # Fixed wait for JS to load (no element-based readiness check)
        time.sleep(wait_seconds)

        # Snapshot of the DOM after the wait
        return driver.page_source
    finally:
        # Always release the Chrome/chromedriver processes, otherwise a
        # failed navigation leaves them running in the background.
        driver.quit()

In [3]:
# scrape_js_page('https://www.netflix.com/ca/', wait_seconds=10)

In [4]:
def get_clean_body(url, wait_seconds=5):
    """
    Scrape a JS page and return its <body> markup with noise removed.

    The page is fetched with ``scrape_js_page``; the resulting <body> is
    then stripped of <script>, <style>, <svg>, <link> and <iframe> elements
    and of every attribute (including those on <body> itself).

    Args:
        url (str): URL of the page to scrape
        wait_seconds (int | float): seconds to wait for JS to load

    Returns:
        str: The cleaned <body> element serialized as HTML, or "" when the
        rendered page has no <body>.
    """
    # Reuse the shared scraper instead of duplicating the driver setup.
    html = scrape_js_page(url, wait_seconds=wait_seconds)

    soup = BeautifulSoup(html, "html.parser")

    # Get body
    body = soup.body
    if body is None:
        return ""

    # Remove unwanted tags (calling a tag is shorthand for find_all, which
    # returns a list, so decomposing while iterating is safe)
    for tag in body(["script", "style", "svg", "link", "iframe"]):
        tag.decompose()

    # Remove all attributes. find_all(True) yields descendants only, so the
    # <body> tag itself has to be cleared explicitly.
    body.attrs = {}
    for tag in body.find_all(True):
        tag.attrs = {}

    return str(body)

In [5]:
# get_clean_body('https://www.netflix.com/ca/', wait_seconds=5)

In [6]:
def extract_meaningful_content(url, wait_seconds=5):
    """
    Scrape a JS page and return a flat <body> of its text-bearing elements.

    The page is fetched with ``scrape_js_page``. Every heading (h1-h6), p,
    ul, ol, li, a and button element found in the rendered <body> is copied,
    in document order, into a new <body> as an attribute-free element
    holding only that element's text. Elements whose text is empty are
    skipped.

    Note:
        Nesting is not preserved and nested matches are emitted separately,
        so the text of an <li> appears both inside its parent <ul>/<ol>
        entry and again as its own <li> entry. Text fragments inside one
        element are joined without a separator.

    Args:
        url (str): URL of the page to scrape
        wait_seconds (int | float): seconds to wait for JS to load

    Returns:
        str: The flattened <body> serialized as HTML, or "" when the
        rendered page has no <body>.
    """
    # Reuse the shared scraper instead of duplicating the driver setup.
    html = scrape_js_page(url, wait_seconds=wait_seconds)

    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if body is None:
        return ""

    meaningful_tags = ["h1", "h2", "h3", "h4", "h5", "h6",
                       "p", "ul", "ol", "li", "a", "button"]

    # Fresh document that receives the flattened copies.
    clean_body = BeautifulSoup("<body></body>", "html.parser")
    new_body = clean_body.body

    for tag in body.find_all(meaningful_tags):
        text = tag.get_text(strip=True)
        if not text:
            # Nothing visible in this element; do not emit an empty tag.
            continue
        new_tag = clean_body.new_tag(tag.name)
        new_tag.string = text
        new_body.append(new_tag)

    return str(clean_body)

In [7]:
# Example run against a live page; the returned HTML string is shown as the cell output.
extract_meaningful_content('https://www.netflix.com/ca/', wait_seconds=5)

"<body><a>Sign In</a><h1>Unlimited movies, TV shows, and more</h1><p>Starts at $7.99. Cancel anytime.</p><h3>Ready to watch? Enter your email to create or restart your membership.</h3><button>Get Started</button><h3>The Netflix you love for just $7.99.</h3><p>Get our most affordable, ad-supported plan.</p><button>Learn More</button><h2>Trending Now</h2><ul>Stranger Things11The Rip22Emily in Paris33HIS &amp; HERS44Sean Combs: The Reckoning55Free Bert66KPop Demon Hunters77Kidnapped: Elizabeth Smart88Taskaree: The Smuggler's Web99Wake Up Dead Man: A Knives Out Mystery1010</ul><li>Stranger Things11</li><button>Stranger Things11</button><li>The Rip22</li><button>The Rip22</button><li>Emily in Paris33</li><button>Emily in Paris33</button><li>HIS &amp; HERS44</li><button>HIS &amp; HERS44</button><li>Sean Combs: The Reckoning55</li><button>Sean Combs: The Reckoning55</button><li>Free Bert66</li><button>Free Bert66</button><li>KPop Demon Hunters77</li><button>KPop Demon Hunters77</button><li>Ki