In [1]:
from playwright.async_api import async_playwright
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import pandas as pd
import requests
import asyncio
import time
import json
import os

In [3]:
# Root of the site being crawled; internal links are resolved against it.
base_url = "https://www.madewithnestle.ca/"

# Shared crawl state mutated by the crawler:
#   visited      - URLs that have already been handed to the scraper
#   scraped_data - one dict per successfully scraped page
visited, scraped_data = set(), []


In [5]:
async def scrape_with_playwright(url, depth=0, max_depth=1):
    """Scrape ``url`` with a Chromium browser and recursively crawl same-site links.

    For each page, the rendered HTML is parsed and a record with the keys
    ``url``, ``title``, ``text`` (all ``<p>`` text joined by spaces) and
    ``links`` (raw ``href`` values) is appended to the module-level
    ``scraped_data`` list. URLs are tracked in the module-level ``visited``
    set so each one is scraped at most once.

    Args:
        url: Absolute URL of the page to scrape.
        depth: Current recursion depth (0 for the starting page).
        max_depth: Deepest level that is still scraped; pages at
            ``depth > max_depth`` are skipped.

    Returns:
        None. Results are accumulated in ``scraped_data``.

    Failures while loading or parsing a page are reported with ``print`` and
    do not abort the rest of the crawl.
    """
    if url in visited or depth > max_depth:
        return
    visited.add(url)

    links = []
    async with async_playwright() as p:
        try:
            browser = await p.chromium.launch(headless=False, slow_mo=50)
            try:
                page = await browser.new_page(user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15")
                print(f"Scraping: {url}")
                await page.goto(url, timeout=60000)

                # Wait for meaningful content to load
                try:
                    await page.wait_for_selector("main", timeout=8000)
                except Exception:
                    # No <main> appeared in time: fall back to a fixed wait.
                    # `Exception` (not a bare except) lets cancellation and
                    # KeyboardInterrupt propagate.
                    await page.wait_for_timeout(5000)

                html = await page.content()
            finally:
                # Always release the browser, even when navigation fails.
                await browser.close()

            soup = BeautifulSoup(html, 'html.parser')

            # `.string` is None when <title> is empty or has nested tags.
            title = (soup.title.string or "").strip() if soup.title else ""
            text = " ".join(tag.get_text(strip=True) for tag in soup.find_all("p"))
            links = [a['href'] for a in soup.find_all("a", href=True)]

            scraped_data.append({
                "url": url,
                "title": title,
                "text": text,
                "links": links
            })
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")

    # Children would be rejected by the depth check anyway; skip resolving them.
    if depth >= max_depth:
        return

    # Crawl internal links only. The browser for this page is already closed,
    # so only one browser is open at any time during the recursion.
    site_host = urlparse(base_url).netloc
    for link in links:
        try:
            target = urlparse(urljoin(url, link))
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore it.
            continue
        if target.scheme not in ("http", "https") or target.netloc != site_host:
            continue
        # Drop the fragment so "/page#section" and "/page" count as one page.
        child_url = target._replace(fragment="").geturl()
        await scrape_with_playwright(child_url, depth + 1, max_depth)


In [7]:
# Run the crawler
await scrape_with_playwright(base_url, max_depth=1)

# Save to JSON
with open("nestle_scraped.json", "w") as f:
    json.dump(scraped_data, f, indent=2)

# Load into DataFrame for inspection
df = pd.DataFrame(scraped_data)
df.head()


Scraping: https://www.madewithnestle.ca/
Scraping: https://www.madewithnestle.ca/aero
Scraping: https://www.madewithnestle.ca/coffee-crisp
Scraping: https://www.madewithnestle.ca/kit-kat
Scraping: https://www.madewithnestle.ca/smarties
Scraping: https://www.madewithnestle.ca/turtles
Scraping: https://www.madewithnestle.ca/after-eight
Scraping: https://www.madewithnestle.ca/big-turk
Scraping: https://www.madewithnestle.ca/crunch
Scraping: https://www.madewithnestle.ca/drumstick-bites
Scraping: https://www.madewithnestle.ca/easter-holiday
Scraping: https://www.madewithnestle.ca/mackintosh-toffee
Scraping: https://www.madewithnestle.ca/mirage
Scraping: https://www.madewithnestle.ca/quality-street
Scraping: https://www.madewithnestle.ca/rolo
Scraping: https://www.madewithnestle.ca/coffee-mate
Scraping: https://www.madewithnestle.ca/nescafe
Scraping: https://www.madewithnestle.ca/confectionery-frozen-desserts
Scraping: https://www.madewithnestle.ca/hd-en
Scraping: https://www.madewithnestle

Unnamed: 0,url,title,text,links
0,https://www.madewithnestle.ca/,"Nestlé Brands' Products, Recipes and News | Ma...",Learn about our commitment to sustainable coco...,"[#main-content, /, https://www.madewithnestle...."
1,https://www.madewithnestle.ca/aero,AERO | Feel the Bubbles Melt | Nestlé Canada,Enter for a chance to WIN headphones or Spotif...,"[/#facebook, /#twitter, /#pinterest, /#email, ..."
2,https://www.madewithnestle.ca/coffee-crisp,COFFEE CRISP | Makes a Nice Light Snack!,Wake up your taste buds with the perfect blend...,"[/#facebook, /#twitter, /#pinterest, /#email, ..."
3,https://www.madewithnestle.ca/kit-kat,KITKAT | Have a Break | Made with Nestlé Canada,These cookies are necessary for the website t...,"[#main-content, /, https://www.madewithnestle...."
4,https://www.madewithnestle.ca/smarties,SMARTIES | How Do You Smarties?,"Whether you shake, sort, or make your own crea...","[/#facebook, /#twitter, /#pinterest, /#email, ..."
