# Web Scraping Tutorial

This notebook demonstrates how to use our modular web scraping library to:
1. Map all endpoints of a website
2. Scrape content by tags, classes, and IDs
3. Save scraped data to files

## Setup
First, let's import our scraping modules and set up the base URL.

In [None]:
from scrapper import WebScraper, SiteMapper

# Target website: swap in the site you actually want to scrape
BASE_URL = "https://example.com"

# The scraper is bound to the base URL; the mapper is built on top of that scraper
scraper = WebScraper(BASE_URL)
mapper = SiteMapper(scraper)

## 1. Mapping Website Endpoints

Let's discover all accessible endpoints on the website.

In [None]:
# Start at the site root and pass max_depth=2 to the mapper
endpoints = mapper.map_site("/", max_depth=2)

# List every endpoint the mapper returned, one per line
print("Discovered endpoints:")
for path in endpoints:
    print(f"- {path}")

## 2. Scraping by HTML Tags

Demonstrate how to scrape specific HTML tags from a page.

In [None]:
def scrape_tags_from_endpoint(endpoint, tag):
    """Announce the scrape, then delegate to ``scraper.scrape_by_tag``.

    Args:
        endpoint: Endpoint passed to the scraper (e.g. ``"/"``).
        tag: HTML tag name to scrape (e.g. ``"p"``).

    Returns:
        Whatever ``scraper.scrape_by_tag(tag, endpoint)`` returns.
    """
    print(f"Scraping {tag} tags from {endpoint}")
    return scraper.scrape_by_tag(tag, endpoint)

# Example: pull the paragraphs from the home page and preview the first three
paragraphs = scrape_tags_from_endpoint("/", "p")
print(f"Found {len(paragraphs)} paragraphs")

preview = paragraphs[:3]
for number, text in enumerate(preview, start=1):
    print(f"\nParagraph {number}:\n{text}")

## 3. Scraping by Class Names

Extract content from elements with specific class names.

In [None]:
def scrape_class_from_endpoint(endpoint, class_name):
    """Announce the scrape, then delegate to ``scraper.scrape_by_class``.

    Args:
        endpoint: Endpoint passed to the scraper (e.g. ``"/"``).
        class_name: CSS class name to look for.

    Returns:
        Whatever ``scraper.scrape_by_class(class_name, endpoint)`` returns.
    """
    print(f"Scraping elements with class '{class_name}' from {endpoint}")
    return scraper.scrape_by_class(class_name, endpoint)

# Example: collect elements carrying the 'content' class and preview the first three
class_content = scrape_class_from_endpoint("/", "content")
print(f"\nFound {len(class_content)} elements")

first_items = class_content[:3]
for number, item in enumerate(first_items, start=1):
    print(f"\nContent {number}:\n{item}")

## 4. Saving Scraped Data

Demonstrate how to save scraped content to files.

In [None]:
def save_scraped_content(content, filename):
    """Save ``content`` to ``filename`` through the module-level ``scraper``.

    Thin wrapper around ``scraper.save_to_file(content, filename)``; how the
    content is written is up to that method.

    Args:
        content: Scraped data to persist.
        filename: Destination path handed to ``save_to_file``.
    """
    scraper.save_to_file(content, filename)

# Example: save the scraped paragraphs and the class-based content, in that order
for scraped, target in (
    (paragraphs, "paragraphs.txt"),
    (class_content, "class_content.txt"),
):
    save_scraped_content(scraped, target)

## 5. Complete Scraping Pipeline

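Put everything together in a complete scraping pipeline: map the site, then scrape each discovered endpoint by tag, class, and ID, saving every result to its own file under the `data/` directory.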

In [None]:
def scrape_full_site(base_url, max_depth=2):
    """Map a site and save the scraped content of every discovered endpoint.

    For each endpoint returned by ``SiteMapper.map_site`` this scrapes the
    ``p`` tags, the elements with class ``content`` and the content with id
    ``main``, and writes each result to its own file under ``data/``.

    Args:
        base_url: Root URL used to build the ``WebScraper``.
        max_depth: Depth passed through to ``map_site``.
    """
    # Local import keeps the function self-contained in a notebook session
    # where ``os`` may never have been imported.
    import os

    # Initialize scrapers
    scraper = WebScraper(base_url)
    mapper = SiteMapper(scraper)

    # All output paths below live in data/, so make sure it exists up front.
    os.makedirs("data", exist_ok=True)

    # Get all endpoints
    print("Mapping site...")
    endpoints = mapper.map_site("/", max_depth=max_depth)

    # Scrape each endpoint
    for endpoint in endpoints:
        print(f"\nScraping endpoint: {endpoint}")

        # Flatten the endpoint path once so it can be used in file names.
        slug = endpoint.replace('/', '_')

        # Save through the scraper created for ``base_url`` above, not through
        # a module-level one that may point at a different site.

        # Scrape paragraphs
        paragraphs = scraper.scrape_by_tag("p", endpoint)
        scraper.save_to_file(paragraphs, f"data/paragraphs_{slug}.txt")

        # Scrape content class
        content = scraper.scrape_by_class("content", endpoint)
        scraper.save_to_file(content, f"data/content_{slug}.txt")

        # Scrape main content
        main_content = scraper.scrape_by_id("main", endpoint)
        scraper.save_to_file(main_content, f"data/main_{slug}.txt")

# Example usage
import os  # needed for the directory setup below; not imported earlier in the notebook

# exist_ok=True makes this a no-op when 'data' already exists and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs('data', exist_ok=True)

scrape_full_site(BASE_URL, max_depth=2)