# Web Scraping Tutorial with BeautifulSoup

This notebook demonstrates web scraping using BeautifulSoup to:
1. Map all endpoints of a website
2. Scrape content by tags, classes, and IDs
3. Save scraped data to files

In [None]:
! pip install beautifulsoup4 requests ratelimit

In [4]:
from bs4 import BeautifulSoup
import requests
import os
import time
from urllib.parse import urljoin, urlparse
from ratelimit import limits, sleep_and_retry
from requests.exceptions import RequestException

class WebScraper:
    """HTML scraper bound to one base URL.

    Every request goes through a single ``requests.Session`` that sends a
    browser-like User-Agent header, and responses are parsed with
    BeautifulSoup's built-in ``html.parser``.
    """

    def __init__(self, base_url, rate_limit=1):
        """Create the HTTP session for ``base_url``.

        Args:
            base_url: Root URL of the site; trailing slashes are stripped.
            rate_limit: Stored as ``self.rate_limit``.
                NOTE(review): nothing in this class reads the value; the
                throttle applied to ``get_soup`` is fixed by its decorator.
        """
        self.base_url = base_url.rstrip('/')
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
        self.rate_limit = rate_limit

    @sleep_and_retry
    @limits(calls=1, period=1)  # 1 call per second
    def get_soup(self, endpoint, timeout=10):
        """Fetch ``endpoint`` (joined onto the base URL) and parse it.

        Args:
            endpoint: Path or URL passed to ``urljoin`` with ``self.base_url``.
            timeout: Timeout in seconds handed to ``session.get``.

        Returns:
            A ``BeautifulSoup`` document, or ``None`` when the request raised
            a ``RequestException`` (including HTTP error statuses, because of
            ``raise_for_status``). The error is printed, not re-raised.
        """
        url = urljoin(self.base_url, endpoint)
        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except RequestException as e:
            print(f"Error fetching {url}: {str(e)}")
            return None

    def scrape_elements(self, endpoint, **kwargs):
        """Return the stripped text of elements matching a BeautifulSoup filter.

        Args:
            endpoint: Page to fetch via ``get_soup``.
            **kwargs: Filter arguments forwarded to ``soup.find_all`` /
                ``soup.find`` (e.g. ``name=``, ``class_=``, ``id=``).
                The special key ``find_all`` (default ``True``) selects
                between all matches and only the first match; it is consumed
                here and never forwarded.

        Returns:
            A list of strings; empty when the page could not be fetched or
            nothing matched.
        """
        # ``find_all`` is a control flag for this method, not a filter. Pop it
        # so BeautifulSoup does not treat it as an HTML attribute to match.
        want_all = kwargs.pop('find_all', True)

        soup = self.get_soup(endpoint)
        if not soup:
            return []

        if want_all:
            elements = soup.find_all(**kwargs)
        else:
            # ``find`` returns None when nothing matches; filtered out below.
            elements = [soup.find(**kwargs)]
        return [elem.text.strip() for elem in elements if elem]

    def scrape_by_tag(self, tag, endpoint):
        """Return the text of every ``tag`` element on ``endpoint``."""
        return self.scrape_elements(endpoint, name=tag)

    def scrape_by_class(self, class_name, endpoint):
        """Return the text of every element with CSS class ``class_name``."""
        return self.scrape_elements(endpoint, class_=class_name)

    def scrape_by_id(self, id_name, endpoint):
        """Return a list holding the text of the first element with id ``id_name``."""
        return self.scrape_elements(endpoint, id=id_name, find_all=False)

    def save_to_file(self, content, filename):
        """Write each item of ``content`` to ``filename``, separated by a rule.

        Parent directories are created when ``filename`` contains any. The
        file is overwritten and encoded as UTF-8.
        """
        directory = os.path.dirname(filename)
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            for item in content:
                f.write(f"{item}\n{'='*80}\n")

class SiteMapper:
    """Discovers same-domain paths by recursively following ``<a href>`` links."""

    def __init__(self, scraper):
        """
        Args:
            scraper: Object exposing ``base_url`` and ``get_soup(path)``.
        """
        self.scraper = scraper
        # Paths already fetched (or attempted) during this mapper's lifetime.
        self.visited = set()
        self.domain = urlparse(scraper.base_url).netloc

    def is_valid_url(self, url):
        """Return True when ``url`` has the same network location as the base URL."""
        parsed = urlparse(url)
        return parsed.netloc == self.domain

    def map_site(self, start_path, max_depth=2):
        """Collect paths reachable from ``start_path`` within ``max_depth`` hops.

        Args:
            start_path: Path to fetch first (e.g. ``"/"``).
            max_depth: Remaining recursion budget; ``<= 0`` stops immediately.

        Returns:
            A set of URL paths. It contains ``start_path`` itself (even when
            fetching it failed) plus every same-domain path found below it.
            Returns an empty set when the depth budget is spent or the path
            was already visited. Query strings and fragments are dropped
            because only the ``path`` component is kept.
        """
        if max_depth <= 0 or start_path in self.visited:
            return set()

        self.visited.add(start_path)
        soup = self.scraper.get_soup(start_path)
        if not soup:
            return {start_path}

        endpoints = {start_path}

        # Relative links must be resolved against the page they appear on,
        # not the site root: "guide.html" on /docs/index.html is
        # /docs/guide.html, not /guide.html.
        page_url = urljoin(self.scraper.base_url, start_path)

        for link in soup.find_all('a', href=True):
            href = link['href']
            # Skip empty links, in-page anchors and non-HTTP schemes.
            if not href or href.startswith(('#', 'javascript:', 'mailto:', 'tel:')):
                continue

            full_url = urljoin(page_url, href)
            if self.is_valid_url(full_url):
                path = urlparse(full_url).path or '/'
                if path not in self.visited:
                    endpoints.update(self.map_site(path, max_depth - 1))

        return endpoints

def scrape_full_site(base_url, max_depth=2, output_dir="data"):
    """Map a site, then save paragraph, ``.content`` and ``#main`` text per page.

    Args:
        base_url: Root URL of the site to crawl.
        max_depth: Link-following depth handed to ``SiteMapper.map_site``.
        output_dir: Root folder for the output; files are written to
            ``<output_dir>/paragraphs``, ``<output_dir>/content`` and
            ``<output_dir>/main``. Defaults to ``"data"``.

    Each page is written as ``<endpoint with '/' replaced by '_'>.txt``
    (``home.txt`` for the root). Errors on one page are printed and the loop
    moves on to the next endpoint.
    """
    scraper = WebScraper(base_url)
    mapper = SiteMapper(scraper)

    print(f"Starting to scrape {base_url}")
    endpoints = mapper.map_site("/", max_depth=max_depth)
    print(f"Found {len(endpoints)} endpoints")

    for endpoint in endpoints:
        print(f"\nScraping: {endpoint}")
        safe_endpoint = endpoint.replace('/', '_').lstrip('_') or 'home'

        try:
            # Fetch the page once and pull all three selections from the same
            # parsed document, rather than issuing one throttled request per
            # selection.
            soup = scraper.get_soup(endpoint)
            if not soup:
                continue

            main_element = soup.find(id="main")
            extracted = {
                "paragraphs": [p.text.strip() for p in soup.find_all("p")],
                "content": [div.text.strip() for div in soup.find_all(class_="content")],
                "main": [main_element.text.strip()] if main_element else [],
            }

            # Only create a file when the selection actually matched something.
            for folder, texts in extracted.items():
                if texts:
                    scraper.save_to_file(
                        texts, f"{output_dir}/{folder}/{safe_endpoint}.txt"
                    )

        except Exception as e:
            print(f"Error processing {endpoint}: {str(e)}")
            continue

# Crawl the target site (following links up to 10 levels deep) and save the results.
BASE_URL = "https://iiitkottayam.ac.in/"
scrape_full_site(base_url=BASE_URL, max_depth=10)

Starting to scrape https://iiitkottayam.ac.in/
Found 5 endpoints

Scraping: /data/pdf/cvo_new.pdf

Scraping: /data/pdf/OM-formation of SC-ST-OBC Cell-Jan2024.pdf

Scraping: /data/pdf/recruiterscorner.pdf

Scraping: /data/pdf/OM-Disciplinary Action Committee-revised-Jan2024.pdf

Scraping: /
