In [None]:
import requests
import bs4
from typing import List, Optional
from urllib.parse import urlparse
import logging

In [None]:
class Scraper:
    """Base class for web scraping implementations.

    Holds a shared ``requests.Session`` and the most recently parsed page
    (``self.soup``), which the ``get_elements_*`` helpers query.
    """

    # Seconds to wait on the server before giving up. requests applies no
    # timeout unless one is passed, so without this a stalled server would
    # block the caller indefinitely.
    DEFAULT_TIMEOUT = 10.0

    def __init__(self, base_url: str):
        """
        Initialize the scraper with a base URL

        Args:
            base_url (str): The base URL for the scraper
        """
        self.base_url = base_url
        self.session = requests.Session()
        self.soup = None
        self._setup_logging()

    def _setup_logging(self):
        """Configure logging for the scraper"""
        # basicConfig only configures the root logger when it has no handlers
        # yet, so calling it once per instance is harmless.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(self.__class__.__name__)

    def _validate_url(self, url: str) -> bool:
        """
        Validate if the URL is properly formatted

        A URL is considered valid when it has both a scheme and a network
        location (e.g. ``https://example.com``).

        Args:
            url (str): URL to validate

        Returns:
            bool: True if valid, False otherwise
        """
        try:
            result = urlparse(url)
        except (ValueError, TypeError, AttributeError) as e:
            # urlparse raises ValueError for malformed netlocs and
            # TypeError/AttributeError for arguments that are not str/bytes.
            self.logger.error("Invalid URL: %s", e)
            return False
        return bool(result.scheme and result.netloc)

    def fetch_page(
        self, url: str, timeout: Optional[float] = None
    ) -> Optional[bs4.BeautifulSoup]:
        """
        Fetch and parse a webpage

        On any failure the previously parsed page is discarded, so the
        ``get_elements_*`` helpers never return content from an older page.

        Args:
            url (str): URL to fetch
            timeout (Optional[float]): Seconds to wait for the server;
                falls back to ``DEFAULT_TIMEOUT`` when omitted

        Returns:
            Optional[BeautifulSoup]: Parsed page or None if failed
        """
        # Reset first: a failed fetch must not leave stale content behind.
        self.soup = None

        if not self._validate_url(url):
            self.logger.warning("Refusing to fetch malformed URL: %r", url)
            return None

        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT

        try:
            response = self.session.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error("Failed to fetch page: %s", e)
            return None
        else:
            self.soup = bs4.BeautifulSoup(response.text, 'lxml')
            return self.soup

    def get_elements_by_tag(self, tag: str) -> List[bs4.element.Tag]:
        """
        Get all elements of a specific HTML tag

        Args:
            tag (str): HTML tag to search for

        Returns:
            List[Tag]: List of matching elements (empty if no page is loaded)
        """
        if self.soup is None:
            self.logger.warning("No page has been fetched yet")
            return []
        return self.soup.find_all(tag)

    def get_elements_by_selector(self, selector: str) -> List[bs4.element.Tag]:
        """
        Get all elements matching a CSS selector

        Args:
            selector (str): CSS selector to search for

        Returns:
            List[Tag]: List of matching elements (empty if no page is loaded)
        """
        if self.soup is None:
            self.logger.warning("No page has been fetched yet")
            return []
        return self.soup.select(selector)




In [None]:
class WikiScraper(Scraper):
    """Specialized scraper for Wikipedia pages"""

    def __init__(self):
        """Initialize the scraper with the Wikipedia base URL"""
        super().__init__("https://wikipedia.org")

    def get_page_title(self) -> Optional[str]:
        """
        Get the main title of the Wikipedia page

        The text of the first ``<title>`` element is returned with the
        " - Wikipedia" marker removed.

        Returns:
            Optional[str]: Page title or None if not found
        """
        title_elems = self.get_elements_by_selector("title")
        if not title_elems:
            return None
        return title_elems[0].getText().replace(" - Wikipedia", "")

    def get_main_heading(self) -> Optional[str]:
        """
        Get the main heading (h1) of the article

        Returns:
            Optional[str]: Main heading or None if not found
        """
        headings = self.get_elements_by_selector("h1")
        if not headings:
            return None
        return headings[0].getText()

    def get_table_of_contents(self) -> List[str]:
        """
        Get all table of contents entries

        Returns:
            List[str]: Stripped text of every ``li`` under ``div#toc ul``
        """
        toc = self.get_elements_by_selector("div#toc ul li")
        return [item.getText().strip() for item in toc]

    def get_paragraphs(self, limit: Optional[int] = None) -> List[str]:
        """
        Get main article paragraphs

        Paragraphs whose text is empty after stripping are skipped.

        Args:
            limit (Optional[int]): Maximum number of paragraphs to return;
                None returns all of them, 0 returns an empty list

        Returns:
            List[str]: List of paragraph texts
        """
        paragraphs = self.get_elements_by_selector("div#mw-content-text p")
        # Extract and strip each paragraph once, then drop the empty ones.
        stripped = (p.getText().strip() for p in paragraphs)
        texts = [text for text in stripped if text]
        # Compare against None explicitly: ``limit=0`` is falsy and would
        # otherwise be treated as "no limit" and return every paragraph.
        return texts if limit is None else texts[:limit]