In [7]:
import requests
from bs4 import BeautifulSoup
import os
import time
from urllib.parse import urljoin, urlparse
import json

class WebScraper:
    """Fetch web pages over a shared HTTP session and store them on disk.

    A page can be rendered as prettified HTML, plain text, or a JSON summary
    (title, meta description, headings, links, text).

    Instance fields:
        session: ``requests.Session`` reused for every request; it carries a
            browser-like ``User-Agent`` header.
        delay: seconds to pause at the end of every ``scrape_and_save`` call.
        timeout: timeout in seconds passed to ``session.get``.
    """

    def __init__(self, delay=1, timeout=10):
        """
        Create the scraper and its HTTP session.

        Args:
            delay (float): Seconds to sleep after each ``scrape_and_save`` call.
            timeout (float): Timeout in seconds for each HTTP request.
        """
        self.session = requests.Session()
        self.delay = delay
        self.timeout = timeout
        # Set a user agent to avoid being blocked
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def _structured_content(self, soup, url):
        """Build the dict behind the 'json' format from an already parsed page."""
        # ``title.string`` is None when <title> is empty or has nested tags;
        # fall back to '' so the field is always a string.
        title = (soup.title.string or '') if soup.title else ''

        meta_tag = soup.find('meta', attrs={'name': 'description'})
        meta_description = meta_tag.get('content', '') if meta_tag else ''

        # Headings are grouped by level (every h1, then every h2, ...),
        # not listed in document order.
        headings = [
            {'level': level, 'text': tag.get_text(strip=True)}
            for level in range(1, 7)
            for tag in soup.find_all(f'h{level}')
        ]

        # Relative hrefs are resolved against the page URL.
        links = [
            {'text': anchor.get_text(strip=True), 'url': urljoin(url, anchor['href'])}
            for anchor in soup.find_all('a', href=True)
        ]

        return {
            'title': title,
            'meta_description': meta_description,
            'headings': headings,
            'links': links,
            'text_content': soup.get_text(strip=True)
        }

    def _default_filename(self, url, output_format):
        """Derive ``scraped_<domain>_<unix-timestamp><ext>`` from the URL."""
        domain = urlparse(url).netloc
        # Strip only a leading "www." (a blanket replace would also mangle
        # hosts that merely contain that substring).
        if domain.startswith('www.'):
            domain = domain[len('www.'):]
        # A port separator is not a valid filename character on every platform.
        domain = domain.replace(':', '_')

        extensions = {'html': '.html', 'text': '.txt', 'json': '.json'}
        ext = extensions.get(output_format, '.html')
        return f"scraped_{domain}_{int(time.time())}{ext}"

    def scrape_url(self, url, output_format='html'):
        """
        Scrape content from a URL

        All <header> elements are removed from the page before any content
        is extracted.

        Args:
            url (str): The URL to scrape
            output_format (str): 'html', 'text', or 'json'; any other value
                is treated as 'html'

        Returns:
            dict: On success: 'status' ('success'), 'url', 'status_code',
                'content' (str) and 'content_length' (len of 'content').
                On a requests error: 'status' ('error'), 'url', 'error'
                (message) and 'content' (None).
        """
        try:
            print(f"Scraping: {url}")
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            # Parse with BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove header tags and their content
            for header in soup.find_all('header'):
                header.decompose()

            # Extract different types of content based on format
            if output_format == 'text':
                content = soup.get_text(strip=True, separator='\n')
            elif output_format == 'json':
                content = json.dumps(
                    self._structured_content(soup, url),
                    indent=2,
                    ensure_ascii=False
                )
            else:  # html
                content = str(soup.prettify())

            return {
                'status': 'success',
                'url': url,
                'status_code': response.status_code,
                'content': content,
                'content_length': len(content)
            }

        except requests.exceptions.RequestException as e:
            return {
                'status': 'error',
                'url': url,
                'error': str(e),
                'content': None
            }

    def save_to_file(self, content, filename, encoding='utf-8'):
        """
        Save content to file

        Args:
            content (str): Text to write
            filename (str): Destination path; missing parent directories
                are created
            encoding (str): Text encoding used for the file

        Returns:
            bool: True when the file was written, False when any error
                occurred (the error is printed, not raised)
        """
        try:
            # Create directory if it doesn't exist
            directory = os.path.dirname(filename)
            if directory:
                os.makedirs(directory, exist_ok=True)

            with open(filename, 'w', encoding=encoding) as f:
                f.write(content)

            print(f"Content saved to: {filename}")
            return True

        except Exception as e:
            # Deliberately best-effort: report the problem and signal failure
            # through the return value.
            print(f"Error saving file: {e}")
            return False

    def scrape_and_save(self, url, filename=None, output_format='html'):
        """
        Scrape URL and save to file

        Args:
            url (str): URL to scrape
            filename (str): Output filename (auto-generated if None)
            output_format (str): 'html', 'text', or 'json'

        Returns:
            str or None: The filename written, or None when scraping or
                saving failed
        """
        # Auto-generate filename if not provided
        if not filename:
            filename = self._default_filename(url, output_format)

        try:
            result = self.scrape_url(url, output_format)

            if result['status'] != 'success':
                print(f"Failed to scrape URL: {result['error']}")
                return None

            if not self.save_to_file(result['content'], filename):
                print("Failed to save content to file")
                return None

            print(f"Successfully scraped and saved {result['content_length']} characters")
            return filename
        finally:
            # Add delay to be respectful. It lives in ``finally`` so it runs
            # on every exit path; placed after the returns it never executed.
            if self.delay and self.delay > 0:
                time.sleep(self.delay)

In [4]:
def scrape(urls: list):
    """Save an HTML, a text and a JSON copy of every page in ``urls``.

    One ``WebScraper`` (created with ``delay=1``) is shared by all requests.
    Each URL is fetched once per output format, and the three resulting
    filenames are printed; a failed scrape or save shows up as ``None``.

    Args:
        urls (list): URLs to scrape, processed in order.
    """
    scraper = WebScraper(delay=1)
    output_formats = ('html', 'text', 'json')

    for url in urls:
        print(f"\n--- Scraping {url} ---")

        # One fetch-and-save per format, in the order html -> text -> json.
        saved = [
            scraper.scrape_and_save(url, output_format=fmt)
            for fmt in output_formats
        ]
        html_file, text_file, json_file = saved

        print(f"Files created: {html_file}, {text_file}, {json_file}")

In [8]:
# Pages to scrape in this run.
urls = [
    "https://hpu.ugm.ac.id/layanan-chatbot-lintang/",
    "https://cpmh.psikologi.ugm.ac.id/about/",
]
scrape(urls=urls)


--- Scraping https://hpu.ugm.ac.id/layanan-chatbot-lintang/ ---
Scraping: https://hpu.ugm.ac.id/layanan-chatbot-lintang/
Content saved to: scraped_hpu.ugm.ac.id_1753196112.html
Successfully scraped and saved 53048 characters
Scraping: https://hpu.ugm.ac.id/layanan-chatbot-lintang/
Content saved to: scraped_hpu.ugm.ac.id_1753196115.txt
Successfully scraped and saved 4573 characters
Scraping: https://hpu.ugm.ac.id/layanan-chatbot-lintang/
Content saved to: scraped_hpu.ugm.ac.id_1753196118.json
Successfully scraped and saved 6428 characters
Files created: scraped_hpu.ugm.ac.id_1753196112.html, scraped_hpu.ugm.ac.id_1753196115.txt, scraped_hpu.ugm.ac.id_1753196118.json

--- Scraping https://cpmh.psikologi.ugm.ac.id/about/ ---
Scraping: https://cpmh.psikologi.ugm.ac.id/about/
Content saved to: scraped_cpmh.psikologi.ugm.ac.id_1753196120.html
Successfully scraped and saved 32441 characters
Scraping: https://cpmh.psikologi.ugm.ac.id/about/
Content saved to: scraped_cpmh.psikologi.ugm.ac.id_1