## BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Scrape TechCrunch with requests + BeautifulSoup: collect article links from
# the homepage, fetch each linked article, and save title/url/content to CSV.

# TechCrunch Tech News URL
TECHCRUNCH_URL = "https://techcrunch.com/"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
MAX_ARTICLES = 500    # upper bound on the number of article pages fetched
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests can wait forever

start_time = time.time()

# Request the TechCrunch homepage. Fail loudly on an HTTP error rather than
# parsing an error page and silently producing an empty CSV.
response = requests.get(TECHCRUNCH_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find article links: same-site URLs containing "/20" (presumably targeting
# the /20YY/ date segment of article URLs -- a heuristic, not an exact match).
# dict.fromkeys removes duplicates in O(n) while preserving page order.
articles = list(dict.fromkeys(
    link["href"]
    for link in soup.find_all("a", href=True)
    if link["href"].startswith("https://techcrunch.com/") and "/20" in link["href"]
))

# Scrape article details (at most MAX_ARTICLES pages)
data = []
for article_url in articles[:MAX_ARTICLES]:
    try:
        article_response = requests.get(
            article_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        # A 4xx/5xx response raises here, so the article is reported and
        # skipped below instead of being saved as if it were real content.
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, "html.parser")

        # Look the heading up once and reuse the result.
        heading = article_soup.find("h1")
        title = heading.get_text(strip=True) if heading is not None else "No title"
        content = " ".join(p.get_text(strip=True) for p in article_soup.find_all("p"))

        data.append({"title": title, "url": article_url, "content": content})
    except Exception as e:
        # Best effort: one bad article must not abort the whole run.
        print(f"Skipping {article_url}: {e}")
        continue

# Save to CSV
with open("techcrunch_articles.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "url", "content"])
    writer.writeheader()
    writer.writerows(data)

end_time = time.time()
print(f"Scraping completed: {len(data)} articles saved in {end_time - start_time:.2f} seconds from TechCrunch.")


Scraping completed: 45 articles saved in 20.34 seconds from TechCrunch.


## MechanicalSoup

In [2]:
import mechanicalsoup
import time
import csv

# Scrape TechCrunch with MechanicalSoup: collect article links from the
# homepage, open each linked article, and save title/url/content to CSV.

# Initialize the browser
browser = mechanicalsoup.StatefulBrowser(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# TechCrunch Tech News URL
TECHCRUNCH_URL = "https://techcrunch.com/"
MAX_ARTICLES = 500    # upper bound on the number of article pages fetched
REQUEST_TIMEOUT = 10  # seconds; forwarded to the underlying requests call

# Start timing
start_time = time.time()

# Request the TechCrunch homepage. Fail loudly on an HTTP error rather than
# parsing an error page and silently producing an empty CSV.
homepage_response = browser.open(TECHCRUNCH_URL, timeout=REQUEST_TIMEOUT)
homepage_response.raise_for_status()

# Find article links: hrefs matching the regex '/20' (presumably targeting the
# /20YY/ date segment of article URLs -- it matches any URL containing "/20",
# not only the current year). Hrefs are resolved against the homepage URL now,
# while the homepage is still the browser's current page, so relative links
# can be opened later. dict.fromkeys de-duplicates while keeping page order,
# which makes the capped slice below reproducible (a set has arbitrary order).
article_links = list(dict.fromkeys(
    browser.absolute_url(link.get('href'))
    for link in browser.links(url_regex='/20')
))
# Keep same-site URLs only, matching the filter used by the BeautifulSoup cell
# so both scrapers are compared on the same set of pages.
article_links = [url for url in article_links if url.startswith(TECHCRUNCH_URL)]

# Scrape article details
data = []
for article_url in article_links[:MAX_ARTICLES]:  # Limit to MAX_ARTICLES articles
    try:
        # Open the article page; a 4xx/5xx response raises so the article is
        # reported and skipped below instead of being saved as real content.
        article_response = browser.open(article_url, timeout=REQUEST_TIMEOUT)
        article_response.raise_for_status()

        # Extract the article title and content (look the heading up once)
        page = browser.page
        heading = page.find('h1')
        title = heading.get_text(strip=True) if heading is not None else 'No title'
        content = ' '.join(p.get_text(strip=True) for p in page.find_all('p'))

        # Save the data
        data.append({
            'title': title,
            'url': article_url,
            'content': content
        })
    except Exception as e:
        # Best effort: one bad article must not abort the whole run.
        print(f"Skipping {article_url}: {e}")
        continue

# Release the browser's underlying HTTP session now that fetching is done
browser.close()

# Save to CSV
with open("techcrunch_articles_mechanicalsoup.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["title", "url", "content"])
    writer.writeheader()
    writer.writerows(data)

# End timing
end_time = time.time()

# Print completion message
print(f"Scraping completed using MechanicalSoup: {len(data)} articles saved in {end_time - start_time:.2f} seconds.")

Scraping completed using MechanicalSoup: 45 articles saved in 7.55 seconds.


| Scraping Method      | Requested Articles | Articles Scraped | Time Taken (Seconds) | Advantages                                                                                   | Limitations                                                            | Notes                                 |
|----------------------|--------------------|------------------|----------------------|---------------------------------------------------------------------------------------------|------------------------------------------------------------------------|---------------------------------------|
| **MechanicalSoup**    | 500                | 44               | 2.26                 | - Simpler setup and easy to use.<br>- Lightweight and fast for small-scale scraping.<br>- Works well for simple HTML scraping.<br>- Easy to integrate with other Python libraries. | - Scraped only 44 articles despite the 500-article cap, because the script only follows links found on the homepage (no pagination).<br>- Slower in scraping large datasets.<br>- Doesn't handle JavaScript-heavy sites.<br>- Lacks advanced features like concurrency.<br>- Limited error handling. | Only 44 articles were reachable from the homepage links, so the 500-article cap was never hit. These figures differ from the cell output above (45 articles in 7.55 s), presumably because they come from a different run. |
| **Scrapy**            | 500                | 500              | 4.76                 | - Robust and fast for large-scale scraping.<br>- Handles multiple requests concurrently.<br>- Can scrape JavaScript-heavy sites with additional setup.<br>- Built-in support for pagination and follow-up requests.<br>- Handles data extraction and storage efficiently.<br>- Advanced error handling and retry mechanisms. | - More complex setup.<br>- Requires more resources and memory.<br>- Might need extra configurations for certain tasks.<br>- Steeper learning curve.<br>- Slower to set up for small scraping tasks.<br>- Higher resource consumption for small-scale projects. | Scraped 500 articles as requested.   |
