In [7]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
from bs4 import BeautifulSoup

# Forum index page that the scraper starts from
url = "https://debatepolitics.com/forums/2024-us-presidential-election.227/"

# Send a desktop-browser User-Agent with every request (avoid bot detection).
# The value is split across adjacent string literals purely for readability;
# Python joins them into one string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

### Scraping a Single Page

In [12]:
# Send a GET request. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the forum listing.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# Parse the HTML. Every selector below needs `soup`; it has to be created
# here so the cell runs on a fresh kernel.
soup = BeautifulSoup(response.text, "html.parser")

# Each thread-title anchor carries both the title text and the relative URL,
# so select the anchors once and reuse them.
title_links = soup.select(".structItem-title a")

# Extract Titles
titles = [link.get_text(strip=True) for link in title_links]

# Extract URLs (prepend with site domain)
urls = ["https://debatepolitics.com" + link["href"] for link in title_links]

# Extract Replies
replies = [reply.get_text(strip=True) for reply in soup.select(".structItem-cell.structItem-cell--meta dl:nth-of-type(1) dd")]

# Extract Views
views = [view.get_text(strip=True) for view in soup.select(".structItem-cell.structItem-cell--meta dl:nth-of-type(2) dd")]

# Print Results. zip() stops at the shortest list, so a row that lacks a
# replies/views cell can no longer raise IndexError part-way through.
for title, thread_url, reply_count, view_count in zip(titles, urls, replies, views):
    print(f"Title: {title}")
    print(f"URL: {thread_url}")
    print(f"Replies: {reply_count}")
    print(f"Views: {view_count}")
    print("-" * 50)

Title: For some Latinos, ‘prosperity gospel’ led them to Trump
URL: https://debatepolitics.com/threads/for-some-latinos-%E2%80%98prosperity-gospel%E2%80%99-led-them-to-trump.557926/
Replies: 48
Views: 2K
--------------------------------------------------
Title: How the GOP is on Self-Destruct
URL: https://debatepolitics.com/threads/how-the-gop-is-on-self-destruct.557954/
Replies: 2
Views: 174
--------------------------------------------------
Title: Kamala's Been On A Drinking Binge Since Losing
URL: https://debatepolitics.com/threads/kamalas-been-on-a-drinking-binge-since-losing.556181/
Replies: 187
Views: 2K
--------------------------------------------------
Title: The Victory
URL: https://debatepolitics.com/threads/the-victory.554963/
Replies: 182
Views: 2K
--------------------------------------------------
Title: It Must Really Suck For You People
URL: https://debatepolitics.com/threads/it-must-really-suck-for-you-people.557448/
Replies: 290
Views: 2K
------------------------------

### Putting It All Together with Pagination

In [15]:
# `requests` and `BeautifulSoup` are expected to be imported already by the
# notebook's import cell.
import time  # To add delays between requests
from urllib.parse import urljoin  # Resolves relative hrefs against the site root

# Base URL of the forum
BASE_URL = "https://debatepolitics.com"

# Seconds to wait between page requests, to avoid overloading the server
REQUEST_DELAY_SECONDS = 2

# Headers to avoid bot detection. They are identical for every page, so they
# are built once here rather than on every loop iteration.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Start scraping from the first page
url = BASE_URL + "/forums/2024-us-presidential-election.227/"
all_data = []  # List to store all scraped data (one dict per thread)

# `url` is set to None once there is no "next page" link, which ends the loop.
while url:
    print(f"Scraping: {url}")  # Display current page being scraped

    # Fetch the webpage; fail loudly on HTTP errors instead of silently
    # parsing an error page, and never hang indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract title, URL, replies and views per thread row rather than as
    # four independent page-wide lists: a row without a replies/views cell
    # would otherwise shift every following value or raise IndexError.
    # NOTE(review): assumes each thread row is wrapped in an element with the
    # class `structItem` - confirm against the live page markup.
    for row in soup.select(".structItem"):
        link = row.select_one(".structItem-title a")
        if link is None or not link.get("href"):
            continue  # Not a thread row we can identify; skip it

        replies_tag = row.select_one(".structItem-cell.structItem-cell--meta dl:nth-of-type(1) dd")
        views_tag = row.select_one(".structItem-cell.structItem-cell--meta dl:nth-of-type(2) dd")

        all_data.append({
            "title": link.get_text(strip=True),
            # urljoin handles both site-relative and absolute hrefs
            "url": urljoin(BASE_URL, link["href"]),
            # None (an empty CSV field) when the row has no such cell
            "replies": replies_tag.get_text(strip=True) if replies_tag else None,
            "views": views_tag.get_text(strip=True) if views_tag else None,
        })

    # Find "Next Page" button
    next_page = soup.select_one(".pageNav .pageNav-jump--next")
    next_href = next_page.get("href") if next_page else None

    if next_href:
        url = urljoin(BASE_URL, next_href)
        time.sleep(REQUEST_DELAY_SECONDS)  # Short delay between requests
    else:
        url = None  # No more pages, stop the loop

# Print collected data
for post in all_data[:5]:  # Print first 5 posts as a preview
    print(post)

Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-2
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-3
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-4
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-5
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-6
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-7
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-8
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-9
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-10
Scraping: https://debatepolitics.com/forums/2024-us-presidential-election.227/page-11
Scraping: https://debatepolitics.com/forums/2024-us-presidential-elec

### Save Data to CSV

In [24]:
from pathlib import Path

import pandas as pd

# Where the CSV is written. The directory is derived from the current user's
# home folder instead of a hard-coded, user-specific absolute path, so the
# notebook also runs on other machines; change OUTPUT_DIR to save elsewhere.
OUTPUT_DIR = Path.home() / "Desktop"
output_path = OUTPUT_DIR / "debatepolitics_posts_full.csv"

# Save data to CSV (one row per scraped thread, no index column)
df = pd.DataFrame(all_data)
df.to_csv(output_path, index=False)

# Report the path that was actually written, so the message cannot drift
# out of sync with the real file name.
print(f"Data saved to {output_path}")

Data saved to debatepolitics_forum_posts.csv
