<a href="https://colab.research.google.com/github/eg-jamessmith/ea-forum-analysis/blob/main/ea_forum.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collecting Info from the EA Forum for Modelling and Experiments

In [37]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.colab import drive

# Extract Posts from the EA Forum's AI Safety Topic Page

In [38]:
# Mount Google Drive; the posts CSV is read from (and later written back to) it.
drive.mount('/content/drive')
file_path = "/content/drive/My Drive/coding_projects/ea-forum/ea-forum_posts.csv"

# Load the posts saved by earlier runs. On the very first run the CSV does not
# exist yet, so fall back to an empty frame that already carries the columns
# this notebook writes. A column-less `pd.DataFrame()` would make any later
# `df_existing["Link"]` lookup raise a KeyError.
try:
    df_existing = pd.read_csv(file_path)
except FileNotFoundError:
    df_existing = pd.DataFrame(columns=["Title", "Link", "authors", "post_text"])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
# Topic page listing the AI safety posts
URL = 'https://forum.effectivealtruism.org/topics/ai-safety?tab=posts'

# Fetch the page. The timeout stops a stalled connection from hanging the cell,
# and raise_for_status() stops us from parsing an HTTP error page as if it were
# the post listing.
response = requests.get(URL, timeout=30)
response.raise_for_status()
content = response.content

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(content, 'html.parser')

# Keep anchors that have an href and a text string, and whose href points at a
# post. `string=True` replaces the deprecated `text=True` argument.
posts = soup.find_all('a', href=True, string=True)
filtered_posts = [post for post in posts if "/posts/" in post['href']]
post_details = [
    (post.get_text(), 'https://forum.effectivealtruism.org' + post['href'])
    for post in filtered_posts
]

# One row per post: the same link can be matched more than once on the page,
# so keep only the first occurrence of each Link.
df_posts = (
    pd.DataFrame(post_details, columns=["Title", "Link"])
    .drop_duplicates(subset="Link")
    .reset_index(drop=True)
)

# Show a few of the collected links as a sanity check. The sample size is
# capped at the number of rows so the cell still runs when fewer than five
# posts were found.
df_posts.sample(min(5, len(df_posts))).Link.values

  posts = soup.find_all('a', href=True, text=True)


array(['https://forum.effectivealtruism.org/posts/y7pCAoghcNKhhufCS/ai-pause-governance-advocacy-might-be-net-negative',
       'https://forum.effectivealtruism.org/posts/ggSXcuMzRaowDbKTz/possible-divergence-in-agi-risk-tolerance-between-selfish',
       'https://forum.effectivealtruism.org/posts/6SvZPHAvhT5dtqefF/debate-series-should-we-push-for-a-pause-on-the-development',
       'https://forum.effectivealtruism.org/posts/pNhc3jensyBY4Hz6u/panel-discussion-on-ai-consciousness-with-rob-long-and-jeff',
       'https://forum.effectivealtruism.org/posts/cKbehBhq7NxTq3pck/a-case-study-of-regulation-done-well-canadian-biorisk'],
      dtype=object)

# Extract Authors and Content From Posts

In [42]:
# Links saved by previous runs. `df_existing` may be a column-less empty frame
# (first run, no CSV yet), in which case nothing is known.
if "Link" in df_existing.columns:
    known_links = set(df_existing["Link"])
else:
    known_links = set()

# Only posts that are not in the saved data need to be fetched. Selecting them
# up front keeps the collected authors/text aligned with the rows they belong
# to; assigning them onto the full `df_posts` fails with a length mismatch as
# soon as some posts are already saved.
df_new = df_posts[~df_posts["Link"].isin(known_links)].drop_duplicates(subset="Link")

authors = []
post_text = []
fetched_positions = []  # positions in df_new whose page was fetched successfully

for position, post_link in enumerate(df_new["Link"]):

    # Fetch the post page. A failed request skips this post instead of saving
    # an error page as its content; it is still absent from the CSV, so it is
    # retried on the next run.
    try:
        response = requests.get(post_link, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"Skipping {post_link}: {err}")
        continue

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the author's name from the author link, if one is present
    author_tag = soup.find('a', class_='UsersNameDisplay-noColor', href=True)
    author_name = author_tag.get_text().strip() if author_tag else "Not Found"

    # Extract the post's content (assuming the content is enclosed within <p> tags)
    post_content = "\n\n".join(para.get_text().strip() for para in soup.find_all('p'))

    fetched_positions.append(position)
    authors.append(author_name)
    post_text.append(post_content)

# don't do anything if there's no new posts
if len(authors) > 0:

    # Attach the scraped fields to the fetched rows only. `df_posts` itself is
    # left untouched, so re-running this cell is safe.
    df_fetched = df_new.iloc[fetched_positions].assign(
        authors=authors,
        post_text=post_text,
    )

    # Append just the new rows, so posts already in the CSV are not duplicated.
    df_combined = pd.concat([df_existing, df_fetched], ignore_index=True)
    df_combined.to_csv(file_path, index=False)