In [1]:
import pandas as pd
import os
import requests
import time

  from pandas.core import (


In [None]:
# Replace these with your actual handle and app password
BLUESKY_HANDLE = "jackyhe.bsky.social"
BLUESKY_APP_PASSWORD = "#"

# Create a session
response = requests.post(
    "https://bsky.social/xrpc/com.atproto.server.createSession",
    json={"identifier": BLUESKY_HANDLE, "password": BLUESKY_APP_PASSWORD},
)
response.raise_for_status()
session = response.json()
access_token = session["accessJwt"]

headers = {
    "Authorization": f"Bearer {access_token}"
}

# Example: Fetch posts
response = requests.get(
    "https://bsky.social/xrpc/app.bsky.feed.getTimeline",
    headers=headers
)

refresh_token = session["refreshJwt"]

# Refresh the session
refresh_response = requests.post(
    "https://bsky.social/xrpc/com.atproto.server.refreshSession",
    headers={"Authorization": f"Bearer {refresh_token}"}
)
refresh_response.raise_for_status()
new_session = refresh_response.json()
new_access_token = new_session["accessJwt"]

In [None]:
# HurtLex is a lexicon of offensive, aggressive, and hateful words. https://github.com/valeriobasile/hurtlex
# Load HurtLex English lexicon
hurtlex_path = os.path.join('hurtlex_EN.tsv')
hurtlex_df = pd.read_csv(hurtlex_path, sep='\t')

# Filter for 'conservative' level if desired
hurtlex_keywords = set(hurtlex_df[hurtlex_df['level'] == 'conservative']['lemma'].str.lower())

In [4]:
# Function to label posts
def label_post(text):
    if pd.isnull(text):
        return 'Not Hate Speech'
    text_lower = text.lower()
    for keyword in hurtlex_keywords:
        if keyword in text_lower:
            return 'Hate Speech'
    return 'Not Hate Speech'

In [5]:
def fetch_posts(query, limit=100, cursor=None):
    url = "https://api.bsky.app/xrpc/app.bsky.feed.searchPosts"
    params = {
        "q": query,
        "limit": limit
    }
    if cursor:
        params["cursor"] = cursor
    try:
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"Error fetching posts: {e}")
        return None

In [6]:
def collect_and_label_posts(target_count_per_label=5000):
    queries = ["the", "and", "to", "of", "a"]  # Common words to fetch diverse posts
    collected_posts = []
    label_counts = {"Hate Speech": 0, "Not Hate Speech": 0}

    for query in queries:
        cursor = None
        while True:
            data = fetch_posts(query, limit=100, cursor=cursor)
            if not data or "posts" not in data:
                break

            for post in data["posts"]:
                text = post.get("record", {}).get("text", "")
                uri = post.get("uri", "")
                if not text or not uri:
                    continue

                label = label_post(text)
                if label and label_counts[label] < target_count_per_label:
                    post_url = f"https://bsky.app/profile/{uri.split('/')[2]}/post/{uri.split('/')[-1]}"
                    collected_posts.append({"post_url": post_url, "label": label})
                    label_counts[label] += 1

                    # Print progress
                    print(f"Collected {len(collected_posts)} posts: "
                          f"Hate Speech: {label_counts['Hate Speech']}, "
                          f"Not Hate Speech: {label_counts['Not Hate Speech']}")

                if all(count >= target_count_per_label for count in label_counts.values()):
                    break

            if all(count >= target_count_per_label for count in label_counts.values()):
                break

            cursor = data.get("cursor")
            if not cursor:
                break
            time.sleep(1)  # Respectful delay to avoid rate limiting

    print(f"Final counts - Total posts: {len(collected_posts)}, "
          f"Hate Speech: {label_counts['Hate Speech']}, "
          f"Not Hate Speech: {label_counts['Not Hate Speech']}")
    return collected_posts

In [7]:
# Execute the data collection
dataset = collect_and_label_posts()

Collected 1 posts: Hate Speech: 1, Not Hate Speech: 0
Collected 2 posts: Hate Speech: 1, Not Hate Speech: 1
Collected 3 posts: Hate Speech: 2, Not Hate Speech: 1
Collected 4 posts: Hate Speech: 3, Not Hate Speech: 1
Collected 5 posts: Hate Speech: 4, Not Hate Speech: 1
Collected 6 posts: Hate Speech: 5, Not Hate Speech: 1
Collected 7 posts: Hate Speech: 6, Not Hate Speech: 1
Collected 8 posts: Hate Speech: 7, Not Hate Speech: 1
Collected 9 posts: Hate Speech: 8, Not Hate Speech: 1
Collected 10 posts: Hate Speech: 9, Not Hate Speech: 1
Collected 11 posts: Hate Speech: 10, Not Hate Speech: 1
Collected 12 posts: Hate Speech: 11, Not Hate Speech: 1
Collected 13 posts: Hate Speech: 12, Not Hate Speech: 1
Collected 14 posts: Hate Speech: 13, Not Hate Speech: 1
Collected 15 posts: Hate Speech: 14, Not Hate Speech: 1
Collected 16 posts: Hate Speech: 15, Not Hate Speech: 1
Collected 17 posts: Hate Speech: 16, Not Hate Speech: 1
Collected 18 posts: Hate Speech: 17, Not Hate Speech: 1
Collected 1

In [8]:
# Save to CSV
df = pd.DataFrame(dataset)
df.to_csv("test-data/bluesky_hate_speech_dataset.csv", index=False)
print("Dataset saved to 'bluesky_hate_speech_dataset.csv'")

Dataset saved to 'bluesky_hate_speech_dataset.csv'
