# Data Exploration: BluePrint and Reddit Moderator Perceptions

This notebook explores the two datasets used for simulating social media moderation policies:
1. BluePrint: Human-AI social interactions dataset
2. Reddit Moderator Perceptions: Moderator interpretations of toxic content

In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. BluePrint Dataset

In [3]:
# Load BluePrint dataset from HuggingFace
# Available configurations: "2_clusters", "100_clusters", "1000_clusters"
# Start with 100_clusters as a medium-sized dataset

blueprint = load_dataset("ComplexDataLab/BluePrint", "100_clusters")
print("Dataset splits:", blueprint.keys())
print("\nDataset info:")
print(blueprint)

Resolving data files:   0%|          | 0/100 [00:00<?, ?it/s]

Dataset splits: dict_keys(['full'])

Dataset info:
DatasetDict({
    full: Dataset({
        features: ['thread', 'cluster_id'],
        num_rows: 6828041
    })
})


In [4]:
# Examine thread structure
ds = blueprint['full']
print("Thread Structure")
first_thread = ds[0]['thread']
print(f"Type: {type(first_thread)}")

if isinstance(first_thread, list) and len(first_thread) > 0:
    print(f"\nFirst message in thread:")
    print(f"Type: {type(first_thread[0])}")
    print(first_thread[0])
    
    if isinstance(first_thread[0], dict):
        print(f"\nMessage fields: {list(first_thread[0].keys())}")

Thread Structure
Type: <class 'list'>

First message in thread:
Type: <class 'dict'>
{'relative_integer_time': 5597279, 'text': "Guess who's keeping a close eye on Australia's shaky relationship with the US? #France, of course. They quote someone who points out that we may never get our subs from the US.\n<URL>", 'user_id': 'd1d36572cab0abad6ff8c9de79b07de9531a8381fb0b9804626b496bd583b997', 'actions': {'like': False, 'unlike': False, 'repost': False, 'unrepost': False, 'follow': False, 'unfollow': False, 'block': False, 'unblock': False, 'post_update': False, 'post_delete': False, 'quote': False, 'post': True, 'reply': False}}

Message fields: ['relative_integer_time', 'text', 'user_id', 'actions']


## 3. Summary and Next Steps

In [6]:
from datasets import load_dataset
bp = load_dataset("ComplexDataLab/BluePrint", "25_clusters", split="full")
sample = bp[0]
assert "cluster_id" in sample and "thread" in sample
assert isinstance(sample["thread"], list) and isinstance(sample["thread"][0], dict)
assert "text" in sample["thread"][0]
print("Schema OK")


Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/25 [00:00<?, ?files/s]

processed_25_clusters/cluster_0.jsonl:   0%|          | 0.00/67.4M [00:00<?, ?B/s]

processed_25_clusters/cluster_1.jsonl:   0%|          | 0.00/14.4M [00:00<?, ?B/s]

processed_25_clusters/cluster_10.jsonl:   0%|          | 0.00/316M [00:00<?, ?B/s]

processed_25_clusters/cluster_11.jsonl:   0%|          | 0.00/106M [00:00<?, ?B/s]

processed_25_clusters/cluster_12.jsonl:   0%|          | 0.00/249M [00:00<?, ?B/s]

processed_25_clusters/cluster_13.jsonl:   0%|          | 0.00/104M [00:00<?, ?B/s]

processed_25_clusters/cluster_14.jsonl:   0%|          | 0.00/322M [00:00<?, ?B/s]

processed_25_clusters/cluster_15.jsonl:   0%|          | 0.00/53.1M [00:00<?, ?B/s]

processed_25_clusters/cluster_16.jsonl:   0%|          | 0.00/211M [00:00<?, ?B/s]

processed_25_clusters/cluster_17.jsonl:   0%|          | 0.00/460M [00:00<?, ?B/s]

processed_25_clusters/cluster_18.jsonl:   0%|          | 0.00/98.6M [00:00<?, ?B/s]

processed_25_clusters/cluster_19.jsonl:   0%|          | 0.00/239M [00:00<?, ?B/s]

processed_25_clusters/cluster_2.jsonl:   0%|          | 0.00/307M [00:00<?, ?B/s]

processed_25_clusters/cluster_20.jsonl:   0%|          | 0.00/66.8M [00:00<?, ?B/s]

processed_25_clusters/cluster_21.jsonl:   0%|          | 0.00/56.0M [00:00<?, ?B/s]

processed_25_clusters/cluster_22.jsonl:   0%|          | 0.00/48.8M [00:00<?, ?B/s]

processed_25_clusters/cluster_23.jsonl:   0%|          | 0.00/201M [00:00<?, ?B/s]

processed_25_clusters/cluster_24.jsonl:   0%|          | 0.00/245M [00:00<?, ?B/s]

processed_25_clusters/cluster_3.jsonl:   0%|          | 0.00/72.3M [00:00<?, ?B/s]

processed_25_clusters/cluster_4.jsonl:   0%|          | 0.00/59.3M [00:00<?, ?B/s]

processed_25_clusters/cluster_5.jsonl:   0%|          | 0.00/109M [00:00<?, ?B/s]

processed_25_clusters/cluster_6.jsonl:   0%|          | 0.00/569M [00:00<?, ?B/s]

processed_25_clusters/cluster_7.jsonl:   0%|          | 0.00/393M [00:00<?, ?B/s]

processed_25_clusters/cluster_8.jsonl:   0%|          | 0.00/19.2M [00:00<?, ?B/s]

processed_25_clusters/cluster_9.jsonl:   0%|          | 0.00/198M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/6828041 [00:00<?, ? examples/s]

Schema OK


In [7]:
from collections import Counter
import numpy as np
counts = Counter(row["cluster_id"] for row in bp.select(range(5000)))
print("Top 5 cluster counts:", counts.most_common(5))
lengths = [
    sum(len(msg.get("text", "")) for msg in row["thread"])
    for row in bp.select(range(1000))
]
print("Avg chars/thread:", np.mean(lengths))


Top 5 cluster counts: [(0, 5000)]
Avg chars/thread: 270.827


In [8]:
from transformers import AutoTokenizer


def format_thread(thread, cluster_id):
    lines = [f"[Cluster {cluster_id}]"]
    for idx, msg in enumerate(thread):
        text = (msg.get("text") or "").strip()
        if not text:
            continue
        user_hash = msg.get("user_id", "anon")[:8]
        lines.append(f"{idx:02d} [{user_hash}] {text}")
    return "\n".join(lines)


tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct", trust_remote_code=True)
sample_text = format_thread(bp[0]["thread"], bp[0]["cluster_id"])
token_count = len(tok(sample_text)["input_ids"])
print("Tokens in sample thread:", token_count)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Tokens in sample thread: 108
