In [18]:
import pickle
import json
import os
import shutil

# Define file paths
ORIGINAL_FILE = 'user_graph.pkl'
# Working copy of ORIGINAL_FILE. It is overwritten on every run and is the
# file that actually gets loaded, so the original is only ever read.
UPDATED_FILE = 'user_graph_updated.pkl'
OUTPUT_JSON = 'user_data_with_connections.json'

# Minimum number of followers a user needs to be kept in the export.
MIN_FOLLOWERS = 1000


def filter_graph(graph, min_followers=MIN_FOLLOWERS):
    """Return the sub-graph of users having at least *min_followers* followers.

    Args:
        graph: Mapping of handle -> user dict. Each user dict may carry a
            'followers_count' and a 'connections' iterable of handles.
        min_followers: Inclusive follower threshold for keeping a user.

    Returns:
        A new dict with the kept handles, in the input's order. Each value is
        a shallow copy of the user dict whose 'connections' is a list holding
        only handles that were themselves kept. The input mapping and its
        user dicts are left unmodified.

    A missing or ``None`` 'followers_count' counts as 0 and a missing or
    ``None`` 'connections' counts as empty, so incomplete records are handled
    instead of raising ``TypeError``.
    """
    kept = {
        handle: data
        for handle, data in graph.items()
        if (data.get('followers_count') or 0) >= min_followers
    }
    return {
        handle: {
            **data,
            # Dict membership is O(1), so `kept` doubles as the valid-handle set.
            'connections': [
                conn for conn in (data.get('connections') or []) if conn in kept
            ],
        }
        for handle, data in kept.items()
    }


def build_user_records(graph):
    """Flatten a handle -> user dict mapping into a list of JSON-ready records.

    Each record carries 'handle', 'avatar_url', 'displayName', 'description',
    'followers_count' and 'connections'; absent fields fall back to an empty
    string, 0, or an empty list respectively.
    """
    return [
        {
            'handle': handle,
            'avatar_url': data.get('avatar_url', ''),
            'displayName': data.get('displayName', ''),
            'description': data.get('description', ''),
            'followers_count': data.get('followers_count', 0),
            'connections': data.get('connections', []),
        }
        for handle, data in graph.items()
    ]


# The filesystem work is guarded so the helpers above can be imported without
# side effects. In a notebook cell or a direct script run __name__ is
# '__main__', so the pipeline below still executes and still defines
# graph_data, filtered_graph, valid_handles and user_records at module level.
if __name__ == '__main__':
    if not os.path.exists(ORIGINAL_FILE):
        raise FileNotFoundError(f"The file '{ORIGINAL_FILE}' does not exist in the current directory.")

    # Work on a copy so the original pickle is never opened for anything but reading.
    shutil.copyfile(ORIGINAL_FILE, UPDATED_FILE)
    print(f"Backup created at '{UPDATED_FILE}'.")

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only run this against pickles produced by a trusted source.
    with open(UPDATED_FILE, 'rb') as f:
        graph_data = pickle.load(f)

    # Verify that the loaded data is a dictionary
    if not isinstance(graph_data, dict):
        raise TypeError("The pickle file does not contain a dictionary at the top level.")

    # Steps 1-3: keep users above the threshold and prune their connections
    # down to other kept users.
    filtered_graph = filter_graph(graph_data, MIN_FOLLOWERS)
    print(f"Filtered users with ≥{MIN_FOLLOWERS} followers: {len(filtered_graph)} out of {len(graph_data)}")

    valid_handles = set(filtered_graph)

    for handle, data in filtered_graph.items():
        print(f"User '{handle}' connections after filtering: {len(data['connections'])}")

    # Step 4: Prepare the list of user records for JSON
    user_records = build_user_records(filtered_graph)
    print(f"Total user records to be saved: {len(user_records)}")

    # Step 5: Save the records to a JSON file
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(user_records, f, ensure_ascii=False, indent=4)

    print(f"JSON file generated: {OUTPUT_JSON}")


Backup created at 'user_graph_updated.pkl'.
Filtered users with ≥1000 followers: 295 out of 430
User 'dmimica.bsky.social' connections after filtering: 0
User 'mrhitchcok.bsky.social' connections after filtering: 5
User 'economist.com' connections after filtering: 5
User 'elpais.com' connections after filtering: 5
User 'censelio.bsky.social' connections after filtering: 5
User 'apalet.bsky.social' connections after filtering: 4
User 'latercera.com' connections after filtering: 5
User 'theclinic.cl' connections after filtering: 1
User 'quintralacolorada.bsky.social' connections after filtering: 4
User 'cgajardop.bsky.social' connections after filtering: 5
User 'baradit.bsky.social' connections after filtering: 2
User 'giorgiojackson.bsky.social' connections after filtering: 5
User 'rafacavada.bsky.social' connections after filtering: 4
User 'profdevisridhar.bsky.social' connections after filtering: 2
User 'matamala.bsky.social' connections after filtering: 5
User 'sepulvedanico.bsky.soc