In [11]:
import pickle
import json
import os
import shutil

# Configuration: the input pickle holding the user graph, the JSON file
# written at the end of the script, the safety copy made of the pickle
# before it is read, and the avatar used for users that have none.
GRAPH_FILE = 'user_graph.pkl'
OUTPUT_JSON = 'user_data_with_connections.json'
BACKUP_FILE = 'user_graph_backup.pkl'
DEFAULT_AVATAR = 'https://example.com/default_avatar.png'  # Replace with your default avatar URL

# Copy the pickle to BACKUP_FILE before reading it; stop right away when
# the input file is missing so nothing else runs on absent data.
if os.path.exists(GRAPH_FILE):
    shutil.copyfile(GRAPH_FILE, BACKUP_FILE)
    print(f"Backup created at '{BACKUP_FILE}'.")
else:
    raise FileNotFoundError(
        f"The file '{GRAPH_FILE}' does not exist in the current directory."
    )

# Deserialize the graph.
# NOTE(review): unpickling can execute arbitrary code embedded in the file,
# so GRAPH_FILE must come from a trusted source.
with open(GRAPH_FILE, 'rb') as f:
    graph_data = pickle.load(f)

# Everything below iterates graph_data.items(), so the top-level object
# has to be a dictionary.
if not isinstance(graph_data, dict):
    raise TypeError("The pickle file does not contain a dictionary at the top level.")

# Step 1: Make every user's 'connections' value a plain list.
# Three shapes are handled:
#   * a list (or a missing key)  -> left untouched
#   * a dict                     -> replaced by its own nested 'connections'
#                                   list, or by [] when that is not a list
#   * anything else              -> replaced by []
for user_handle, profile in graph_data.items():
    raw_connections = profile.get('connections', [])

    # Already in the expected shape; nothing to rewrite.
    if isinstance(raw_connections, list):
        continue

    # Neither a list nor a dict: the value cannot be salvaged.
    if not isinstance(raw_connections, dict):
        profile['connections'] = []
        print(f"Warning: 'connections' for '{user_handle}' is neither dict nor list. Set to empty list.")
        continue

    # Dict wrapper: unwrap the inner 'connections' entry when it is a list.
    nested_connections = raw_connections.get('connections', [])
    if isinstance(nested_connections, list):
        profile['connections'] = nested_connections
        continue

    # The wrapper held something other than a list under 'connections'.
    profile['connections'] = []
    print(f"Warning: 'connections' for '{user_handle}' is not a list. Set to empty list.")

# Step 2: Filter users with MIN_FOLLOWERS or more followers.
# The threshold is a named constant so it can be tuned in one place; the
# printed summary below is built from it.
MIN_FOLLOWERS = 1000

# `data.get('followers_count', 0)` only falls back to 0 when the key is
# missing. A key that is present with value None would make the `>=`
# comparison raise TypeError, so None is coerced to 0 as well and such
# users are simply excluded.
filtered_graph = {
    handle: data for handle, data in graph_data.items()
    if (data.get('followers_count') or 0) >= MIN_FOLLOWERS
}

print(f"Filtered users with ≥{MIN_FOLLOWERS} followers: {len(filtered_graph)} out of {len(graph_data)}")

# Step 3: Create a set of valid handles from the filtered users.
# A set gives constant-time membership tests in the loop below.
valid_handles = set(filtered_graph)

# Step 4: Filter connections to include only existing nodes in the filtered set.
# NOTE: the per-user dicts are shared with graph_data, so this rewrites
# 'connections' on those entries in place.
for user_handle, data in filtered_graph.items():
    original_connections = data.get('connections', [])
    # Retain only connections that point at another retained user.
    filtered_connections = [conn for conn in original_connections if conn in valid_handles]
    data['connections'] = filtered_connections
    print(f"User '{user_handle}' connections after filtering: {len(filtered_connections)}")

# Step 5: Prepare the list of user records for JSON.
# Each record carries the handle, an avatar URL (DEFAULT_AVATAR when the
# user has none), the follower count and the filtered connection list.
user_records = []
for user_handle, data in filtered_graph.items():
    # `data.get('avatar_url', '')` only protects against a missing key; a
    # key stored as None (or any non-string) would make `.strip()` raise
    # AttributeError. Treat every non-string value as "no avatar".
    raw_avatar = data.get('avatar_url')
    avatar_url = raw_avatar.strip() if isinstance(raw_avatar, str) else ''
    if not avatar_url:
        avatar_url = DEFAULT_AVATAR  # Use default avatar if missing
        print(f"Info: User '{user_handle}' has no avatar URL. Using default avatar.")
    user_record = {
        'handle': user_handle,
        'avatar_url': avatar_url,
        'followers_count': data.get('followers_count', 0),
        'connections': data.get('connections', []),
    }
    user_records.append(user_record)

# Step 6: Save the records to a JSON file.
# ensure_ascii=False keeps non-ASCII characters readable in the output,
# which is why the file is opened explicitly as UTF-8.
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(user_records, f, ensure_ascii=False, indent=4)

print(f"JSON file generated: {OUTPUT_JSON}")


Backup created at 'user_graph_backup.pkl'.
Filtered users with ≥1000 followers: 89 out of 132
User 'dmimica.bsky.social' connections after filtering: 5
User 'mrhitchcok.bsky.social' connections after filtering: 5
User 'economist.com' connections after filtering: 5
User 'elpais.com' connections after filtering: 5
User 'censelio.bsky.social' connections after filtering: 1
User 'apalet.bsky.social' connections after filtering: 4
User 'latercera.com' connections after filtering: 5
User 'theclinic.cl' connections after filtering: 1
User 'quintralacolorada.bsky.social' connections after filtering: 3
User 'cgajardop.bsky.social' connections after filtering: 3
User 'baradit.bsky.social' connections after filtering: 2
User 'giorgiojackson.bsky.social' connections after filtering: 5
User 'rafacavada.bsky.social' connections after filtering: 4
User 'profdevisridhar.bsky.social' connections after filtering: 2
User 'matamala.bsky.social' connections after filtering: 5
User 'sepulvedanico.bsky.socia