In [5]:
import pickle
import os

# Define file paths
original_file = 'user_graph.pkl'
updated_file = 'user_graph_updated.pkl'


def normalize_connections(user_graph):
    """Coerce every user's 'connections' entry to a list, in place.

    Args:
        user_graph: dict mapping a user handle to that user's record.
            Records that are not dicts are reported and left untouched.

    Returns:
        The same ``user_graph`` object, after modification.

    For each dict record the 'connections' value is handled as follows:
      * missing key      -> set to ``[]``
      * dict             -> replaced by its own nested 'connections' list
                            (``[]`` if that nested value is not a list)
      * list             -> kept unchanged
      * anything else    -> replaced by ``[]``
    A status line is printed for every user.
    """
    for user_handle, user_data in user_graph.items():
        if not isinstance(user_data, dict):
            print(f"Skipping '{user_handle}': Expected a dictionary for user data.")
            continue  # Skip if user_data is not a dictionary

        if 'connections' not in user_data:
            # A .get() default would make a missing key look like an existing
            # list and leave the saved record without the key at all.
            print(f"Warning: 'connections' for '{user_handle}' is missing. Setting to empty list.")
            user_data['connections'] = []
            continue

        connections = user_data['connections']

        if isinstance(connections, dict):
            # Extract the 'connections' list from the nested dictionary
            connections_list = connections.get('connections', [])
            if not isinstance(connections_list, list):
                print(f"Warning: 'connections' for '{user_handle}' is not a list. Setting to empty list.")
                connections_list = []
            user_data['connections'] = connections_list
            print(f"Updated 'connections' for '{user_handle}' from dict to list.")
        elif isinstance(connections, list):
            # Already a list; no action needed
            print(f"'connections' for '{user_handle}' is already a list. No update needed.")
        else:
            # Handle unexpected format by setting 'connections' to an empty list
            print(f"Warning: 'connections' for '{user_handle}' is neither a dict nor a list. Setting to empty list.")
            user_data['connections'] = []

    return user_graph


# The file I/O runs only when this code is executed directly. Inside a Jupyter
# kernel __name__ is '__main__', so the cell behaves as before, while the
# helper above can be imported without the pickle file being present.
if __name__ == '__main__':
    # Check if the original pickle file exists
    if not os.path.exists(original_file):
        raise FileNotFoundError(f"The file '{original_file}' does not exist in the current directory.")

    # Step 1: Load the existing pickle file
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(original_file, 'rb') as file:
        try:
            user_graph = pickle.load(file)
        except Exception as e:
            # Chain the original exception so the real cause stays in the traceback.
            raise ValueError(f"Error loading pickle file: {e}") from e
    print(f"Successfully loaded '{original_file}'.")

    # Verify that the loaded data is a dictionary
    if not isinstance(user_graph, dict):
        raise TypeError("The pickle file does not contain a dictionary at the top level.")

    # Step 2: Process each user to ensure 'connections' is a list
    normalize_connections(user_graph)

    # Step 3: Save the updated data to a new pickle file
    with open(updated_file, 'wb') as file:
        try:
            pickle.dump(user_graph, file)
        except Exception as e:
            raise ValueError(f"Error saving updated pickle file: {e}") from e
    print(f"Successfully saved updated data to '{updated_file}'.")

    print("Processing complete. The updated pickle file is ready.")


Successfully loaded 'user_graph.pkl'.
Updated 'connections' for 'pcayuqueo.bsky.social' from dict to list.
Updated 'connections' for 'jpiquer.bsky.social' from dict to list.
Updated 'connections' for 'juanseapi.bsky.social' from dict to list.
Updated 'connections' for 'dmimica.bsky.social' from dict to list.
'connections' for 'mrhitchcok.bsky.social' is already a list. No update needed.
'connections' for 'copano.bsky.social' is already a list. No update needed.
'connections' for 'elquenoaporta.bsky.social' is already a list. No update needed.
'connections' for 'guillodibujante.bsky.social' is already a list. No update needed.
'connections' for 'polarbearby.bsky.social' is already a list. No update needed.
'connections' for 'economist.com' is already a list. No update needed.
'connections' for 'elpais.com' is already a list. No update needed.
'connections' for 'mariajaraquemada.bsky.social' is already a list. No update needed.
'connections' for 'antoniaurrejola.bsky.social' is already a

In [6]:
import pickle

# Read back 'user_graph_updated.pkl' (the file the normalization cell writes)
# so the result can be inspected in the following cells.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('user_graph_updated.pkl', mode='rb') as pickle_in:
    data = pickle.load(pickle_in)


In [7]:
# Preview only the first five users; echoing the whole dictionary floods the
# cell output with every user's record.
dict(list(data.items())[:5])

{'pcayuqueo.bsky.social': {'followers_count': 549,
  'connections': ['bsky.app',
   'censelio.bsky.social',
   'juanarivers.bsky.social'],
  'avatar_url': 'https://cdn.bsky.app/img/avatar/plain/did:plc:ap7xqzzg2pzto6yb53qpyiae/bafkreibdz7bv4oerrichdnldjyrpxanbc7u5upy47xrvg62m64lejekrca@jpeg'},
 'jpiquer.bsky.social': {'followers_count': 337,
  'connections': ['aoc.bsky.social',
   'washingtonpost.com',
   'jay.bsky.team',
   'marcelias.bsky.social',
   'hcrichardson.bsky.social'],
  'avatar_url': 'https://cdn.bsky.app/img/avatar/plain/did:plc:vp3olhrwglixb5g6ptoyf4zm/bafkreid2fdwgqf54ddqq6arhbvb3gli3pi77yut62sxcgdvwmjxfowj4wq@jpeg'},
 'juanseapi.bsky.social': {'followers_count': 163,
  'connections': ['bsky.app',
   'aoc.bsky.social',
   'theonion.com',
   'jay.bsky.team',
   'theguardian.com'],
  'avatar_url': 'https://cdn.bsky.app/img/avatar/plain/did:plc:jlnhkgwjgzfxnahodceknhxf/bafkreickifvi4ntf4pp5bkbtanu47awwoghvgouynucg24mx7p6fjasubm@jpeg'},
 'dmimica.bsky.social': {'followers_c

In [10]:
import pickle
import json
import os
import shutil

# Define file paths
GRAPH_FILE = 'user_graph.pkl'
OUTPUT_JSON = 'user_data_with_connections.json'
BACKUP_FILE = 'user_graph_backup.pkl'
DEFAULT_AVATAR = 'https://example.com/default_avatar.png'  # Replace with your default avatar URL
MIN_FOLLOWERS = 1000  # Users below this follower count are left out of the export


def _follower_count(user_handle, user_data):
    """Return the record's 'followers_count', or 0 when it is missing or not a number."""
    count = user_data.get('followers_count', 0)
    if isinstance(count, (int, float)):
        return count
    # A None or string value would raise TypeError in the threshold comparison.
    print(f"Warning: 'followers_count' for '{user_handle}' is not a number. Treating as 0.")
    return 0


def build_user_records(graph_data, min_followers=MIN_FOLLOWERS, default_avatar=DEFAULT_AVATAR):
    """Normalize, filter and flatten the user graph into JSON-ready records.

    Args:
        graph_data: dict mapping a user handle to that user's record. Records
            that are not dicts are reported and ignored. The 'connections'
            value of each dict record is normalized to a list in place, and
            for users that pass the filter it is further reduced to handles
            that also passed the filter.
        min_followers: minimum 'followers_count' a user needs to be kept.
        default_avatar: URL used when a user has no usable 'avatar_url'.

    Returns:
        A ``(filtered_graph, user_records)`` tuple. ``filtered_graph`` maps the
        kept handles to their records (the same dict objects as in
        ``graph_data``). ``user_records`` is a list of dicts with the keys
        'handle', 'avatar_url', 'followers_count' and 'connections'.
    """
    # Step 1: Normalize the 'connections' field to always be a list
    for user_handle, user_data in graph_data.items():
        if not isinstance(user_data, dict):
            print(f"Skipping '{user_handle}': Expected a dictionary for user data.")
            continue
        connections = user_data.get('connections', [])
        if isinstance(connections, dict):
            # Extract the 'connections' list from the nested dictionary
            connections_list = connections.get('connections', [])
            if isinstance(connections_list, list):
                user_data['connections'] = connections_list
            else:
                # If 'connections' inside the dict is not a list, set it to empty
                user_data['connections'] = []
                print(f"Warning: 'connections' for '{user_handle}' is not a list. Set to empty list.")
        elif not isinstance(connections, list):
            # If 'connections' is neither dict nor list, set it to empty list
            user_data['connections'] = []
            print(f"Warning: 'connections' for '{user_handle}' is neither dict nor list. Set to empty list.")

    # Step 2: Keep only users with at least `min_followers` followers
    filtered_graph = {}
    follower_counts = {}
    for user_handle, user_data in graph_data.items():
        if not isinstance(user_data, dict):
            continue  # Already reported in step 1
        count = _follower_count(user_handle, user_data)
        if count >= min_followers:
            filtered_graph[user_handle] = user_data
            follower_counts[user_handle] = count

    print(f"Filtered users with ≥{min_followers} followers: {len(filtered_graph)} out of {len(graph_data)}")

    # Step 3: Create a set of valid handles from the filtered users
    valid_handles = set(filtered_graph)

    # Step 4: Filter connections to include only existing nodes in the filtered set
    for user_handle, user_data in filtered_graph.items():
        original_connections = user_data.get('connections', [])
        filtered_connections = [conn for conn in original_connections if conn in valid_handles]
        user_data['connections'] = filtered_connections
        print(f"User '{user_handle}' connections after filtering: {len(filtered_connections)}")

    # Step 5: Prepare the list of user records for JSON
    user_records = []
    for user_handle, user_data in filtered_graph.items():
        avatar_url = user_data.get('avatar_url')
        # The stored value may be None or another non-string; only strings can be stripped.
        avatar_url = avatar_url.strip() if isinstance(avatar_url, str) else ''
        if not avatar_url:
            avatar_url = default_avatar  # Use default avatar if missing
            print(f"Info: User '{user_handle}' has no avatar URL. Using default avatar.")
        user_records.append({
            'handle': user_handle,
            'avatar_url': avatar_url,
            'followers_count': follower_counts[user_handle],
            'connections': user_data.get('connections', []),
        })

    return filtered_graph, user_records


# The file I/O runs only when this code is executed directly. Inside a Jupyter
# kernel __name__ is '__main__', so the cell behaves as before, while the
# helpers above can be imported without the pickle file being present.
if __name__ == '__main__':
    # Check if the pickle file exists
    if not os.path.exists(GRAPH_FILE):
        raise FileNotFoundError(f"The file '{GRAPH_FILE}' does not exist in the current directory.")

    # Backup the original pickle file
    shutil.copyfile(GRAPH_FILE, BACKUP_FILE)
    print(f"Backup created at '{BACKUP_FILE}'.")

    # Load the graph data
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(GRAPH_FILE, 'rb') as f:
        graph_data = pickle.load(f)

    # Verify that the data is a dictionary
    if not isinstance(graph_data, dict):
        raise TypeError("The pickle file does not contain a dictionary at the top level.")

    # Steps 1-5: normalize, filter and flatten the graph
    filtered_graph, user_records = build_user_records(graph_data)
    valid_handles = set(filtered_graph)

    # Step 6: Save the records to a JSON file
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(user_records, f, ensure_ascii=False, indent=4)

    print(f"JSON file generated: {OUTPUT_JSON}")


Backup created at 'user_graph_backup.pkl'.
Filtered users with ≥1000 followers: 89 out of 132
User 'dmimica.bsky.social' connections after filtering: 5
User 'mrhitchcok.bsky.social' connections after filtering: 5
User 'economist.com' connections after filtering: 5
User 'elpais.com' connections after filtering: 5
User 'censelio.bsky.social' connections after filtering: 1
User 'apalet.bsky.social' connections after filtering: 4
User 'latercera.com' connections after filtering: 5
User 'theclinic.cl' connections after filtering: 1
User 'quintralacolorada.bsky.social' connections after filtering: 3
User 'cgajardop.bsky.social' connections after filtering: 3
User 'baradit.bsky.social' connections after filtering: 2
User 'giorgiojackson.bsky.social' connections after filtering: 5
User 'rafacavada.bsky.social' connections after filtering: 4
User 'profdevisridhar.bsky.social' connections after filtering: 2
User 'matamala.bsky.social' connections after filtering: 5
User 'sepulvedanico.bsky.socia