In [15]:
import pickle
import numpy as np
import pandas as pd
import graph_tool.all as gt

from collections import Counter
from tabulate import tabulate

In [2]:
# Load the unfiltered graph `g`. The context manager closes the file handle
# right after loading instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load; only open files this project wrote.
with open('output/g.pkl', 'rb') as fh:
    g = pickle.load(fh)

In [3]:
# Load the filtered graph and its matching dataframe. Each file is opened in a
# context manager so the handle is closed as soon as the object is read.
# NOTE: pickle can execute arbitrary code on load; only open files this project wrote.
with open('output/g_filtered.pkl', 'rb') as fh:
    g_filtered = pickle.load(fh)
with open('output/df_filtered.pkl', 'rb') as fh:
    df_filtered = pickle.load(fh)

In [4]:
# Load the full account dataframe; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load; only open files this project wrote.
with open('output/df.pkl', 'rb') as fh:
    df = pickle.load(fh)

In [5]:
# v_filt was not saved with the other outputs, so rebuild it here: a boolean
# mask over the rows of df that is True where the row's `did` also appears in
# df_filtered.
all_dids = set(df['did'].values)
filtered_dids = set(df_filtered['did'].values)

# Vectorised membership test; always yields a bool array (an empty list passed
# to np.array would have produced float64, which is not a valid mask).
v_filt = df['did'].isin(filtered_dids).to_numpy()

In [6]:
# Sanity check: the rebuilt mask selects exactly as many rows as df_filtered has.
assert int(v_filt.sum()) == df_filtered.shape[0]

In [7]:
# Load the two previously fitted block-model states. Context managers close the
# file handles as soon as each object has been read.
# NOTE: pickle can execute arbitrary code on load; only open files this project wrote.
with open('output/nested_state.pkl', 'rb') as fh:
    state = pickle.load(fh)
with open('output/assort_state.pkl', 'rb') as fh:
    assort_state = pickle.load(fh)

In [8]:
# Per-vertex block labels of assort_state, exposed as an array.
assort_block_map = assort_state.get_blocks()
assort_blocks = assort_block_map.a

In [10]:
import time
from infomap import Infomap

def run_infomap_og(g, num_trials=3, weights=None, verbose=False):
    """Run Infomap on a graph-tool graph and return the module of each vertex.

    Every edge of ``g`` is added to Infomap as a link between the integer
    indices of its source and target vertices.

    Parameters
    ----------
    g : graph
        Graph providing ``is_directed()``, ``edges()`` and ``new_vp()``.
    num_trials : int, optional
        Forwarded to ``Infomap(num_trials=...)``.
    weights : edge property map, optional
        When given, ``weights[e]`` is passed as the weight of each link.
        ``None`` (the default) adds unweighted links.
    verbose : bool, optional
        Print timings, the number of top modules and the codelength.

    Returns
    -------
    vertex property map of type ``'int'``
        Module id per vertex, filled from the leaf nodes of the Infomap tree.
        Vertices that touch no edge are never handed to Infomap, so they keep
        whatever default ``g.new_vp('int')`` assigns (presumably 0 -- confirm).
    """
    start = time.time()

    im = Infomap(directed=g.is_directed(), silent=True, num_trials=num_trials)

    # Compare against None explicitly: the truthiness of a property map or
    # array says nothing about whether weights were supplied, and raises for
    # multi-element numpy arrays.
    if weights is not None:
        for e in g.edges():
            im.add_link(int(e.source()), int(e.target()), weights[e])
    else:
        for e in g.edges():
            im.add_link(int(e.source()), int(e.target()))

    if verbose:
        print(f"Done adding links. Took {time.time() - start}")

    # Restart the clock so the second timing excludes link insertion.
    start = time.time()

    # Run the Infomap search algorithm to find optimal modules
    im.run()

    if verbose:
        print(f"Found {im.num_top_modules} modules with codelength: {im.codelength}")

    infomap_modules = g.new_vp('int')

    # Only leaf nodes correspond to vertices; their node_id is the vertex
    # index that was passed to add_link above.
    for node in im.tree:
        if node.is_leaf:
            infomap_modules[node.node_id] = node.module_id

    if verbose:
        print(f"Infomap took {time.time() - start}")

    return infomap_modules

In [12]:
# Run Infomap on the filtered graph and keep the per-vertex module ids as an array.
infomap_vp = run_infomap_og(g_filtered, verbose=True, num_trials=3)
infomap_blocks = infomap_vp.a

Done adding links. Took 1.3824431896209717
Found 119 modules with codelength: 10.673666512073885
Infomap took 2.543097972869873


In [26]:
# Dump every partition to its own text file. For each block (largest first) the
# members are listed in descending order of PageRank computed on the subgraph
# induced by that block; at most the first 100 rows are written.
to_print_blocks = [(assort_blocks, 'assort'), (infomap_blocks, 'infomap')]
to_print_blocks += [(state.project_level(i).get_blocks().a, f'nested_state_{i}') for i in range(4)]

# Columns written per account. followers_count / following_count are renamed
# to fs_count / fg_count below.
account_cols = ['handle', 'display_name', 'fs_count', 'fg_count', 'description']

for (labels, name) in to_print_blocks:
    # Explicit UTF-8: display names and descriptions contain emoji, which the
    # platform default encoding cannot always represent.
    with open(f"output/cd_txt/{name}_state.txt", "w", encoding="utf-8") as f:
        for b, c in Counter(labels).most_common():
            in_block = labels == b
            subgraph = gt.GraphView(g_filtered, vfilt=in_block)
            subgraph_pr = gt.pagerank(subgraph)

            # Rank only the block's own vertices instead of sorting the whole
            # array and relying on vertices outside the view having a score of 0.
            members = np.flatnonzero(in_block)
            top_k_in_block = members[np.argsort(subgraph_pr.a[members])[::-1]]

            # Vertex indices are used as row positions of df_filtered.
            block_df = df_filtered.iloc[top_k_in_block].copy()
            block_df = block_df.rename(columns={'followers_count': 'fs_count', 'following_count': 'fg_count'})
            # One-line, 20-character preview; anything that is not a string
            # (None, NaN) becomes '' instead of raising AttributeError.
            block_df['description'] = block_df['description'].apply(
                lambda x: x.replace("\n", "")[:20] if isinstance(x, str) else ''
            )

            f.write(f"Block {b} (size {c})\n")
            f.write(tabulate(block_df.head(100)[account_cols], headers='keys', tablefmt='psql'))
            f.write("\n\n\n")

In [27]:
# Domain part of each handle: everything after the first dot ('' if there is none).
extensions = df['handle'].apply(lambda handle: handle.partition(".")[2])
extensions_counter = Counter(extensions)
extensions_counter.most_common(100)

[('bsky.social', 21196),
 ('com', 351),
 ('net', 67),
 ('dev', 52),
 ('me', 47),
 ('xyz', 46),
 ('org', 31),
 ('jp', 24),
 ('io', 23),
 ('ingroup.social', 22),
 ('social', 15),
 ('ca', 11),
 ('co', 10),
 ('blog', 10),
 ('lol', 10),
 ('us', 9),
 ('blue', 9),
 ('ir', 9),
 ('id', 8),
 ('sh', 8),
 ('goose.art', 7),
 ('com.br', 7),
 ('is', 7),
 ('tech', 6),
 ('de', 6),
 ('life', 6),
 ('space', 6),
 ('info', 5),
 ('online', 5),
 ('ai', 5),
 ('one', 5),
 ('uk', 5),
 ('zone', 4),
 ('bsky.team', 4),
 ('wtf', 4),
 ('co.uk', 4),
 ('computer', 4),
 ('st', 4),
 ('in', 4),
 ('bestie.social', 4),
 ('at', 3),
 ('outgroup.social', 3),
 ('systems', 3),
 ('cc', 3),
 ('club', 3),
 ('haha.computer', 3),
 ('bot.gar.lol', 3),
 ('fyi', 3),
 ('pm', 3),
 ('codes', 3),
 ('eatnews.net', 3),
 ('galanter.net', 3),
 ('cloud', 3),
 ('pizza', 3),
 ('im', 3),
 ('fm', 3),
 ('fun', 3),
 ('vibe.camp', 2),
 ('haus', 2),
 ('cat', 2),
 ('money', 2),
 ('fish', 2),
 ('rocks', 2),
 ('world', 2),
 ('ninja', 2),
 ('to', 2),
 ('bo

In [28]:
# Share of all accounts in df whose handle ends in the default bsky.social domain.
extensions_counter['bsky.social'] / df.shape[0]

0.9483668903803132

In [29]:
# Now add the dataframe info to g_filtered as vertex property maps, attach the
# community labels, and export everything as GraphML.
vprop_handle = g_filtered.new_vertex_property("string")
vprop_display_name = g_filtered.new_vertex_property("string")
vprop_description = g_filtered.new_vertex_property("string")
vprop_avatar_url = g_filtered.new_vertex_property("string")
vprop_indexed_at = g_filtered.new_vertex_property("string")
vprop_date_updated = g_filtered.new_vertex_property("string")
vprop_followers_count = g_filtered.new_vertex_property("int")
vprop_following_count = g_filtered.new_vertex_property("int")

# Rows are matched to vertices by position (row k <-> vertex k), the same
# convention this notebook uses when it indexes df_filtered with .iloc by
# vertex index. The index labels of df_filtered must not be used as vertex ids:
# a frame sliced from df keeps df's labels, which do not run 0..n-1.
assert len(df_filtered) == g_filtered.num_vertices()

for pos, (_, row) in enumerate(df_filtered.iterrows()):
    v = g_filtered.vertex(pos)
    vprop_handle[v] = row.handle
    vprop_display_name[v] = row.display_name
    vprop_description[v] = row.description
    vprop_avatar_url[v] = row.avatar_url
    vprop_indexed_at[v] = row.indexed_at
    vprop_date_updated[v] = row.date_updated
    vprop_followers_count[v] = row.followers_count
    vprop_following_count[v] = row.following_count

# Short display handle: drop the default domain, but only when it really is the
# suffix, so a custom domain that merely contains ".bsky.social" stays intact.
bsky_suffix = ".bsky.social"
shorter_handles = []

for v in g_filtered.vertices():
    h = vprop_handle[v]

    if h.endswith(bsky_suffix):
        shorter_handles.append(h[:-len(bsky_suffix)])
    else:
        shorter_handles.append(h)

# Register the maps as internal properties so g_filtered.save() writes them out.
g_filtered.vertex_properties["handle"] = vprop_handle
g_filtered.vertex_properties["short_handle"] = g_filtered.new_vertex_property("string", vals=shorter_handles)
g_filtered.vertex_properties["display_name"] = vprop_display_name
g_filtered.vertex_properties["description"] = vprop_description
g_filtered.vertex_properties["avatar_url"] = vprop_avatar_url
g_filtered.vertex_properties["indexed_at"] = vprop_indexed_at
g_filtered.vertex_properties["date_updated"] = vprop_date_updated
g_filtered.vertex_properties["followers_count"] = vprop_followers_count
g_filtered.vertex_properties["following_count"] = vprop_following_count
g_filtered.vp['assort_block'] = g_filtered.new_vp('int', assort_state.get_blocks().a)
g_filtered.vp['infomap_blocks'] = g_filtered.new_vp('int', infomap_blocks)

# in gephi copy this to an "image" column
# no, just naming it "image" here doesn't work: ????
g_filtered.vp['image_fn'] = g_filtered.new_vp('string', [f"{vprop_handle[v]}.jpg" for v in g_filtered.vertices()])

# NOTE(review): five hierarchy levels are exported here while the text dump
# cell iterates range(4) -- confirm which depth is intended.
for i in range(5):
    g_filtered.vp[f'ndcsbm_level_{i}'] = g_filtered.new_vp('int', state.project_level(i).get_blocks().a)

g_filtered.save("output/bluesky_test_graph.graphml")

In [30]:
# Transposed adjacency matrix of the full graph; its row sums are stored as
# out_degrees and its column sums as in_degrees, both flattened to 1-D arrays.
A_full = gt.adjacency(g).T
out_degrees = np.asarray(A_full.sum(axis=1)).ravel()
in_degrees = np.asarray(A_full.sum(axis=0)).ravel()

In [31]:
# Centrality scores on the filtered graph.
pr = gt.pagerank(g_filtered)
eig, auth, hub = gt.hits(g_filtered)

# Rebuild df_filtered from df with degree counts taken from the full graph,
# the centrality scores above, and a descending rank for each measure.
df_filtered = df[v_filt].assign(
    following_count=out_degrees[v_filt].astype(int),
    followers_count=in_degrees[v_filt].astype(int),
    authority=auth.a,
    pagerank=pr.a,
    authority_rank=lambda frame: frame['authority'].rank(ascending=False),
    pagerank_rank=lambda frame: frame['pagerank'].rank(ascending=False),
    followers_rank=lambda frame: frame['followers_count'].rank(ascending=False),
)

cols = [
    'handle',
    'display_name',
    'following_count',
    'followers_count',
    'followers_rank',
    'pagerank_rank',
    'authority_rank',
    'description',
]
# Top 50 accounts by follower rank.
df_filtered.sort_values("followers_rank")[cols].head(50)

Unnamed: 0,handle,display_name,following_count,followers_count,followers_rank,pagerank_rank,authority_rank,description
66,jay.bsky.team,Jay 🦋,2086,4476,1.0,1.0,1.0,chief cat-herder around here\n\nsocieties evol...
22245,jack.bsky.social,jack,15,4354,2.0,3.0,4.0,
6,pfrazee.com,Paul Frazee✌️,241,4112,3.0,4.0,3.0,Developer at Bluesky. The one who puts bugs in...
48,why.bsky.team,Whyrusleeping,7450,4020,4.0,2.0,2.0,"Technical advisor to @bluesky, first engineer ..."
22261,vishal.bsky.social,Vishal Gulia 🌞,22246,3459,5.0,6.0,5.0,"Bluesky’s top memer, Web3 enthusiast, early ad..."
22349,tho.bsky.social,Thomas Pockrandt,22346,3196,6.0,7.0,9.0,Tech Advisor & Digital Strategist 🦾\n\nhttps:/...
20777,shinyakato.dev,Shinya Kato 🤯,22501,2703,7.0,9.0,8.0,Dart/Flutter & atproto enthusiast\nContrib. bl...
22345,miranda.bsky.social,Miranda,21302,2598,8.0,10.0,6.0,• Host\n• Writer\n• Contact: heymiranda@outloo...
19012,cats.bsky.social,Cat,21028,2531,9.0,11.0,10.0,Cats lover
5296,bsky.app,Bluesky,9,2225,10.0,12.0,33.0,Official bluesky account (check domain 👆)\n\nF...
