In [1]:
import pickle
import numpy as np
import pandas as pd
import graph_tool.all as gt

from collections import Counter
from tabulate import tabulate

In [2]:
g = pickle.load(open('output/g.pkl', 'rb'))

In [3]:
g_filtered = pickle.load(open('output/g_filtered.pkl', 'rb'))
df_filtered = pickle.load(open('output/df_filtered.pkl', 'rb'))

In [4]:
df = pickle.load(open('output/df.pkl', 'rb'))

In [5]:
# ah, forgot to save v_filt. just recreate it
all_dids = set(df['did'].values)
filtered_dids = set(df_filtered['did'].values)

v_filt = np.array([did in filtered_dids for did in df['did'].values])

In [6]:
assert v_filt.sum() == len(df_filtered)

In [7]:
state = pickle.load(open('output/nested_state.pkl', 'rb'))
assort_state = pickle.load(open('output/assort_state.pkl', 'rb'))

In [8]:
assort_state = assort_state.copy(b=gt.order_partition_labels(assort_state.get_blocks().a))

In [9]:
state = state.copy(bs=gt.order_nested_partition_labels(state.get_bs()))

In [10]:
assort_state.entropy()

6079345.364458094

In [11]:
state.entropy()

5324207.808946402

In [12]:
assort_blocks = assort_state.get_blocks().a

In [13]:
import time
from infomap import Infomap

def run_infomap_og(g, num_trials=3, weights=None, verbose=False):
    start = time.time()

    im = Infomap(directed=g.is_directed(), silent=True, num_trials=num_trials)

    if weights:
        for e in g.edges():
            im.add_link(int(e.source()), int(e.target()), weights[e])
    else:
        for e in g.edges():
            im.add_link(int(e.source()), int(e.target()))

    if verbose:
        print(f"Done adding links. Took {time.time() - start}")

    start = time.time()

    # Run the Infomap search algorithm to find optimal modules
    im.run()

    if verbose:
        print(f"Found {im.num_top_modules} modules with codelength: {im.codelength}")

    infomap_modules = g.new_vp('int')

    for node in im.tree:
        if node.is_leaf:
            infomap_modules[node.node_id] = node.module_id

    if verbose:
        print(f"Infomap took {time.time() - start}")

    return infomap_modules

In [14]:
infomap_blocks = run_infomap_og(g_filtered, num_trials=3, verbose=True).a

Done adding links. Took 4.824987411499023
Found 39 modules with codelength: 11.319825937795484
Infomap took 7.943307399749756


In [15]:
len(assort_blocks), g_filtered

(15022,
 <Graph object, directed, with 15022 vertices and 2130575 edges, at 0x7fc547f27f10>)

In [16]:
to_print_blocks = [(assort_blocks, 'assort'), (infomap_blocks, 'infomap')]
to_print_blocks += [(state.project_level(i).get_blocks().a, f'nested_state_{i}') for i in range(4)]

# rename followers_count to fs_count and following_count to fg_count

account_cols = ['handle', 'display_name', 'fs_count', 'fg_count', 'description']

for (labels, name) in to_print_blocks:
    print(name)
    with open(f"output/cd_txt/{name}_state.txt", "w") as f:
        for b, c in Counter(labels).most_common():
            filt = labels == b
            print(filt.sum())
            subgraph = gt.GraphView(g_filtered, vfilt=labels == b)
            print(subgraph.num_vertices())
            subgraph_pr = gt.pagerank(subgraph)
            top_k_in_block = np.argsort(subgraph_pr.a)[::-1][:c]

            block_df = df_filtered.iloc[top_k_in_block].copy()
            block_df = block_df.rename(columns={'followers_count': 'fs_count', 'following_count': 'fg_count'})
            block_df['description'] = block_df['description'].apply(lambda x: x.replace("\n", " ") if x else '')
            block_df['display_name'] = block_df['display_name'].apply(lambda x: x.replace("\n", " ") if x else '')

            f.write(f"Block {b} (size {c})\n")
            f.write(tabulate(block_df.head(100)[account_cols], headers='keys', tablefmt='psql', showindex=False, maxcolwidths=[30, 20, 10, 10, 100]))
            f.write("\n\n\n")

assort
2280
2280
1506
1506
1404
1404
1280
1280
1146
1146
1141
1141
1111
1111
936
936
888
888
861
861
688
688
608
608
579
579
523
523
14
14
13
13
12
12
11
11
11
11
10
10
infomap
2134
2134
2106
2106
1511
1511
1503
1503
1481
1481
1371
1371
1113
1113
1064
1064
829
829
340
340
327
327
275
275
264
264
124
124
95
95
78
78
72
72
57
57
48
48
34
34
33
33
26
26
24
24
17
17
16
16
15
15
11
11
10
10
8
8
7
7
6
6
5
5
4
4
4
4
4
4
3
3
1
1
1
1
1
1
nested_state_0
118
118
114
114
110
110
96
96
94
94
93
93
90
90
89
89
88
88
80
80
79
79
79
79
77
77
75
75
75
75
73
73
71
71
70
70
68
68
67
67
67
67
66
66
65
65
64
64
64
64
64
64
63
63
62
62
62
62
61
61
61
61
59
59
58
58
58
58
58
58
58
58
57
57
57
57
57
57
57
57
57
57
57
57
57
57
56
56
56
56
56
56
56
56
56
56
55
55
54
54
54
54
54
54
54
54
54
54
54
54
54
54
53
53
53
53
53
53
53
53
53
53
52
52
52
52
52
52
52
52
52
52
52
52
52
52
51
51
51
51
50
50
50
50
49
49
49
49
49
49
49
49
48
48
48
48
48
48
48
48
48
48
48
48
48
48
48
48
47
47
47
47
47
47
47
47
47
47
47
47
46
46


In [17]:
extensions = df.handle.apply(lambda s: ".".join(s.split(".")[1:]))
extensions_counter = Counter(extensions)
extensions_counter.most_common(100)

[('bsky.social', 46784),
 ('com', 1442),
 ('dev', 359),
 ('net', 188),
 ('me', 159),
 ('xyz', 122),
 ('io', 109),
 ('org', 81),
 ('social', 64),
 ('blue', 61),
 ('co', 56),
 ('com.br', 42),
 ('jp', 42),
 ('art', 30),
 ('lol', 29),
 ('gay', 28),
 ('ca', 28),
 ('co.uk', 26),
 ('sh', 26),
 ('wtf', 25),
 ('codes', 23),
 ('ingroup.social', 22),
 ('de', 22),
 ('tech', 21),
 ('blog', 20),
 ('online', 20),
 ('space', 20),
 ('is', 20),
 ('us', 17),
 ('es', 16),
 ('computer', 15),
 ('ai', 15),
 ('in', 14),
 ('moe', 13),
 ('fyi', 13),
 ('cloud', 13),
 ('app', 12),
 ('se', 12),
 ('info', 12),
 ('id', 11),
 ('zone', 11),
 ('tv', 11),
 ('cc', 11),
 ('ir', 11),
 ('design', 10),
 ('fun', 10),
 ('fm', 10),
 ('ch', 10),
 ('eu', 10),
 ('one', 9),
 ('be', 9),
 ('live', 8),
 ('page', 8),
 ('gg', 8),
 ('cool', 8),
 ('goose.art', 8),
 ('uk', 8),
 ('im', 8),
 ('at', 7),
 ('bsky.team', 7),
 ('re', 7),
 ('world', 7),
 ('work', 7),
 ('club', 7),
 ('so', 7),
 ('pizza', 7),
 ('life', 7),
 ('fr', 6),
 ('pro', 6),
 

In [18]:
extensions_counter['bsky.social'] / len(df)

0.9149651881404991

In [19]:
# now add the dataframe info the the filtered_g as property maps
vprop_handle = g_filtered.new_vertex_property("string")
vprop_display_name = g_filtered.new_vertex_property("string")
vprop_description = g_filtered.new_vertex_property("string")
vprop_avatar_url = g_filtered.new_vertex_property("string")
vprop_indexed_at = g_filtered.new_vertex_property("string")
vprop_date_updated = g_filtered.new_vertex_property("string")
vprop_followers_count = g_filtered.new_vertex_property("int")
vprop_following_count = g_filtered.new_vertex_property("int")

for time_str, row in df_filtered.iterrows():
    v = g_filtered.vertex(time_str)
    vprop_handle[v] = row.handle
    vprop_display_name[v] = row.display_name
    vprop_description[v] = row.description
    vprop_avatar_url[v] = row.avatar_url
    vprop_indexed_at[v] = row.indexed_at
    vprop_date_updated[v] = row.date_updated
    vprop_followers_count[v] = row.followers_count
    vprop_following_count[v] = row.following_count

shorter_handles = []

for v in g_filtered.vertices():
    h = vprop_handle[v]

    if "bsky.social" in h:
        shorter_handles.append(h.replace(".bsky.social", ""))
    else:
        shorter_handles.append(h)

g_filtered.vertex_properties["handle"] = vprop_handle
g_filtered.vertex_properties["short_handle"] = g_filtered.new_vertex_property("string", vals=shorter_handles)
g_filtered.vertex_properties["display_name"] = vprop_display_name
g_filtered.vertex_properties["description"] = vprop_description
g_filtered.vertex_properties["avatar_url"] = vprop_avatar_url
g_filtered.vertex_properties["indexed_at"] = vprop_indexed_at
g_filtered.vertex_properties["date_updated"] = vprop_date_updated
g_filtered.vertex_properties["followers_count"] = vprop_followers_count
g_filtered.vertex_properties["following_count"] = vprop_following_count
g_filtered.vp['assort_block'] = g_filtered.new_vp('int', assort_state.get_blocks().a)
g_filtered.vp['infomap_blocks'] = g_filtered.new_vp('int', infomap_blocks)

# in gephi copy this to an "image" column
# no, just naming it "image" here doesn't work: ????
g_filtered.vp['image_fn'] = g_filtered.new_vp('string', [f"{vprop_handle[v]}.jpg" for v in g_filtered.vertices()])

for time_str in range(5):
    g_filtered.vp[f'ndcsbm_level_{time_str}'] = g_filtered.new_vp('int', state.project_level(time_str).get_blocks().a)



In [20]:
import datetime

indexed_at_timestamp = []

for v in g_filtered.vertices():
    time_str = g_filtered.vp['indexed_at'][v]

    if time_str is not None and time_str != "None":
        indexed_at_timestamp.append(datetime.datetime.strptime(time_str, "%Y-%m-%dT%H:%M:%S.%fZ").timestamp())
    else:
        indexed_at_timestamp.append(0)

# replace 0s with median time of non-zero values
indexed_at_timestamp = np.array(indexed_at_timestamp)
indexed_at_timestamp[indexed_at_timestamp == 0] = np.median(indexed_at_timestamp[indexed_at_timestamp != 0])
indexed_at_timestamp = indexed_at_timestamp.astype(int)

g_filtered.vp['indexed_at_timestamp'] = g_filtered.new_vp('int', indexed_at_timestamp)

In [21]:
g_filtered.vp['indexed_at_str'] = g_filtered.new_vp('string', [datetime.datetime.fromtimestamp(t).strftime("%Y-%m-%d") for t in indexed_at_timestamp])

In [22]:
g_filtered.save("output/bluesky_test_graph.graphml")
g_filtered.save("output/bluesky_test_graph.gt")

In [23]:
A_full = gt.adjacency(g).T
out_degrees = A_full.sum(axis=1).A1
in_degrees = A_full.sum(axis=0).A1

In [24]:
pr = gt.pagerank(g_filtered)
eig, auth, hub = gt.hits(g_filtered)
df_filtered = df[v_filt].copy()
df_filtered['following_count'] = out_degrees[v_filt].astype(int)
df_filtered['followers_count'] = in_degrees[v_filt].astype(int)
df_filtered['authority'] = auth.a
# df_filtered['hub'] = hub.a
df_filtered['pagerank'] = pr.a
df_filtered['authority_rank'] = df_filtered['authority'].rank(ascending=False)
df_filtered['pagerank_rank'] = df_filtered['pagerank'].rank(ascending=False)
df_filtered['followers_rank'] = df_filtered['followers_count'].rank(ascending=False)
cols = ['handle', 'display_name', 'following_count', 'followers_count', 'followers_rank', 'pagerank_rank', 'authority_rank', 'description']
df_filtered.sort_values(by="pagerank_rank").head(50)[cols]

Unnamed: 0,handle,display_name,following_count,followers_count,followers_rank,pagerank_rank,authority_rank,description
70,jay.bsky.team,Jay 🦋,2388,11921,1.0,1.0,1.0,"CEO of Bluesky, steward of AT Protocol. Let’s ..."
12,pfrazee.com,Paul Frazee,359,9434,2.0,2.0,2.0,Developer at Bluesky. The one who puts bugs in...
52,why.bsky.team,Whyrusleeping,7964,7208,4.0,3.0,3.0,"Technical advisor to @bluesky, first engineer ..."
34035,bsky.app,Bluesky,11,9259,3.0,4.0,7.0,Official bluesky account (check domain 👆)\n\nF...
6784,aoc.bsky.social,Alexandria Ocasio-Cortez,157,6971,5.0,5.0,53.0,"Congresswoman for NY14, repping the Bronx and ..."
40179,rose.bsky.team,Rose 🌹,2255,4723,12.0,6.0,8.0,"Bluesky team 🙋🏻‍♀️💙\nStrategy, Ops, Growth, & ..."
50938,emily.bsky.team,Emily,473,4826,11.0,7.0,11.0,🌀 dev rel + community at bluesky\n🤖 @earthquak...
50951,vishal.bsky.social,Vishal Gulia 🌞,51096,6907,7.0,8.0,4.0,"Bluesky’s first memer, early adopter & investo..."
51131,tho.bsky.social,Thomas Pockrandt,51126,6952,6.0,9.0,5.0,Tech Advisor & Digital Strategist 🦾
50936,dholms.xyz,daniel 🫠,830,2764,31.0,10.0,22.0,dreaming of protocol level sovereignty \n\nbsk...
