In [2]:
from graph_tool.all import *
from collections import Counter,defaultdict
from tqdm import tqdm
import numpy as np


# Adapted from https://github.com/martingerlach/hSBM_Topicmodel

In [3]:
# Import necessary libraries
import pandas as pd
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.preprocessing import LabelEncoder
import bz2
import pickle

# Data directory, relative to the notebook's working directory.
path = './data/meneame/real-data/'

# Load the vote records from the gzip-compressed TSV, then drop rows that are
# exact duplicates and renumber the index so it is contiguous again.
votes_file = path + 'df_stories_votes.tsv.gz'
df = pd.read_csv(votes_file, sep="\t", compression="gzip")
df = df.drop_duplicates()
df = df.reset_index(drop=True)

df


Unnamed: 0,story_id,username_vote,story_vote_time,story_vote,username_post
0,3753131,2db82cdb3c46088d4bd4e21ac761f3c7a445ad54,2022-11-27 21:33:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e
1,3753131,f984b2a0792ff35f1242a3ad7015de25ecece1f8,2022-11-27 18:46:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e
2,3753131,0615b413f12a510f105fb9359fc1d3123532eef4,2022-11-27 16:54:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e
3,3753131,df56297302e35f8f795227d58d597b3aebebcd79,2022-11-27 16:43:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e
4,3753131,1c7dabc3f7051c658d96a0a0feb84e0a868d02f7,2022-11-27 16:31:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e
...,...,...,...,...,...
1899456,3835578,3d4c338c1f00b427a0c2dee39f2912155adfb0a2,2023-07-17 12:08:00,1,8a89243e657772bcff175316fd511ca769946d94
1899457,3835578,3fb6fccfacf0f7919d102834cf67535874b4aab9,2023-07-17 12:07:00,1,8a89243e657772bcff175316fd511ca769946d94
1899458,3835578,589fbe2c899ed5d1fad2720164fe1601f9e322a3,2023-07-17 12:02:00,1,8a89243e657772bcff175316fd511ca769946d94
1899459,3835578,5e741036b55cf395d8d79db162e8fb209c05baa2,2023-07-17 11:58:00,1,8a89243e657772bcff175316fd511ca769946d94


In [4]:
# Stack story ids (as strings) and voter names into one series so a single
# encoder assigns integer codes to both from the same code space.
story_labels = df['story_id'].astype(str)
voter_labels = df['username_vote']
all_ids = pd.concat([story_labels, voter_labels])

# Fit the encoder on the combined labels.
encoder = LabelEncoder()
encoder.fit(all_ids)

In [5]:
# Map every story id and every voter name to its integer code from the shared
# encoder, then attach both code columns to the frame.
story_codes = encoder.transform(df['story_id'].astype(str))
voter_codes = encoder.transform(df['username_vote'])
df['story_index'] = story_codes
df['username_index'] = voter_codes


In [6]:
# Display the frame to check the newly added 'story_index' / 'username_index' columns.
df

Unnamed: 0,story_id,username_vote,story_vote_time,story_vote,username_post,story_index,username_index
0,3753131,2db82cdb3c46088d4bd4e21ac761f3c7a445ad54,2022-11-27 21:33:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e,2542,2046
1,3753131,f984b2a0792ff35f1242a3ad7015de25ecece1f8,2022-11-27 18:46:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e,2542,59901
2,3753131,0615b413f12a510f105fb9359fc1d3123532eef4,2022-11-27 16:54:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e,2542,286
3,3753131,df56297302e35f8f795227d58d597b3aebebcd79,2022-11-27 16:43:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e,2542,58717
4,3753131,1c7dabc3f7051c658d96a0a0feb84e0a868d02f7,2022-11-27 16:31:00,1,b596d02a9dce66b03862df10c92eab0f14d42a8e,2542,1268
...,...,...,...,...,...,...,...
1899456,3835578,3d4c338c1f00b427a0c2dee39f2912155adfb0a2,2023-07-17 12:08:00,1,8a89243e657772bcff175316fd511ca769946d94,47310,51303
1899457,3835578,3fb6fccfacf0f7919d102834cf67535874b4aab9,2023-07-17 12:07:00,1,8a89243e657772bcff175316fd511ca769946d94,47310,51412
1899458,3835578,589fbe2c899ed5d1fad2720164fe1601f9e322a3,2023-07-17 12:02:00,1,8a89243e657772bcff175316fd511ca769946d94,47310,52554
1899459,3835578,5e741036b55cf395d8d79db162e8fb209c05baa2,2023-07-17 11:58:00,1,8a89243e657772bcff175316fd511ca769946d94,47310,52798


In [7]:
# One (voter index, story index, vote value) tuple per row of the vote table.
edge_list = list(zip(df['username_index'], df['story_index'], df['story_vote']))

In [8]:
# Build the undirected vote graph: one edge per (voter, story, vote) tuple in
# `edge_list`, with the vote value stored in the internal integer edge
# property "weight".
g = Graph(directed=False)
g.add_edge_list(edge_list, eprops=[('weight', 'int')])

# Two-colour the graph. `part` is only a valid partition when the graph is
# actually bipartite, so fail loudly here instead of letting later cells
# consume a meaningless partition.
is_bip, part = is_bipartite(g, partition=True)
if not is_bip:
    raise ValueError(
        "The vote graph is not bipartite; check that story and user "
        "indices do not overlap."
    )

In [9]:
# Edge sign: 0 for a negative vote weight, 1 otherwise.
signs = [int(w >= 0) for w in g.ep.weight.a]

# Node type: 1 for story vertices, 0 for user vertices.
# This is derived from the encoded story indices rather than from the
# is_bipartite() colouring: a two-colouring only guarantees that neighbours
# differ, so which colour lands on users vs. stories is arbitrary and may
# differ from one connected component to the next. Since `kinds` is later used
# as a hard constraint label, it must reflect the real node type.
kinds = np.zeros(g.num_vertices(), dtype=int)
kinds[df['story_index'].unique()] = 1

# Edge colour for plotting: red for negative votes, blue otherwise.
color = ['blue' if s else 'red' for s in signs]

In [10]:
# Register the per-vertex node type and the per-edge sign / colour as internal
# property maps of the graph, so they can be looked up by name later.
g.vertex_properties['kind'] = g.new_vp("int", vals=kinds)
g.edge_properties['sign'] = g.new_ep("int", vals=signs)
g.edge_properties['color'] = g.new_ep("string", vals=color)

In [11]:
# Handles to the internal property maps used when fitting the model.
clabel = g.vertex_properties['kind']
sign_prop = g.edge_properties['sign']

# The node-type map is passed as both the constraint label (`clabel`) and the
# partition constraint label (`pclabel`) of the block state.
state_args_ = dict(clabel=clabel, pclabel=clabel)

In [12]:
# The fit below is stochastic; seed graph-tool's and NumPy's generators so a
# fresh "Restart & Run All" reproduces the same hierarchy.
SEED = 42
seed_rng(SEED)
np.random.seed(SEED)

# Block-state options: the edge sign is modelled as a discrete-binomial edge
# covariate, degree correction is enabled, and the node-type constraint
# labels from `state_args_` are merged in.
sbm_state_args = dict(
    recs=[g.ep.sign],
    rec_types=["discrete-binomial"],
    deg_corr=True,
    **state_args_,
)

# Fit the nested stochastic block model; verbose=True prints the progress of
# the multilevel MCMC.
state = minimize_nested_blockmodel_dl(
    g,
    state_args=sbm_state_args,
    multilevel_mcmc_args=dict(verbose=True),
)

level: 0
staging multilevel, N = 60189
0 1 -2.51089e+06 25655 0.426241
1 inf -488701 21867 0.852348
2 inf -152891 20973 0.959116
3 inf -95363.8 20419 0.973585
4 inf -72976.2 19926 0.975856
5 inf -62223.2 19440 0.97561
6 inf -53245.5 18988 0.976749
7 inf -51760.1 18479 0.973194
8 inf -48739.2 18005 0.974349
9 inf -42993.8 17587 0.976784
10 inf -45426.3 17177 0.976687
11 inf -45050.8 16767 0.976131
12 inf -41671.3 16374 0.976561
13 inf -43688.9 16002 0.977281
14 inf -41848 15621 0.97619
15 inf -43338.1 15289 0.978747
16 inf -43309 14943 0.977369
17 inf -41113.7 14631 0.979121
18 inf -39336.2 14307 0.977855
19 inf -40320.1 14017 0.97973
20 inf -35268.9 13739 0.980167
21 inf -33720.3 13490 0.981876
22 inf -37018.4 13249 0.982135
23 inf -37271.4 12986 0.980149
24 inf -34038 12784 0.984445
25 inf -30949.7 12566 0.982947
26 inf -28194 12361 0.983686
27 inf -29275.8 12166 0.984225
28 inf -26180.6 11979 0.984629
29 inf -25407.5 11799 0.984974
30 inf -22806.8 11622 0.984999
31 inf -23561.7 11461

In [13]:
# Persist the fitted nested block state so it can be reloaded without refitting.
with open('hsbm.pkl', 'wb') as out_file:
    pickler = pickle.Pickler(out_file)
    pickler.dump(state)

In [15]:
# Render the fitted hierarchy to a PDF using the bipartite layout. Vertices are
# filled by node type, edges are coloured by vote sign, and only a subsample of
# 10000 edges is drawn.
state.draw(
    layout='bipartite',
    vertex_fill_color=clabel,
    edge_color=g.ep.color,
    subsample_edges=10000,
    output='./plots/sbm_meneame.pdf',
)

(<VertexPropertyMap object with value type 'vector<double>', for Graph 0x169ab4d90, at 0x16a687d50>,
 <GraphView object, directed, with 60243 vertices and 60242 edges, edges filtered by (<EdgePropertyMap object with value type 'bool', for Graph 0x2cc304190, at 0x14b4a28d0>, False), vertices filtered by (<VertexPropertyMap object with value type 'bool', for Graph 0x2cc304190, at 0x2cc307310>, False), at 0x2cc304190>,
 <VertexPropertyMap object with value type 'vector<double>', for Graph 0x2cc304190, at 0x1063bb990>)

In [21]:
# Block assignments of the nested state: a list with one integer array per hierarchy level.
state.get_bs()

[PropertyArray([ 8265,  8265,  7783, ...,  8265, 11538, 11538], dtype=int32),
 PropertyArray([6, 3, 3, ..., 3, 9, 6], dtype=int32),
 PropertyArray([4, 0, 4, 0, 4, 1, 1, 4, 0, 4, 1, 0, 4, 1, 1, 4, 4, 1, 4],
               dtype=int32),
 PropertyArray([0, 1, 0, 0, 0], dtype=int32),
 PropertyArray([0, 1, 0], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32),
 PropertyArray([0, 1], dtype=int32)]

In [24]:
# Show the state's summary repr (number of levels and the block counts at each level).
state

<NestedBlockState object, with base <BlockState object with 60189 blocks (19 nonempty), degree-corrected, with 1 edge covariate, for graph <Graph object, undirected, with 60189 vertices and 1899461 edges, 1 internal vertex property, 3 internal edge properties, at 0x169ab4d90>, at 0x2c85cf110>, and 17 levels of sizes [(60189, 19), (19, 5), (5, 3), (3, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2)] at 0x2c85cef90>