In [34]:
# ! pip install -q graphistry[igraph]

# PyGraphistry Example: Graphing the Marvel Universe
### Plots hero social network based on co-appearances between heroes

*  Uses pandas, igraph, and PyGraphistry
*  Combines comic book and hero data
*  Near the end, computes clusters and, to avoid a hairball, weakens the edge weights between nodes of different clusters


In [None]:
import graphistry, igraph, pandas as pd

# To specify Graphistry account & server, use:
# graphistry.register(api=3, username='...', password='...')
# For more options, see https://github.com/graphistry/pygraphistry#configure


# Load heroes, comics, appearances

In [None]:
#characters_path = 'https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/characters.txt'
characters_path = '../../data/characters.txt'

# Read each line as one raw string, then pull the numeric id and the name out
# of it with two regexes. `assign` evaluates its keywords in order, so
# `hero_id` is extracted from the raw line before `hero_name` is overwritten.
heroes = (
    pd.read_csv(characters_path, encoding='unicode_escape', names=['hero_name'])
    .assign(
        hero_id=lambda raw: raw['hero_name'].str.extract(r'^Vertex ([0-9]+):', expand=False),
        hero_name=lambda raw: raw['hero_name'].str.extract(r'^Vertex [0-9]+: (.*)', expand=False),
    )
)
heroes.sample(3)

Unnamed: 0,hero_name,hero_id
4387,PSI-LORD/FRANKLIN BENJAMIN RICHARDS,4388
1968,FORTUNATO,1969
1105,COFFY,1106


In [None]:
#comics_path = 'https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/comics.txt'
comics_path = '../../data/comics.txt'

# Same approach as for heroes: one raw string per line, split into id and
# name by regex. `comic_id` is computed first, while `comic_name` still holds
# the raw line.
comics = (
    pd.read_csv(comics_path, encoding='unicode_escape', names=['comic_name'])
    .assign(
        comic_id=lambda raw: raw['comic_name'].str.extract(r'Vertex ([0-9]+):', expand=False),
        comic_name=lambda raw: raw['comic_name'].str.extract(r'Vertex [0-9]+: (.*)', expand=False),
    )
)
comics

Unnamed: 0,comic_name,comic_id
0,AA2 35,6487
1,M/PRM 35,6488
2,M/PRM 36,6489
3,M/PRM 37,6490
4,WI? 9,6491
...,...,...
12937,BIZADV 33,19424
12938,WI 25/2,19425
12939,AA2 30,19426
12940,AA2 20,19427


In [None]:
#appearances_path = 'https://raw.githubusercontent.com/graphistry/pygraphistry/master/demos/data/appearances.txt'
appearances_path = '../../data/appearances.txt'

# Read every line of the file as a single raw string column.
appearance_lines = pd.read_csv(appearances_path, encoding='unicode_escape', names=['comic'])

# Skip the leading rows that are not hero->comics edges: one per hero, one per
# comic, plus 2 extra lines (presumably section headers -- confirm against the
# data file). `.copy()` makes the slice an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
appearances = appearance_lines.iloc[len(heroes) + len(comics) + 2:].copy()

# Each remaining line is "<hero id> <comic id> <comic id> ...".
appearances['hero'] = appearances['comic'].str.extract(r'^([0-9]+) ', expand=False)
appearances['comic'] = (
    appearances['comic']
    .str.extract(r'^[0-9]+ (.*)', expand=False)
    .str.split(' ')
)

# One row per (hero, comic) pair.
appearances = appearances.explode('comic')
appearances[:3]

Unnamed: 0,comic,hero
19430,6487,1
19431,6488,2
19431,6489,2


# Link heroes who co-appear

In [None]:
# You may need to install numexpr: pip install numexpr

# Self-join appearances on `comic` to pair up heroes that share a comic, keep
# only comics listed in `comics`, and keep a single orientation of each pair.
# Hero ids are strings (regex extracts), so `>` compares lexicographically;
# that is still enough to drop self-pairs and mirrored duplicates.
coappearances = (
    appearances
        .merge(appearances, on='comic')
        .merge(comics, left_on='comic', right_on='comic_id')
        [['hero_x', 'hero_y']]
        .query('hero_x > hero_y')
)

# One row per hero pair, with the number of comics they share. A single
# groupby replaces the earlier drop_duplicates + aligned assignment;
# sort=False keeps pairs in order of first appearance.
unique_coappearances = (
    coappearances
        .groupby(['hero_x', 'hero_y'], sort=False)
        .size()
        .to_frame('counts')
)

# NOTE: the two names differ by one letter. `unique_coappearances` is indexed
# by (hero_x, hero_y); `unique_coappearences` (with an "e") is the flat edge
# table that later cells use.
unique_coappearences = unique_coappearances.reset_index()

print('#edges', len(unique_coappearances))
unique_coappearances[:3]

#edges 168267


Unnamed: 0_level_0,Unnamed: 1_level_0,counts
hero_x,hero_y,Unnamed: 2_level_1
1999,1,1
6459,1,1
6459,1999,1


# Plot!

In [None]:
# Base graph built from the flat edge table: the `hero_x` column is bound as
# edge source and `hero_y` as edge destination.
g = graphistry.edges(unique_coappearences, source='hero_x', destination='hero_y')

In [None]:
# Plot the edge-only graph (no node table or labels bound yet).
g.plot()

# Label Nodes & Edges

In [None]:
# Two dataframes are now in play: the edges already on `g`, plus `heroes` as
# the node table keyed by `hero_id`. Node titles come from `hero_name`, edge
# titles from `counts`.
g2 = g.nodes(heroes, 'hero_id').bind(point_title='hero_name', edge_title='counts')

In [None]:
# Plot again, now with `hero_name` bound as node title and `counts` as edge title.
g2.plot()

# Color using igraph infomap

### Infomap Community Detection

In [None]:
# Warning: slow
# Convert to an undirected igraph graph and tag every vertex with its
# infomap community id.
ig = g2.to_igraph(directed=False)
infomap_communities = ig.community_infomap()
ig.vs['cluster'] = infomap_communities.membership

# Load just the desired attributes: reuse the original edges and enrich the
# nodes with `cluster` only.
g3a = g2.from_igraph(ig, node_attributes=['hero_id', 'cluster'], load_edges=False)
g3a._nodes.sample(3)

Unnamed: 0,hero_id,cluster,hero_name
4906,4907,81,SCHEELE
2700,2701,81,KALE
4116,4117,159,OZ


In [28]:
# Fold cluster ids into 9 color slots (cluster % 9), stored as int32.
cluster_color = (g3a._nodes['cluster'] % 9).astype('int32')
g3b = g3a.nodes(g3a._nodes.assign(color=cluster_color))

# Color points by that column and use co-appearance counts as edge weight.
g3 = g3b.encode_point_color('color').bind(edge_weight='counts')

g3._nodes.sample(3)

Unnamed: 0,hero_id,cluster,hero_name,color
2707,2708,5,KAMIKAZE,5
4727,4728,8,ROGUE | MUTANT X-VERSE,8
4166,4167,15,PARTRIDGE,6


In [None]:
# Plot with points colored by the `color` column and edges weighted by `counts`.
g3.plot()

# Restrict to biggest communities

In [29]:
# Count distinct heroes per cluster and keep clusters with more than 100.
# A direct groupby/nunique replaces the earlier pivot_table, which ran a
# Python lambda over every column just to keep one. dropna=False keeps the
# original counting rule (a missing id counts as one distinct value).
big_clusters = (
    g3._nodes
    .groupby('cluster')['hero_id']
    .nunique(dropna=False)
    .to_frame('cluster_size')
    .query('cluster_size > 100')
    .reset_index()[['cluster', 'cluster_size']]
)
print('# big clusters', len(big_clusters))
big_clusters[:3]

# big clusters 10


Unnamed: 0,cluster,cluster_size
0,1,1354
1,2,198
2,5,836


In [30]:
# Inner join on `cluster`: keeps only heroes in a big cluster and attaches
# that cluster's size to each of them.
good_nodes = pd.merge(g3._nodes, big_clusters, on='cluster')
print('# nodes', len(good_nodes))
good_nodes.head(3)

# nodes 3711


Unnamed: 0,hero_id,cluster,hero_name,color,cluster_size
0,2,1,3-D MAN/CHARLES CHANDLER & HAROLD CHANDLER,1,1354
1,10,1,ABOMINATION/EMIL BLONSKY,1,1354
2,18,1,ACBA,1,1354


In [31]:
# Keep only edges whose two endpoints both survive in `good_nodes`: each
# inner merge drops edges whose hero on that side is missing.
good_edges = (
    unique_coappearences
    .merge(good_nodes, left_on='hero_x', right_on='hero_id')
    .merge(good_nodes, left_on='hero_y', right_on='hero_id')
    .loc[:, ['hero_x', 'hero_y', 'counts']]
)
print('# edges', len(good_edges))
good_edges.head(3)

# edges 118318


Unnamed: 0,hero_x,hero_y,counts
0,2186,2,3
1,2555,2,3
2,3491,2,3


In [32]:
# Swap in the filtered edge and node tables, keeping the bindings set on g3.
g4 = (
    g3
    .edges(good_edges)
    .nodes(good_nodes)
)

g4.plot()

# Separate communities
### Treat intra-community edges as strong edge weights, and inter-community edges as weak edge weights

In [None]:
# Label each edge by whether it stays inside one cluster or links two
# clusters, then derive the edge weight from that label.
# The cluster of each endpoint is looked up with one merge per side; the
# comparison and the weight mapping are vectorized column operations rather
# than row-by-row `apply(axis=1)` calls.
good_edges2 = (
    good_edges
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_x'}),
        left_on='hero_x', right_on='hero_id')
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_y'}),
        left_on='hero_y', right_on='hero_id')
    .assign(
        is_inner=lambda edges: edges['cluster_x'] == edges['cluster_y'],
        # bound to edge_weight later: 10 inside a cluster, 8 between clusters
        weight=lambda edges: edges['is_inner'].map({True: 10, False: 8}),
    )
    [['hero_x', 'hero_y', 'counts', 'is_inner', 'weight']]
)
good_edges2[:3]

Unnamed: 0,hero_x,hero_y,counts,is_inner,weight
0,2186,2,3,True,10
1,2555,2,3,True,10
2,3491,2,3,True,10


### Plot; control the edge weight in the settings panel

In [33]:
# Bind the cluster-aware `weight` column as edge weight and swap in the
# labeled edge table.
weighted_graph = g4.bind(edge_weight='weight').edges(good_edges2)

# Pass initial visualization settings as URL parameters.
g5 = weighted_graph.settings(url_params={'edgeInfluence': 0.7, 'edgeOpacity': 0.5})

g5.plot()

# Filter by k-core shell

In [None]:
# Convert once and compute the shell index on that same graph; the earlier
# version called g5.to_igraph() a second time just to compute the shells.
ig5 = g5.to_igraph()
ig5.vs['shell'] = ig5.shell_index()

# Number of distinct shell values found
print('#shells', len(set(ig5.vs['shell'])))

#shells 95


In [None]:
# Show the edge table currently attached to g5.
g5._edges

Unnamed: 0,hero_x,hero_y,counts,is_inner,weight
0,2186,2,3,True,10
1,2555,2,3,True,10
2,3491,2,3,True,10
3,6022,2,3,True,10
4,859,2,3,True,10
...,...,...,...,...,...
113111,3493,335,1,True,10
113112,993,991,1,True,10
113113,805,6394,1,True,10
113114,818,4670,1,True,10


### Plot: Use the histogram tool to filter for the smaller shells

In [None]:
# Update just the attributes we want: copy `shell` (with `hero_id`) from
# igraph back onto the nodes, leaving g5's edges as they are.
g6 = g5.from_igraph(ig5, load_edges=False, node_attributes=['shell', 'hero_id'])
g6.plot()