# PyGraphistry Example: Graphing the Marvel Universe
### Uses pandas, igraph, and PyGraphistry
### Install: `pip install pandas python-igraph graphistry`
Note: `pip install igraph` is the wrong package; use `python-igraph` instead.

In [1]:
from __future__ import print_function
from io import open
import pandas
import igraph # Install Igraph with pip install python-igraph
import graphistry

# Register the API key with PyGraphistry; replace the placeholder string with your own key.
# NOTE(review): consider reading the key from an environment variable so it is never committed.
graphistry.register(key='<Email pygraphistry@graphistry.com to get your key>')
# Show the installed PyGraphistry version (the recorded run reports '0.9.3').
graphistry.__version__

'0.9.3'

# Load heroes, comics, appearances
### <font style="color:red">Data should be in `data/<file>.txt`</font>

In [2]:
# Read the character listing. For each line, the id is the second
# space-separated token before the first ':', and the name is everything
# after the first ': ' up to the end of the line.
with open('data/characters.txt', encoding="latin-1") as f:
    lines = f.readlines()
heroes = pandas.DataFrame(
    [
        (int(line.split(':')[0].split(' ')[1]), line.split(': ', 1)[1].split('\n')[0])
        for line in lines
    ],
    columns=['hero_id', 'hero_name'],
)
print('#Heroes:', len(heroes))
heroes[:3]

#Heroes: 6486


Unnamed: 0,hero_id,hero_name
0,1,24-HOUR MAN/EMMANUEL
1,2,3-D MAN/CHARLES CHANDLER & HAROLD CHANDLER
2,3,4-D MAN/MERCURIO


In [3]:
# Read the comic listing. For each line, the id is the second
# space-separated token before the first ':', and the name is everything
# after the first ': ' up to the end of the line.
with open('data/comics.txt', encoding="latin-1") as f:
    lines = f.readlines()
comics = pandas.DataFrame(
    [
        (int(line.split(':')[0].split(' ')[1]), line.split(': ', 1)[1].split('\n')[0])
        for line in lines
    ],
    columns=['comic_id', 'comic_name'],
)
print('#Comics: ', len(comics))
comics[:3]

#Comics:  12942


Unnamed: 0,comic_id,comic_name
0,6487,AA2 35
1,6488,M/PRM 35
2,6489,M/PRM 36


In [4]:
# Keep only the lines after the first len(heroes) + len(comics) + 2 lines.
# NOTE(review): presumably the file repeats the hero and comic listings plus
# two header lines before the appearance records - confirm against the data file.
skip_count = len(heroes) + len(comics) + 2
with open('data/appearances.txt', encoding="latin-1") as f:
    lines = f.readlines()[skip_count:]
def expand(line):
    """Turn one appearance record into (hero, comic) pairs.

    `line` is a whitespace-separated list of integers: the first is the hero
    id and every following one is a comic id for that hero.

    Returns a list of `(hero_id, comic_id)` tuples, one per comic id. Blank
    lines and lines holding only a hero id yield an empty list.

    Raises ValueError if a token is not an integer.
    """
    # split() with no argument collapses runs of whitespace and ignores the
    # trailing newline, so stray or doubled spaces no longer produce empty
    # tokens that int() would reject.
    ids = [int(token) for token in line.split()]
    if not ids:
        return []
    hero_id = ids[0]
    return [(hero_id, comic_id) for comic_id in ids[1:]]
# Flatten the per-line pair lists from expand() into one two-column table,
# keeping the order of the lines and of the pairs within each line.
appearences = pandas.DataFrame(
    [pair for line in lines for pair in expand(line)],
    columns=['hero', 'comic'],
)
appearences[:3]

Unnamed: 0,hero,comic
0,1,6487
1,2,6488
2,2,6489


# Link heroes who co-appear

In [5]:
# Self-join the appearances on `comic` so every two heroes sharing a comic
# form a row, inner-join against `comics` on the comic id, and keep a single
# orientation (hero_x > hero_y) so each pair is listed once and self-pairs
# are dropped.
# You may need to install numexpr: pip install numexpr
coappearences = (
    appearences
    .merge(appearences, on='comic')
    .merge(comics, left_on='comic', right_on='comic_id')
    [['hero_x', 'hero_y']]
    .query('hero_x > hero_y')
)
# One row per hero pair with the number of shared comics in `counts`;
# sort=False keeps the pairs in order of first appearance.
unique_coappearences = (
    coappearences
    .groupby(['hero_x', 'hero_y'], sort=False)
    .size()
    .reset_index(name='counts')
)
print('#edges', len(unique_coappearences))
unique_coappearences[:5]

#edges 168267


Unnamed: 0,hero_x,hero_y,counts
0,1999,1,1
1,6459,1,1
2,6459,1999,1
3,6463,1,1
4,6463,1999,18


# Plot!

In [6]:
# Bind edge-table columns: hero_x is the source, hero_y the destination, and counts the edge title.
plotter = graphistry.bind(source='hero_x', destination='hero_y', edge_title='counts')

In [7]:
# Plot the co-appearance edge table using the column bindings set on `plotter`.
plotter.plot(unique_coappearences)

ValueError: ('Server reported error:', u'incorrect header check')

# Label Nodes

In [7]:
# Extend the edge bindings with node bindings: hero_id is the node key and hero_name the point title.
plotter2 = plotter.bind(node='hero_id', point_title='hero_name')
# Here we are using two dataframes, one for edges and one for nodes

In [None]:
# Plot with both tables: edges from unique_coappearences, node attributes from heroes.
plotter2.plot(unique_coappearences, heroes)

# Color using igraph infomap

### Infomap Community Detection

In [8]:
# Convert the edge table to an undirected igraph graph, run infomap community
# detection on it, and convert the graph back into edge/node dataframes.
ig = plotter2.pandas2igraph(unique_coappearences, directed=False)
clusters = ig.community_infomap()
i_edges, i_nodes = plotter2.igraph2pandas(ig)
# Number of distinct community labels assigned by infomap.
print('#clusters', len(set(clusters.membership)))

#clusters 214


In [9]:
# Pair each cluster label with its node by row position: the position is
# exposed as `denseid` on both the membership list and the igraph node table,
# the two are joined on it, and hero names are added via hero_id.
# NOTE(review): assumes i_nodes' index follows the same order as
# clusters.membership - confirm against igraph2pandas.
nodes_colored = (
    pandas.DataFrame({'cluster': clusters.membership})
    .reset_index()
    .rename(columns={'index': 'denseid'})
    .merge(i_nodes.reset_index().rename(columns={'index': 'denseid'}), on='denseid')
    .merge(heroes, on='hero_id')
)
print('#colored nodes', len(nodes_colored))
nodes_colored[:3]

#colored nodes 6467


Unnamed: 0,denseid,cluster,hero_id,hero_name
0,0,32,1999,"FROST, CARMILLA"
1,1,32,1,24-HOUR MAN/EMMANUEL
2,2,32,6459,G'RATH


In [10]:
# Fold the cluster ids into 9 color buckets (cluster id modulo 9). The
# vectorised column operation replaces a row-by-row apply: same values,
# but faster and also valid on an empty frame.
nodes_colored['color'] = nodes_colored['cluster'] % 9
# For each color, count the distinct values in every other column.
nodes_colored.pivot_table(index=['color'], aggfunc=lambda x: len(x.unique()))

Unnamed: 0_level_0,cluster,denseid,hero_id,hero_name
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,24,1751,1751,1751
1,24,1150,1150,1150
2,24,929,929,929
3,24,514,514,514
4,24,481,481,481
5,24,458,458,458
6,24,421,421,421
7,23,368,368,368
8,23,395,395,395


In [12]:
# Add two more bindings: the `color` column drives point color and `counts` is the edge weight.
plotter3 = plotter2.bind(point_color='color', edge_weight='counts')

In [None]:
# Plot all edges with the cluster-colored node table.
plotter3.plot(unique_coappearences,  nodes_colored)

# Restrict to biggest communities

In [13]:
# Per cluster, count the distinct values in each column; the distinct
# hero_id count is taken as the cluster size. Keep clusters with more than
# 100 heroes and return just the cluster id and its size.
big_clusters = (
    nodes_colored
    .pivot_table(index=['cluster'], aggfunc=lambda col: len(col.unique()))
    .rename(columns={'hero_id': 'cluster_size'})
    .query('cluster_size > 100')
    .reset_index()
    [['cluster', 'cluster_size']]
)
print('# big clusters', len(big_clusters))
big_clusters[:3]

# big clusters 9


Unnamed: 0,cluster,cluster_size
0,0,1366
1,1,789
2,2,542


In [14]:
# Inner join on cluster: keeps only heroes whose cluster is listed in
# big_clusters and attaches that cluster's size to each row.
good_nodes = pandas.merge(nodes_colored, big_clusters, on='cluster')
print('# nodes', len(good_nodes))
good_nodes[:3]

# nodes 3564


Unnamed: 0,denseid,cluster,hero_id,hero_name,color,cluster_size
0,6,0,2186,GORILLA-MAN,0,1366
1,7,0,2,3-D MAN/CHARLES CHANDLER & HAROLD CHANDLER,0,1366
2,8,0,2555,HUMAN ROBOT,0,1366


In [15]:
# Keep an edge only when both endpoints are in good_nodes: the two inner
# joins (one per endpoint) drop edges touching any other hero. The joined
# node columns are discarded again by the final column selection.
good_edges = (
    unique_coappearences
    .merge(good_nodes, left_on='hero_x', right_on='hero_id')
    .merge(good_nodes, left_on='hero_y', right_on='hero_id')
    [['hero_x', 'hero_y', 'counts']]
)
print('# edges', len(good_edges))
good_edges[:3]

# edges 111712


Unnamed: 0,hero_x,hero_y,counts
0,2186,2,3
1,2555,2,3
2,3491,2,3


In [None]:
# Plot only the big-cluster subgraph: filtered edges with their filtered node table.
plotter3.plot(good_edges, good_nodes)

# Separate communities

In [27]:
# Look up the cluster of each endpoint: one join per endpoint, with the
# cluster column renamed so the two lookups land in cluster_x and cluster_y.
good_edges2 = (
    good_edges
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_x'}),
        left_on='hero_x', right_on='hero_id')
    .merge(
        good_nodes[['cluster', 'hero_id']].rename(columns={'cluster': 'cluster_y'}),
        left_on='hero_y', right_on='hero_id')
)
# An edge is "inner" when both endpoints are in the same cluster. The
# column-wise comparison replaces a row-by-row apply: same boolean values,
# but faster and also valid when the frame is empty.
good_edges2['is_inner'] = good_edges2['cluster_x'] == good_edges2['cluster_y']
good_edges2 = good_edges2[['hero_x', 'hero_y', 'counts', 'is_inner']]
good_edges2[:3]

Unnamed: 0,hero_x,hero_y,counts,is_inner
0,2186,2,3,True
1,2555,2,3,True
2,3491,2,3,True


In [34]:
# Plot only edges whose endpoints share a cluster, which visually separates
# the communities. `axis` is passed by keyword because the positional form
# drop(label, 1) was removed in pandas 2.0, and the boolean is_inner column
# is used directly as the row mask.
plotter3.plot(good_edges2[good_edges2['is_inner']], good_nodes.drop('hero_name', axis=1))