In [1]:
import itertools

In [2]:
from pybtex.database import parse_file as pybtex_parse_file

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

In [4]:
%matplotlib inline

In [5]:
# Load the BibTeX bibliography; later cells iterate over `vispapers.entries`
# to read each entry's fields (title, year, keywords) and author list.
vispapers = pybtex_parse_file('groups-in-graphs.corinna-vehlow.com.bib')

In [6]:
# Empty undirected graph; a later cell fills it with one node per author and
# one weighted edge per pair of co-authors.
G = nx.Graph()

In [48]:
from collections import Counter

# Tally the keyword annotations of every bibliography entry.
#
# Keywords have the form "category:value" (e.g. "graph_vis:node-link") or are
# bare tags (e.g. "radial"). The result maps each category (or bare tag) to a
# Counter of its values; bare tags are counted under the placeholder value '_'.
all_keywords = {}
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for name, entry in vispapers.entries.items():
    # Entries without a 'keywords' field contribute nothing instead of
    # aborting the whole tally with a KeyError.
    raw_keywords = entry.fields.get('keywords', '')
    for k in raw_keywords.split(','):
        k = k.strip()  # tolerate missing or extra spaces after the comma
        if not k:
            continue
        # Split on the first colon only, so a value that itself contains ':'
        # is kept whole rather than truncated at the second colon.
        category, colon, value = k.partition(':')
        if not colon:
            value = '_'
        all_keywords.setdefault(category, Counter())[value] += 1

# Show the raw keyword list of the last entry visited, as a format example.
entry.fields['keywords'].split(', ')

[u'paper_type:technique',
 u'group_type:vertex-based',
 u'graph_vis:node-link',
 u'group_overlap:overlapping_crisp',
 u'group_structure:flat',
 u'graph:generic',
 u'group_origin:categorical_attr',
 u'application:social_network',
 u'publication_channel:InfoVis_J',
 u'edge-group_vis:none',
 u'group_vis:juxtaposed',
 u'visual_attr:!',
 u'juxtaposed_vis:separate',
 u'superimposed_vis:!',
 u'embedded_vis:!',
 u'implicit_vis:!',
 u'evaluation:case_study',
 u'radial']

In [19]:
# Build the co-authorship graph.
#
# Each author becomes a node carrying 'author_name' and a list of 'papers'
# (title/year dicts); each pair of authors sharing a paper gets an edge whose
# 'weight' counts the papers they wrote together.
#
# Start from an empty graph so that re-running this cell does not append
# duplicate papers to existing nodes or double-count edge weights.
G = nx.Graph()
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for name, entry in vispapers.entries.items():
    authors_this_paper = []
    this_paper = {'title': entry.fields['title'], 'year': entry.fields['year']}
    for author in entry.persons['author']:
        author_name = "{}, {}".format(author.get_part_as_text('last'), author.get_part_as_text('first'))
        # Drop the backslash-quote sequence (LaTeX umlaut escape) from the name.
        author_name = author_name.replace('\\"', '')
        if G.has_node(author_name):
            G.node[author_name]['papers'].append(this_paper)
        else:
            G.add_node(author_name, attr_dict={'author_name': author_name, 'papers': [this_paper]})
        authors_this_paper.append(author_name)
    # combinations() yields nothing for fewer than two authors, so
    # single-author papers add a node but no edge.
    for a1, a2 in itertools.combinations(authors_this_paper, 2):
        if G.has_edge(a1, a2):
            G[a1][a2]['weight'] += 1
        else:
            G.add_edge(a1, a2, weight=1)
        

In [20]:
# Materialise the connected components of G as a list (one collection of
# nodes per component) so they can be iterated more than once.
components = list(nx.connected_components(G))

In [21]:
# List every co-author pair, strongest collaboration (highest weight) first.
edges_by_weight = sorted(
    G.edges(data=True),
    key=lambda edge: edge[2]['weight'],
    reverse=True,
)
for a, b, data in edges_by_weight:
    print('{a} | {b} | {w}'.format(a=a, b=b, w=data['weight']))

Weiskopf, Daniel | Vehlow, Corinna | 8
Burch, Michael | Weiskopf, Daniel | 7
Munzner, Tamara | Archambault, Daniel | 6
Munzner, Tamara | Auber, David | 6
Auber, David | Archambault, Daniel | 6
Beck, Fabian | Diehl, Stephan | 6
Burch, Michael | Diehl, Stephan | 5
Didimo, Walter | Liotta, Giuseppe | 5
Auber, David | Bourqui, Romain | 4
Fekete, Jean-Daniel | Henry, Nathalie | 4
Feng, Qing-Wen | Eades, Peter | 4
Montecchiani, Fabrizio | Didimo, Walter | 4
Hu, Yifan | Kobourov, Stephen | 4
Byelas, Heorhiy | Telea, Alexandru | 4
Marriott, Kim | Dwyer, Tim | 4
Muelder, Chris | Ma, Kwan-Liu | 4
Auber, David | Jourdan, Fabien | 3
Nguyen, Quang | Huang, Mao | 3
Beck, Fabian | Burch, Michael | 3
Beck, Fabian | Vehlow, Corinna | 3
Beck, Fabian | Weiskopf, Daniel | 3
Burch, Michael | Vehlow, Corinna | 3
Tal, Ayellet | Frishman, Yaniv | 3
Wijk, Jarke | Pretorius, A. | 3
Schmalstieg, Dieter | Streit, Marc | 3
Schmalstieg, Dieter | Lex, Alexander | 3
Lex, Alexander | Streit, Marc | 3
Bourqui, Romain |

In [22]:
import igraph

In [23]:
# Convert the networkx graph to an igraph graph via its dense adjacency matrix.
# Graph.Adjacency builds a *directed* graph unless told otherwise; G is an
# undirected co-authorship graph, so request an undirected graph explicitly.
g2 = igraph.Graph.Adjacency(nx.to_numpy_matrix(G).tolist(), mode=igraph.ADJ_UNDIRECTED)

In [24]:
# Run igraph's Infomap community detection on g2. The returned clustering is
# only displayed here; it is not assigned to a variable.
g2.community_infomap()

<igraph.clustering.VertexClustering at 0x10e798990>

In [25]:
# nx.write_pajek(G, 'coauthorship.net')

In [103]:
# # Need to clean up the Pajek file: only the number and the name are needed for the vertices
# with open('coauthorship.net', 'r') as f:
#     pjk_txt = f.readlines()
# this_line_is_vertex = False
# with open('coauthorship.net', 'w') as outf:
#     for line in pjk_txt:
#         if line[0] == '*':
#             if line[1].lower() == 'v':
#                 this_line_is_vertex = True
#             else:
#                 this_line_is_vertex = False
#         elif this_line_is_vertex:
#             line = line[:line.find(' 0.0')]
#             line = line + '\n'
#         outf.write(line)

In [26]:
# Parse the clustering tree file into (node, author, cl, flow) tuples.
#
# Each data line is expected to look like:
#     <cl> <flow> "<author>" <node>
# where the quoted author name may itself contain spaces ("Last, First").
# NOTE(review): layout taken from the column order the read_csv call on the
# same file uses (cl, flow, author, node) -- confirm against the actual file.
with open('coauthorship.tree', 'r') as f:
    rows = []
    for line in f:
        line = line.strip()
        # Skip blank lines and '#' header/comment lines.
        if not line or line.startswith('#'):
            continue

        # A plain whitespace split would cut the quoted author name in two,
        # so peel the two leading fields and the trailing field off instead.
        cl, flow, remainder = line.split(None, 2)
        quoted_author, node = remainder.rsplit(None, 1)
        author = quoted_author.strip('"')
        row = (node, author, cl, flow)
        rows.append(row)

In [27]:
# Tabulate the parsed tuples; the column order mirrors the tuple layout.
df = pd.DataFrame(data=rows, columns=['node', 'author', 'cl', 'flow'])

In [28]:
# Read the tree file directly with pandas: space-separated fields,
# double-quoted author names, first two lines skipped.
tree_columns = ['cl', 'flow', 'author', 'node']
df = pd.read_csv(
    'coauthorship.tree',
    sep=' ',
    quotechar='"',
    skiprows=2,
    names=tree_columns,
)

In [29]:
def _parent_cluster(path):
    """Return the cluster path with its last ':'-separated segment removed."""
    return path.rpartition(':')[0]


def _top_cluster(path):
    """Return the first ':'-separated segment of the cluster path."""
    return path.partition(':')[0]


# cl_bottom: the path minus its final segment; cl_top: the leading segment.
df['cl_bottom'] = df['cl'].apply(_parent_cluster)
df['cl_top'] = df['cl'].apply(_top_cluster)

In [30]:
# Index the table by author name so its columns can be turned into
# author-keyed dicts.
df = df.set_index('author')

In [31]:
# Display the clustering table (cl, flow, node, cl_bottom, cl_top per author).
df

Unnamed: 0_level_0,cl,flow,node,cl_bottom,cl_top
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"Weiskopf, Daniel",1:1:1:1,0.018467,145,1:1:1,1
"Vehlow, Corinna",1:1:1:2,0.014774,278,1:1:1,1
"Burch, Michael",1:1:1:3,0.011542,106,1:1:1,1
"Diehl, Stephan",1:1:1:4,0.010157,138,1:1:1,1
"Beck, Fabian",1:1:1:5,0.009234,80,1:1:1,1
"Schmauder, Hansj{{o}}rg",1:1:1:6,0.001385,14,1:1:1,1
"Fritz, Peter",1:1:1:7,0.001385,40,1:1:1,1
"Auw{a}rter, Patrick",1:1:1:8,0.001385,140,1:1:1,1
"Schmidt, Benjamin",1:1:1:9,0.000923,54,1:1:1,1
"Hlawatsch, Marcel",1:1:1:10,0.000923,55,1:1:1,1


In [32]:
from collections import Counter

# Histogram of connected-component sizes: size -> number of components.
c = Counter(len(comp) for comp in components)

In [33]:
# Show the component-size histogram (component size -> how many components).
c

Counter({1: 5,
         2: 24,
         3: 13,
         4: 3,
         5: 5,
         6: 2,
         7: 6,
         9: 4,
         10: 2,
         11: 1,
         12: 1,
         14: 1,
         23: 1,
         31: 1,
         127: 1})

In [34]:
# Copy the per-author clustering columns onto the graph nodes; each column
# becomes a node attribute of the same name, keyed by the author-name index.
for attr in ('cl_bottom', 'cl_top', 'flow'):
    nx.set_node_attributes(G, name=attr, values=df[attr].to_dict())

In [35]:
# Inspect the nodes together with their attribute dicts.
G.nodes(data=True)

[('Wu, Jie',
  {'author_name': 'Wu, Jie',
   'cl_bottom': '5:1',
   'cl_top': '5',
   'flow': 0.0023083999999999999,
   'papers': [{'title': u'{VisANT}: data-integrating visual framework for biological networks and modules',
     'year': u'2005'}]}),
 ('Smith, Adam',
  {'author_name': 'Smith, Adam',
   'cl_bottom': '20',
   'cl_top': '20',
   'flow': 0.00184672,
   'papers': [{'title': u'{RuleBender}: integrated modeling, simulation and visualization for rule-based intracellular biochemistry',
     'year': u'2012'}]}),
 ('Nakazawa, Rina',
  {'author_name': 'Nakazawa, Rina',
   'cl_bottom': '2:1:2:2',
   'cl_top': '2',
   'flow': 0.0023083999999999999,
   'papers': [{'title': u'Integrated Visualization of Gene Network and Ontology Applying a Hierarchical Graph Visualization Technique',
     'year': u'2012'},
    {'title': u'A Visualization of Research Papers Based on the Topics and Citation Network',
     'year': u'2015'}]}),
 ('Hasco{e}t, Mountaz',
  {'author_name': 'Hasco{e}t, Mountaz

In [36]:
# Replace the author-name node labels with the numeric ids from the 'node'
# column of the clustering table.
author_to_node_id = df['node'].to_dict()
G = nx.relabel_nodes(G, author_to_node_id)

In [37]:
# Inspect the nodes and their attribute dicts again after relabelling.
G.nodes(data=True)

[(1,
  {'author_name': 'Wu, Jie',
   'cl_bottom': '5:1',
   'cl_top': '5',
   'flow': 0.0023083999999999999,
   'papers': [{'title': u'{VisANT}: data-integrating visual framework for biological networks and modules',
     'year': u'2005'}]}),
 (2,
  {'author_name': 'Smith, Adam',
   'cl_bottom': '20',
   'cl_top': '20',
   'flow': 0.00184672,
   'papers': [{'title': u'{RuleBender}: integrated modeling, simulation and visualization for rule-based intracellular biochemistry',
     'year': u'2012'}]}),
 (3,
  {'author_name': 'Nakazawa, Rina',
   'cl_bottom': '2:1:2:2',
   'cl_top': '2',
   'flow': 0.0023083999999999999,
   'papers': [{'title': u'Integrated Visualization of Gene Network and Ontology Applying a Hierarchical Graph Visualization Technique',
     'year': u'2012'},
    {'title': u'A Visualization of Research Papers Based on the Topics and Citation Network',
     'year': u'2015'}]}),
 (4,
  {'author_name': 'Hasco{e}t, Mountaz',
   'cl_bottom': '45',
   'cl_top': '45',
   'flow':

In [38]:
from networkx.readwrite import json_graph
import json

In [39]:
# Serialise the graph in node-link form and write it to disk as JSON.
json_data = json_graph.node_link_data(G)
with open('coauthorship.json', 'w') as outf:
    outf.write(json.dumps(json_data))

In [40]:
# Read the exported file back in to check what was actually written.
with open('coauthorship.json', 'r') as f:
    j = json.loads(f.read())

In [29]:
# Count how many fields each exported node record has (field count -> nodes).
ctypes = Counter(len(node) for node in j['nodes'])

In [30]:
# Show the tally of per-node field counts from the reloaded JSON.
ctypes

Counter({4: 457})

In [31]:
# Inspect the nodes together with their attribute dicts.
G.nodes(data=True)

[(1, {'author_name': 'Wu, Jie', 'cl_bottom': '5:1', 'cl_top': '5'}),
 (2, {'author_name': 'Smith, Adam', 'cl_bottom': '20', 'cl_top': '20'}),
 (3, {'author_name': 'Nakazawa, Rina', 'cl_bottom': '2:1:2:2', 'cl_top': '2'}),
 (4, {'author_name': 'Mader, Martin', 'cl_bottom': '12', 'cl_top': '12'}),
 (5, {'author_name': 'Sangal, Neeraj', 'cl_bottom': '27', 'cl_top': '27'}),
 (6, {'author_name': 'Elmqvist, Niklas', 'cl_bottom': '4:1', 'cl_top': '4'}),
 (7, {'author_name': 'Philippi, Stephan', 'cl_bottom': '8', 'cl_top': '8'}),
 (8, {'author_name': 'Vernik, Rudi', 'cl_bottom': '30', 'cl_top': '30'}),
 (9,
  {'author_name': 'Goesmann, Alexander', 'cl_bottom': '21', 'cl_top': '21'}),
 (10, {'author_name': 'Tanaka, Jiro', 'cl_bottom': '22:1', 'cl_top': '22'}),
 (11, {'author_name': 'Mary, Patrick', 'cl_bottom': '2:1:1:1', 'cl_top': '2'}),
 (12, {'author_name': 'Nguyen, QuanHoang', 'cl_bottom': '12', 'cl_top': '12'}),
 (13, {'author_name': 'Kramer, Andrei', 'cl_bottom': '1:1:2', 'cl_top': '1'}),

In [32]:
# One row per node with its degree (number of distinct co-authors).
degree_by_node = G.degree()
deg = pd.DataFrame.from_dict(degree_by_node, orient='index')
deg = deg.rename(columns={0: 'degree'})

In [33]:
# Degree distribution: how many nodes have each degree value.
deg.degree.value_counts()

2     91
4     75
3     70
1     60
6     50
5     37
8     29
7     13
11     7
9      5
0      5
10     4
12     4
13     2
20     1
14     1
16     1
18     1
22     1
Name: degree, dtype: int64

In [48]:
# Depth of each cl_bottom path = number of ':'-separated segments,
# i.e. one more than the number of colons it contains.
df['cl_depth'] = df['cl_bottom'].str.count(':') + 1

In [53]:
# Number of distinct cl_bottom clusters among rows deeper than one level.
nested_clusters = df.loc[df.cl_depth > 1, 'cl_bottom']
nested_clusters.value_counts().shape

(41,)