# Collaboration Network Analysis

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network

## 1. Load and preprocess the collaboration data

In [2]:
'''
#index 1
#n O. Willum
#a Res. Center for Microperipherik, Technische Univ. Berlin, Germany
#pc 1
#cn 0
#hi 0
#pi 0.0000
#upi 0.0000
#t new product;product group;active product;long product lifetime;old product;product generation;new technology;environmental benefit;environmental choice;environmental consequence
'''
# The string literal above is a sample record from the author file: one
# '#<tag> <value>' line per tag, in the order listed in `detail` below.

with open('data/AMiner-Author.txt', 'r') as file:
    # Records are separated from each other by a blank line.
    data = file.read().split('\n\n')

# author[<index>] -> dict keyed by tag name, plus a derived 'field' word list.
author = {}
detail = ['index', 'n', 'a', 'pc', 'cn', 'hi', 'pi', 'upi', 't']
for auth in data:
    # Strip surrounding newlines so an extra blank line between records cannot
    # shift the tag lines out of position.
    chunk = auth.strip('\n')
    if chunk == '':
        # Empty chunk (e.g. trailing blank lines): skip it instead of
        # aborting the whole load.
        continue
    info = chunk.split('\n')
    index = info[0].split(' ')[1]

    record = {}
    for num, each in enumerate(detail):
        prefix = '#' + each + ' '
        line = info[num]
        # Remove the tag only where it is the line prefix; occurrences of the
        # same text inside the value are left untouched.
        record[each] = line[len(prefix):] if line.startswith(prefix) else line

    record['a'] = record['a'].split(', ')
    record['t'] = record['t'].split(';')
    # 'field' flattens every ';'-separated phrase of 't' into single words.
    record['field'] = [word for phrase in record['t'] for word in phrase.split(' ')]
    author[index] = record

In [3]:
def filter(keyword, by_type):
    """Return the ids of all authors whose `by_type` entry contains `keyword`.

    Looks at the module-level `author` dict and applies Python's `in`
    operator to `author[id][by_type]`, so the match is a membership test
    when that entry is a list and a substring test when it is a string.
    Ids are returned in the iteration order of `author`.

    Note: this name shadows the built-in `filter` for the rest of the notebook.
    """
    return [
        author_id
        for author_id, record in author.items()
        if keyword in record[by_type]
    ]

In [4]:
# Build a list of (author1, author2, count) tuples from the co-author file.
# Each line is split on tabs after dropping its first character
# (presumably a leading '#' marker -- confirm against the data file).
coauthors = []
with open('data/AMiner-Coauthor.txt') as coauthorfile:
    for line in coauthorfile:
        if not line.strip():
            # Tolerate blank lines (e.g. at end of file) instead of failing
            # on the three-way unpacking below.
            continue
        author1, author2, count = line[1:].split('\t')
        # int() ignores the trailing newline that is still attached to count.
        coauthors.append((author1, author2, int(count)))

In [5]:
coauthors[:10]

[('522324', '1034146', 1),
 ('1355779', '1229932', 2),
 ('688814', '947067', 2),
 ('1329221', '1140429', 1),
 ('742331', '314944', 1),
 ('898041', '1061829', 1),
 ('1075448', '1040028', 1),
 ('1218654', '1244844', 2),
 ('117148', '364153', 2),
 ('1335705', '738530', 1)]

In [6]:
# Order the pairs from the highest to the lowest count (third tuple element);
# pairs with equal counts keep their relative order from `coauthors`.
sortedcoauthors = sorted(coauthors, key=lambda pair: pair[2], reverse=True)
sortedcoauthors[:10]

[('111806', '977442', 320),
 ('966551', '111806', 320),
 ('966551', '977442', 320),
 ('980079', '68033', 310),
 ('549347', '80953', 306),
 ('324627', '33938', 234),
 ('860814', '1693619', 216),
 ('946534', '1536687', 194),
 ('833156', '815734', 143),
 ('218997', '173556', 132)]

## 2. Visualize the top 10 collaboration pairs

In [7]:
# Credit: https://gist.github.com/quadrismegistus/92a7fba479fc1e7d2661909d19d4ae7e

def visualize(networkx_graph, name):
    """Render a networkx graph with pyvis and write it to 'docs/<name>'.

    Every node and edge is copied into a pyvis `Network` together with its
    attributes. For an edge that has a 'weight' but neither 'value' nor
    'width', 'value' is set to the weight on the pyvis side.

    Parameters:
        networkx_graph: graph providing `nodes(data=True)` and `edges(data=True)`.
        name: output file name, appended to the 'docs/' directory prefix.

    Returns:
        Whatever `Network.show` returns for the written file.
    """
    pyvis_graph = Network(height=800, width=800, notebook=True)
    for node, node_attrs in networkx_graph.nodes(data=True):
        pyvis_graph.add_node(node, **node_attrs)

    for source, target, edge_attrs in networkx_graph.edges(data=True):
        # Work on a copy: the dict yielded here is the graph's own attribute
        # storage, so adding 'value' to it would silently modify the caller's
        # graph (and the parent graph when a subgraph view is passed in).
        pyvis_attrs = dict(edge_attrs)
        if 'value' not in pyvis_attrs and 'width' not in pyvis_attrs and 'weight' in pyvis_attrs:
            pyvis_attrs['value'] = pyvis_attrs['weight']
        pyvis_graph.add_edge(source, target, **pyvis_attrs)

    return pyvis_graph.show('docs/' + name)

In [8]:
def make_graph(coauth_tuples):
    """Build an undirected graph from (author1, author2, count) records.

    Each record contributes one edge between its first two elements, with
    the third element stored as the edge's 'weight' attribute. A later
    record for the same pair overwrites the earlier weight.
    """
    network = nx.Graph()
    for record in coauth_tuples:
        first, second, papers = record[0], record[1], record[2]
        network.add_edge(first, second, weight=papers)
    return network

In [9]:
# Remove every node that has no path to the given node

import copy
def remove_disconnected(G, node):
    """Remove from `G`, in place, every node that has no path to `node`.

    The set of nodes to keep is computed with a single traversal starting
    at `node`, rather than one path search per node of the graph.

    Parameters:
        G: networkx graph; it is modified in place.
        node: the node whose connected neighbourhood is kept.

    Returns:
        None.

    Raises:
        nx.NodeNotFound: if `G` is non-empty and `node` is not one of its nodes.
    """
    if node not in G:
        if len(G) == 0:
            # Nothing to prune in an empty graph.
            return
        raise nx.NodeNotFound('Target node {!r} is not in the graph'.format(node))

    if G.is_directed():
        # On a directed graph "has a path to node" means being an ancestor
        # of node (or node itself).
        reachable = nx.ancestors(G, node) | {node}
    else:
        reachable = nx.node_connected_component(G, node)

    # Build the removal list first so the graph is not mutated while iterating.
    G.remove_nodes_from([each for each in G if each not in reachable])

In [10]:
# visualize(make_graph(sortedcoauthors[:10]), 'top10.html')

In [11]:
# visualize(make_graph(sortedcoauthors[:1000]), 'top1000.html')

In [12]:
G = make_graph(coauthors)
print('whole graph connected: ',nx.is_connected(G))
print('numbers of node in whole graph',G.number_of_nodes())

# Find the largest degree in the whole graph and every node that attains it.
degree_by_node = dict(G.degree)
deg = max(degree_by_node.values(), default=0)
node_id = [
    candidate
    for candidate, candidate_degree in degree_by_node.items()
    if candidate_degree == deg
]
print("nodes with maximum degree", node_id)
print('degree' ,deg)

whole graph connected:  False
numbers of node in whole graph 1560640
nodes with maximum degree ['1642231']
degree 551


In [None]:
# Reduce G to the nodes connected to '1642231', the maximum-degree node
# printed by the previous cell. remove_disconnected modifies G in place, so
# any cell run after this one works on the pruned graph.
remove_disconnected(G, '1642231')
print('number of node after removed: ',G.number_of_nodes())
print('all connected: ',nx.is_connected(G))

## 3. Filter data

In [13]:
mathematics = filter('mathematics', 'field')

In [None]:
print(mathematics)

In [14]:
# Restrict G to the authors selected by the 'mathematics' keyword filter.
math_graph = G.subgraph(mathematics)

In [16]:
visualize(math_graph, 'mathematics.html')

In [17]:
graph = filter('graph', 'field')

In [18]:
# Restrict G to the authors selected by the 'graph' keyword filter.
graph_graph = G.subgraph(graph)

In [19]:
visualize(graph_graph, 'graph.html')

In [None]:
math = filter('Beifang Chen', 'n')

In [None]:
print(math)

In [None]:
print(author['1082338'])

In [None]:
print(author['188513'])