# Collaboration Network Analysis

## 0. Initialize

In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
import os.path
import json
import gzip
import sqlite3
from sqlite3 import Error

In [2]:
# global variables

data_cache_prefix = 'data_cache/'
graph_suffix = '.adjlist.gz'
json_suffix = '.json.gz'
database_file = data_cache_prefix+"author.db"

In [3]:
# caching helpers

def open_graph(filename):
    file_exists = os.path.isfile(data_cache_prefix+filename+graph_suffix)
    if file_exists:
        return nx.read_adjlist(data_cache_prefix+filename+graph_suffix)
    else:
        return None

def save_graph(G, filename):
    nx.write_adjlist(G, data_cache_prefix+ filename+graph_suffix)

def open_or_compute_graph(filename, func):
    graph = open_graph(filename)
    if graph is None :
        print('computing '+filename)
        graph = func()
        save_graph(graph, filename)
        return graph
    else:
        print('loaded cached '+filename)
        return graph

def open_json(filename):
    file_exists = os.path.isfile(data_cache_prefix+filename+json_suffix)

    if file_exists:
        with gzip.open(data_cache_prefix+filename+json_suffix, 'r') as f:
            return json.loads(f.read().decode('utf-8'))
    else:
        return None

def save_json(data, filename):
    with gzip.open(data_cache_prefix+filename+json_suffix, 'w') as f:
        f.write(json.dumps(data).encode('utf-8'))

def open_or_compute_json(filename, func):
    data = open_json(filename)
    if data is None :
        print('computing '+filename)
        data = func()
        save_json(data, filename)
        return data
    else:
        print('loaded cached  '+filename)
        return data

In [4]:
# database helpers

def create_connection(db_file):
    """ create a database connection to a SQLite database """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print(sqlite3.version)
        return conn
    except Error as e:
        print(e)

def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return:
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except Error as e:
        print(e)

def create_author_table(conn):
    sql_create_authors_table = """
        CREATE TABLE IF NOT EXISTS authors (
            id integer PRIMARY KEY,
            idx integer,
            n text NOT NULL,
            a text NOT NULL,
            pc integer ,
            cn integer,
            hi integer,
            pi REAL,
            upi REAL,
            t text
        );
        """
    create_table(conn, sql_create_authors_table)

def create_author(conn, author):
    sql = """
    INSERT INTO authors (idx,n,a,pc,cn,hi,pi,upi,t) VALUES (?,?,?,?,?,?,?,?,?)
    """
    cur = conn.cursor()
    cur.execute(sql, author)
    return cur.lastrowid

## 1. Load and preprocess the collaboration data.

In [5]:
# Process author data and store to database

'''
#index 1
#n O. Willum
#a Res. Center for Microperipherik, Technische Univ. Berlin, Germany
#pc 1
#cn 0
#hi 0
#pi 0.0000
#upi 0.0000
#t new product;product group;active product;long product lifetime;old product;product generation;new technology;environmental benefit;environmental choice;environmental consequence
'''
def process_author_sqlite():
    with open('data/AMiner-Author.txt', 'r') as file:
        data = file.read().split('\n\n')

    database = data_cache_prefix+"author.db"
    conn = create_connection(database)

    # create tables
    if conn is None:
        # create projects table
        print("Error! cannot create the database connection.")
    else:
        with conn:
            create_author_table(conn)

            detail = ['index', 'n', 'a', 'pc', 'cn', 'hi', 'pi', 'upi', 't']
            total = len(data)
            for n, auth in enumerate(data):
                info = auth.split('\n')
                if info[0] is '':
                    break
                index = info[0].split(' ')[1]
                author = [0,0,0,0,0,0,0,0,0]
                for num,each in enumerate(detail):
                    author[num] = info[num].replace('#'+each+' ', '')
                create_author(conn, author)
                if n % 10000 == 0:
                    print("processed %d/%d"%(n,total))
    conn.close()

if os.path.isfile(database_file):
    print('%s already exist, skipping author processing'%database_file)
else:
    print('processing authors')
    process_author_sqlite()

data_cache/author.db already exist, skipping author processing


In [6]:
def filter(keyword, by_type):
    with create_connection(database_file) as conn:
        rows = conn.cursor().execute(
            f"""
            SELECT idx FROM authors WHERE {by_type} LIKE '%'||?||'%'
            """
        ,[keyword]).fetchall()
        return [row[0] for row in rows]

In [7]:
coauthorfile = open('data/AMiner-Coauthor.txt')
coauthors = []
for line in coauthorfile:
    author1, author2, count = line[1:].split('\t')
    coauthors.append((author1, author2, int(count)))

In [8]:
coauthors[:10]

[('522324', '1034146', 1),
 ('1355779', '1229932', 2),
 ('688814', '947067', 2),
 ('1329221', '1140429', 1),
 ('742331', '314944', 1),
 ('898041', '1061829', 1),
 ('1075448', '1040028', 1),
 ('1218654', '1244844', 2),
 ('117148', '364153', 2),
 ('1335705', '738530', 1)]

In [9]:
sortedcoauthors = sorted(coauthors, key = lambda coauth: -coauth[2])
sortedcoauthors[:10]

[('111806', '977442', 320),
 ('966551', '111806', 320),
 ('966551', '977442', 320),
 ('980079', '68033', 310),
 ('549347', '80953', 306),
 ('324627', '33938', 234),
 ('860814', '1693619', 216),
 ('946534', '1536687', 194),
 ('833156', '815734', 143),
 ('218997', '173556', 132)]

## 2. Visualize the top 10 collaboration pairs

In [10]:
# Credit: https://gist.github.com/quadrismegistus/92a7fba479fc1e7d2661909d19d4ae7e

def visualize(networkx_graph, name):
    pyvis_graph = Network(height=800, width=800, notebook=True)
    for node,node_attrs in networkx_graph.nodes(data=True):
        pyvis_graph.add_node(node,**node_attrs)

    # for each edge and its attributes in the networkx graph
    for source,target,edge_attrs in networkx_graph.edges(data=True):
        # if value/width not specified directly, and weight is specified, set 'value' to 'weight'
        if not 'value' in edge_attrs and not 'width' in edge_attrs and 'weight' in edge_attrs:
            # place at key 'value' the weight of the edge
            edge_attrs['value']=edge_attrs['weight']
        # add the edge
        pyvis_graph.add_edge(source,target,**edge_attrs)

    return pyvis_graph.show('docs/' + name)

In [11]:
def make_graph(coauth_tuples):
    G = nx.Graph()
    for coauthor in coauth_tuples:
        G.add_edge(coauthor[0], coauthor[1], weight=coauthor[2])
    return G

In [12]:
#remove all disconnected nodes to node

import copy
def remove_disconnected(G, node):
    node_list = copy.deepcopy(nx.nodes(G))
    for each in node_list:
        if nx.has_path(G,source=each, target=node) == 0:
            G.remove_node(each)
    return G

In [13]:
# visualize(make_graph(sortedcoauthers[:10]), 'top10.html')

In [14]:
# visualize(make_graph(sortedcoauthers[:1000]), 'top1000.html')

In [15]:
G = open_or_compute_graph('whole_graph',lambda: make_graph(coauthors))
print('whole graph connected: ',nx.is_connected(G))
print('numbers of node in whole graph',G.number_of_nodes())

# finding maximum degree node of whole graph
node_id = []
deg = 0

for node in nx.nodes(G):
    if G.degree[node] > deg:
        node_id = [node]
        deg = G.degree[node]
    elif G.degree[node] == deg:
        node_id.append(node)  
print("nodes with maximum degree", node_id)
print('degree' ,deg)

loaded cached whole_graph
whole graph connected:  False
numbers of node in whole graph 1560640
nodes with maximum degree ['1642231']
degree 551


In [16]:
# connected graph for node '1642231'

G = open_or_compute_graph('removed_disconnected', lambda: remove_disconnected(G, '1642231'))

print('number of node after removed: ',G.number_of_nodes())
print('all connected: ',nx.is_connected(G))

loaded cached removed_disconnected
number of node after removed:  1057194
all connected:  True


## 3. Filter data

In [17]:
mathematics = filter('mathematics', 't')

2.6.0


In [18]:
print(mathematics)

[200, 665, 688, 2260, 2702, 3663, 5367, 5489, 6086, 6420, 6710, 6787, 7069, 7286, 8562, 9925, 10426, 10455, 10464, 10788, 11413, 12030, 14052, 14746, 15423, 16086, 16092, 16814, 17163, 17368, 17663, 17844, 17852, 18230, 18933, 19319, 19772, 19919, 22878, 23675, 23915, 24446, 25006, 25241, 25847, 26106, 26343, 27532, 30629, 30896, 32172, 32443, 33269, 33726, 34600, 34796, 34804, 34973, 35592, 35644, 36050, 36904, 37053, 37246, 37328, 37361, 37776, 40502, 40790, 41411, 42006, 42716, 43256, 44084, 44764, 45341, 45569, 45707, 46864, 47186, 47347, 49387, 49729, 50466, 50753, 50760, 50772, 52229, 52779, 54151, 54947, 55061, 55208, 55282, 55462, 55579, 55627, 56589, 57474, 57727, 58036, 58427, 58488, 58638, 58846, 59225, 59633, 59849, 60207, 60781, 61116, 61670, 63501, 63580, 63770, 64039, 64771, 64836, 65182, 65343, 65494, 65621, 66477, 66869, 67832, 67939, 67982, 68010, 68600, 68643, 68751, 68878, 69197, 69895, 70160, 70370, 71243, 72261, 72546, 73325, 74013, 74226, 74329, 74360, 74760, 770

In [19]:
G = open_or_compute_graph('math', lambda: G.subgraph(mathematics))

loaded cached math


In [20]:
for node in nx.nodes(G):
    if G.degree[node] > deg:
        node_id = [node]
        deg = G.degree[node]
    elif G.degree[node] == deg:
        node_id.append(node)
print("nodes with maximum degree", node_id)
print('degree' ,deg)


nodes with maximum degree ['1642231']
degree 551


In [21]:
# del author
# del coauthorfile
# del coauthors
# del mathematics
# del sortedcoauthors

In [22]:
visualize(G,'math.html')

In [23]:
# maxG = nx.make_max_clique_graph(G)
maxG = sorted(list(nx.algorithms.clique.find_cliques(G)), key=len, reverse=True)[0]


visualize(G.subgraph(maxG), 'max_clique_math.html')

In [24]:
from IPython import display
from ipywidgets import widgets
colorpicker = widgets.ColorPicker(
    concise=False,
    description='Pick a color',
    value='black',
    disabled=False
)
display.display(colorpicker)


ColorPicker(value='black', description='Pick a color')