In [None]:
from pygraph.classes.graph import graph
from pygraph.readwrite import dot
from pygraph.algorithms.minmax import shortest_path
from pygraph.algorithms.accessibility import connected_components
from pygraph.algorithms.pagerank import pagerank

In [None]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import re
%matplotlib inline

In [None]:
import itertools
import random
import statistics

In [1]:
from network import *

In [None]:
from IPython.core.display import Image, display

In [None]:
def construct_graph_filtered(old_graph, node_predicate, egde_predicate):
    """Build a new graph holding only the accepted nodes and edges of `old_graph`.

    Parameters:
        old_graph: source graph; only its `nodes()` and `edges()` are read.
        node_predicate: callable(node) -> bool; a node is copied when it
            returns a true value.
        egde_predicate: callable(edge) -> bool; an edge is copied when both
            of its endpoints pass `node_predicate` and this returns true.

    Returns:
        A freshly created `graph()` with the selected nodes and edges.
        Only the nodes and edge tuples themselves are passed on to the
        new graph.
    """
    filtered = graph()

    for candidate in old_graph.nodes():
        if not node_predicate(candidate):
            continue
        filtered.add_node(candidate)

    for link in old_graph.edges():
        # Skip links the new graph already reports as present (presumably
        # because edges() lists both orientations of an undirected edge --
        # confirm against the graph library).
        if filtered.has_edge(link):
            continue
        endpoints_kept = node_predicate(link[0]) and node_predicate(link[1])
        if endpoints_kept and egde_predicate(link):
            filtered.add_edge(link)

    return filtered

In [2]:
def load_authors_dataset(filename):
    """Yield one ``Article`` per line of a tab-separated dataset file.

    Every line must contain exactly five tab-separated fields, in this
    order: journal, title, authors (comma-separated), year, abstract.

    Parameters:
        filename: path of the text file to read.

    Yields:
        ``Article(title, author_list, year, journal, abstract, None, [])``
        where ``author_list`` is ``[]`` when the authors field is empty and
        ``year`` is passed through as the string read from the file.

    Raises:
        ValueError: if a line does not split into exactly five fields.
    """
    # The context manager closes the file when the generator is exhausted
    # or discarded; the bare open() used before leaked the handle.
    with open(filename, 'r') as f:
        for line in f:
            # Drop the line terminator so it does not end up inside the
            # last field (the abstract).
            journal, title, authors, year, abstract = line.rstrip("\n").split("\t")
            author_list = authors.split(",") if authors != "" else []
            yield Article(title, author_list, year, journal, abstract, None, [])

In [3]:
def parse_dataset_file(filename):
    """Parse a line-tagged paper dump and yield one ``Article`` per record.

    Each line is classified by the first of these patterns found in it
    (checked in this order):

        ``#*``      paper title
        ``#@``      comma-separated authors
        ``#t``      year
        ``#c``      publication venue
        ``#index``  paper index
        ``#%``      id of a referenced paper (may occur several times)

    A blank line ends the current record. Lines matching none of the
    patterns are ignored.

    Parameters:
        filename: path of the text file to read.

    Yields:
        ``Article(title, author_list, year, venue, paper_index, None,
        reference_ids)``. Missing fields default to ``''`` (title, venue,
        index), ``[]`` (authors, references) and ``-1`` (year); a year that
        is present is passed through as the string read from the file.
    """
    # Raw strings: '\*' is not a valid escape in a normal string literal.
    # The first matching pattern wins, so the order below is significant.
    field_patterns = (
        ('title', re.compile(r'#\*(.*)\n')),
        ('authors', re.compile(r'#@(.*)\n')),
        ('year', re.compile(r'#t(.*)\n')),
        ('venue', re.compile(r'#c(.*)\n')),
        ('index', re.compile(r'#index(.*)\n')),
    )
    references_ids_regexp = re.compile(r'#%(.*)\n')

    def empty_record():
        # A fresh dict (and a fresh references list) for every record, so
        # nothing leaks from one paper into the next.
        return {'title': '', 'authors': '', 'year': -1,
                'venue': '', 'index': '', 'references': []}

    def to_article(record):
        authors = record['authors']
        return Article(record['title'],
                       authors.split(",") if authors != '' else [],
                       record['year'], record['venue'], record['index'],
                       None, record['references'])

    record = empty_record()
    pending = False  # True once the current record has received any field

    # The context manager closes the file when the generator finishes or
    # is discarded; the bare open() used before leaked the handle.
    with open(filename, 'r') as f:
        for line in f:
            if line == "\n":
                # Record separator: emit what was collected and start over.
                yield to_article(record)
                record = empty_record()
                pending = False
                continue

            # The patterns require a trailing newline, which the very last
            # line of a file may lack.
            if not line.endswith("\n"):
                line += "\n"

            for field, pattern in field_patterns:
                found = pattern.search(line)
                if found is not None:
                    record[field] = found.group(1)
                    pending = True
                    break
            else:
                found = references_ids_regexp.search(line)
                if found is not None:
                    record['references'].append(found.group(1))
                    pending = True

    # A file that does not end with a blank line still holds a last record.
    if pending:
        yield to_article(record)

In [4]:
def analyse(file, loader):
    """Load every article from `file` into a new co-author network.

    Parameters:
        file: value handed unchanged to `loader` (the notebook passes a
            dataset path).
        loader: callable(file) returning an iterable of articles.

    Returns:
        The populated ``CoathorNetwork``. The number of keys in its
        ``author_to_article`` mapping is printed as a side effect.
    """
    network = CoathorNetwork()
    for paper in loader(file):
        network.add_article(paper)

    unique_authors = list(network.author_to_article.keys())
    print("Uniq authors:", len(unique_authors))
    return network
    
# Alternative input, currently disabled:
#file = "./data/authors/Medical Informatics.txt"
# Dataset actually analysed by this notebook run.
file = "./data/out_test.txt"
# Build the co-author network from the tagged dataset format read by
# parse_dataset_file.
coauthorNetwork = analyse(file, parse_dataset_file)
# Connected components of the network graph; later cells use this as a
# mapping of node -> component id (see analize_components).
components = connected_components(coauthorNetwork.gr)


#!dot 'graph.dot' -Tpng -o "graph.png"
#display(Image('graph.png' ))

Uniq authors: 2258


In [None]:
# Calculate the distribution of shortest-path distances,
# used to detect the small-world phenomenon.
def get_distance_stat(graph, n):
    """Estimate the distribution of shortest-path distances in `graph`.

    `n` source nodes are drawn at random (with replacement) and, for each,
    the distances reported by `shortest_path` are tallied.

    Parameters:
        graph: graph object; `graph.nodes()` is sampled with `random.choice`.
        n: number of random source nodes to sample.

    Returns:
        dict mapping distance -> count / (len(graph.nodes()) * n).
    """
    denominator = len(graph.nodes()) * n
    tally = {}

    for _ in range(n):
        source = random.choice(graph.nodes())
        # Element 1 of the shortest_path result is used as a mapping of
        # node -> distance from `source`.
        reached = shortest_path(graph, source)[1]
        for distance in reached.values():
            if distance in tally:
                tally[distance] += 1
            else:
                tally[distance] = 1

    return {distance: count / denominator for distance, count in tally.items()}
        

In [None]:
def calc_pagerank(graph):
    """Compute PageRank for `graph` and return the result.

    Parameters:
        graph: graph object accepted by the imported `pagerank` function.

    Returns:
        Whatever `pagerank(graph)` returns (presumably a mapping of
        node -> rank; confirm against the graph library). The result used
        to be assigned to an unused local and dropped, so the function
        always returned None.
    """
    return pagerank(graph)

In [None]:
def analize_components(coauthorNetwork, components, component_num, precision=20):
    """Print and plot the distance distribution of one large component.

    Components are ranked by size (largest first) and only the ten largest
    are considered; `component_num` selects one of them by rank.

    Parameters:
        coauthorNetwork: object whose `gr` attribute is the full graph.
        components: mapping of node -> component id.
        component_num: rank of the component to analyse (0 = largest).
        precision: number of random source nodes sampled by
            `get_distance_stat` (default 20).

    Side effects:
        Prints the number of nodes in the selected component and the mean
        sampled distance, then shows a plot of the distance distribution.

    Raises:
        IndexError: if `component_num` is outside the ranked list.
    """
    # Component id -> number of nodes assigned to it.
    component_sizes = {}
    for component in components.values():
        component_sizes[component] = component_sizes.get(component, 0) + 1

    top10_components = sorted(component_sizes.items(), key=lambda pair: -pair[1])[0:10]
    selected_component = top10_components[component_num][0]

    # Keep every edge whose two endpoints both lie in the selected component.
    subgraph = construct_graph_filtered(coauthorNetwork.gr,
                                        lambda node: components[node] == selected_component,
                                        lambda egde: True)

    print(len(subgraph.nodes()))
    stat = get_distance_stat(subgraph, precision)
    print("mean", sum([value * key for key, value in stat.items()]))

    # Plot in increasing distance order; dict order is "first seen", which
    # would let the line jump back and forth along the x axis.
    distances = sorted(stat)
    plt.plot(distances, [stat[distance] for distance in distances])
    plt.xlabel("distance")
    plt.ylabel("fraction of sampled node pairs")
    plt.show()

# Analyse the largest (rank 0) and the second-largest (rank 1) connected
# components of the co-author network.
analize_components(coauthorNetwork, components, 0)
analize_components(coauthorNetwork, components, 1)


In [None]:
# Debug scaffold: walks every author_to_article entry and singles out the
# ones whose value has more than one element. With the print commented
# out, the loop currently has no effect.
for key, value in coauthorNetwork.author_to_article.items():
    if len(value) > 1:
        pass
        #print(key)

In [None]:
# Scratch check of node attributes: build a two-node graph, attach the
# ("weight", 7) attribute to node 'a' and serialise the graph.
g = graph()
g.add_nodes(["a", "b"])
g.add_node_attribute('a',("weight",7))
# Last expression of the cell, so its return value is what the notebook
# displays (presumably the DOT text of the graph -- confirm).
dot.write(g)

In [None]:
# Inspect the first ten loaded articles by dumping their attribute dicts.
for x in coauthorNetwork.articles[0:10]:
    print(x.__dict__)