In [1]:
import collections
import math
import time
import datetime
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns

import networkx as nx
from networkx.algorithms import community
import graph_tool.all as gt

import cc_graph_ops

## Running Girvan-Newman

[Girvan-Newman](https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm) is a standard algorithm for community detection in graphs. We use the built-in `networkx` implementation for convenience; other community-detection algorithms can be studied later on, but for now we are still exploring the structure of the network.

In [2]:
# Path of the input file the graph is built from; it is handed to
# cc_graph_ops.create_graph_from_file in the next cell.
INPUT_FILE = 'fdg_input_file.json'

In [3]:
# Build the graph from INPUT_FILE with the project helper. `g` is used below to
# collect the licenses and to build the per-license subgraphs.
# NOTE(review): presumably a networkx graph, since its subgraphs are later fed
# to networkx's girvan_newman — confirm against cc_graph_ops.
g = cc_graph_ops.create_graph_from_file(INPUT_FILE)

In [4]:
def most_central_edge(g):
    """Return the edge of ``g`` with the highest edge betweenness centrality.

    Betweenness is computed by networkx with the ``'weight'`` edge attribute
    passed as the weight. When several edges share the top score, the first
    one in the iteration order of the centrality mapping is returned.

    Serves as the ``most_valuable_edge`` callback for Girvan-Newman.
    """
    scores = nx.edge_betweenness_centrality(g, weight='weight')
    best_edge, _best_score = max(scores.items(), key=lambda item: item[1])
    return best_edge

In [5]:
# Collect the licenses found in `g` via the project helper. The result is
# iterated below, and each element is used as a key into `subgraphs`.
# NOTE(review): elements appear to be tuples such as ('gpl', '2.0') — confirm
# against cc_graph_ops.get_licenses.
licenses = cc_graph_ops.get_licenses(g)

In [6]:
# Tunable parameters for the analysis.
# `quota` and `proportion` are forwarded unchanged to
# cc_graph_ops.all_license_subgraphs.
# NOTE(review): their exact meaning is defined in cc_graph_ops — presumably
# thresholds for deciding which nodes belong to a license's subgraph; confirm.
quota = 10
proportion = 0.3
# License subgraphs with fewer than this many elements (len(subgraph)) are
# skipped by the community-detection loop.
min_subgraph_size = 100

In [8]:
# Build one subgraph per license with the project helper. The result is indexed
# by license (subgraphs[license]) in the community-detection loop below.
# NOTE(review): the selection rule driven by `quota` / `proportion` lives in
# cc_graph_ops.all_license_subgraphs — confirm there.
subgraphs = cc_graph_ops.all_license_subgraphs(g, licenses, quota=quota, proportion=proportion)

In [9]:
# Destination of the pickled `communities` dict (opened in binary write mode,
# so any existing file at this path is overwritten).
OUTPUT_FILE = 'communities.pkl'

In [10]:
# Run the first Girvan-Newman split on every sufficiently large license
# subgraph. `communities` maps license -> tuple of sorted node lists, one list
# per community produced by that first split.
#
# This cell can run for a long time, so whatever has been computed is always
# written to OUTPUT_FILE in the `finally` clause: on normal completion, on a
# manual interrupt, and on any unexpected error (which is re-raised after the
# save). A KeyboardInterrupt is swallowed on purpose so the notebook can carry
# on with the partial results.
communities = {}
try:
    for license in licenses:
        subgraph = subgraphs[license]
        if len(subgraph) < min_subgraph_size:
            # Too small to be worth splitting.
            continue
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments during a long run.
        tic = time.perf_counter()
        print(license)
        print('domains:', len(subgraph))
        comp = community.centrality.girvan_newman(subgraph, most_valuable_edge=most_central_edge)
        # girvan_newman is a lazy generator; next() yields only the first
        # level of the hierarchy, i.e. the first split into communities.
        communities[license] = tuple(sorted(c) for c in next(comp))
        toc = time.perf_counter()
        print('elapsed:', toc - tic)
except KeyboardInterrupt:
    print('interrupted; saving partial results')
finally:
    with open(OUTPUT_FILE, 'wb') as output:
        pickle.dump(communities, output)

('gpl', '2.0')
409


In [11]:
# Persist the detected communities to OUTPUT_FILE as a pickle, overwriting any
# earlier (possibly partial) dump.
with open(OUTPUT_FILE, mode='wb') as pickle_file:
    pickle.dump(communities, pickle_file)