In [1]:
import collections
import math
import os
import glob
import time
import datetime
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns

import networkx as nx
from networkx.algorithms import community
import graph_tool.all as gt

import cc_graph_ops

## Running the Stochastic Block Model

We use the weighted-edge schochastic block model to generate communities within our graph. Our code is taking from the [cookbook](https://graph-tool.skewed.de/static/doc/demos/inference/inference.html) on `graph-tool`'s documentation.

In [2]:
INPUT_FILE = 'fdg_input_file.json'

In [3]:
g = cc_graph_ops.create_graph_from_file(INPUT_FILE, format='gt')

In [4]:
quota = 10
proportion = 0.3
min_subgraph_size = 100

In [5]:
licenses = list(cc_graph_ops.get_licenses(g))

In [6]:
subgraphs = cc_graph_ops.all_license_subgraphs(g, licenses, quota=quota, proportion=proportion)

In [7]:
OUTPUT_DIR = 'block_models/'

In [8]:
class BlockModelData:
    def __init__(self, elapsed, license, model):
        self.elapsed = elapsed
        self.license = license
        self.model = model

In [9]:
def write_communities(subgraphs, output_dir, min_subgraph_size):
    os.mkdir(output_dir)
    for license, subg in subgraphs.items():
        if len(list(subg.vertices())) < min_subgraph_size:
            continue
        print(license)
        tic = time.time()
        model = gt.minimize_nested_blockmodel_dl(subg, state_args=dict(recs=[subg.ep.weight],
                                                rec_types=['discrete-geometric']))
        model = model.copy(bs=model.get_bs() + [np.zeros(1)] * 4,
                            sampling = True)
        for i in range(100):
            ret = model.multiflip_mcmc_sweep(niter=10, beta=np.inf)
        toc = time.time()
        print(toc - tic)
        with open(os.path.join(output_dir, "".join(e for e in license if e.isalnum()) + '.pkl'), 'wb') as f:
            pickle.dump(BlockModelData(toc - tic, license, model), f)

In [10]:
## We only need to run this once
# write_communities(subgraphs, OUTPUT_DIR, min_subgraph_size)

In [24]:
def read_communities(output_dir, verbose=False, domains=False):
    license_communities = dict()
    for filepath in glob.glob(output_dir + "*.pkl"):
        if verbose:
            print(filepath)
        with open(filepath, 'rb') as file:
            model = pickle.load(file)
            levels = model.model.get_levels()
            communities = collections.defaultdict(list)
            for v in subgraphs[model.license].vertices():
                v_id = subgraphs[model.license].vp['id'][v]
                if domains:
                    v_id = (v_id, subgraphs[model.license].vp['provider_domain'][v])
                com_id = levels[0].get_blocks()[v]
                communities[com_id].append(v_id)
            license_communities[model.license] = communities
    return license_communities

In [25]:
license_communities = read_communities(OUTPUT_DIR, domains=True)

In [26]:
license_communities.keys()

dict_keys(["('by-nc', '2.0')", "('by', '4.0')", "('cc0', '1.0')", "('gpl', '2.0')", "('by-nc-sa', '3.0')", "('by-nc', '3.0')", "('by-nc-sa', '4.0')", "('pdm', '1.0')", "('by-nc-nd', '4.0')", "('by-nc-sa', '2.5')", "('by-nc', '4.0')", "('by', '3.0')", "('by-sa', '4.0')", "('by-nd', '4.0')", "('by-sa', '3.0')", "('by', '2.5')", "('by-nd', '3.0')", "('by-sa', '2.0')", "('by-nc-nd', '2.0')", "('by-nc-nd', '3.0')", "('by-sa', '2.5')", "('by-nc', '2.5')", "('by', '2.0')", "('by-nd', '2.0')", "('by-nc-nd', '2.5')", "('by-nc-sa', '2.0')"])

In [27]:
for license, communities in license_communities.items():
    print(license)
    print()
    for num, community in communities.items():
        # Very large communities are likely spurious
        if len(community) > 0.3 * len(list(subgraphs[license].vertices())) or len(community) > 100:
            continue
        print(len(community))
        print(community)
        print()
    print()
    print('-'*70)
    print()

('by-nc', '2.0')

15
[('schoolplaten', 'www.schoolplaten.com'), ('a-worldofwords', 'www.a-worldofwords.com'), ('timbres-bordeaux', 'www.timbres-bordeaux.fr'), ('schulbilder', 'www.schulbilder.org'), ('france-timbres', 'marcophilie.france-timbres.net'), ('educol', 'www.educol.net'), ('album-timbres', 'album-timbres.fr'), ('planchage-timbres', 'www.planchage-timbres.fr'), ('sclera', 'sclera.be'), ('infobilder', 'www.infobilder.com'), ('educima', 'www.educima.com'), ('e-timbres', 'www.marcophilie.e-timbres.net'), ('eg0', 'eg0.me'), ('mouton-rebelle', 'ego.mouton-rebelle.com'), ('onceuponabookcase', 'www.onceuponabookcase.co.uk')]

26
[('ordinary-times', 'ordinary-times.com'), ('opb', 'www.opb.org'), ('dchauvin', 'dchauvin.fr'), ('sensorysmithfield', 'sensorysmithfield.com'), ('managementmania', 'managementmania.com'), ('oulip', 'oulip.info'), ('arborupdate', 'arborupdate.com'), ('obercom', 'obs.obercom.pt'), ('bijinkei', 'bijinkei.net'), ('sensorycities', 'www.sensorycities.com'), ('danwe

## Running Girvan-Newman

[Girvan-Newman](https://en.wikipedia.org/wiki/Girvan%E2%80%93Newman_algorithm) is a standard algorithm for community detection for graphs. We choose to use the built-in `networkx` implementation for convenience; we can study other algorithms for community detection later on, but right now we're still exploring the structure of the network.

In [None]:
# INPUT_FILE = 'fdg_input_file.json'

In [None]:
# g = cc_graph_ops.create_graph_from_file(INPUT_FILE)

In [None]:
# def most_central_edge(g):
#     centrality = nx.edge_betweenness_centrality(g, weight='weight')
#     return max(centrality, key=centrality.get)

In [None]:
# licenses = cc_graph_ops.get_licenses(g)

In [None]:
# quota = 10
# proportion = 0.3
# min_subgraph_size = 100

In [None]:
# subgraphs = cc_graph_ops.all_license_subgraphs(g, licenses, quota=quota, proportion=proportion)

In [None]:
# OUTPUT_FILE = 'communities.pkl'

In [None]:
# communities = dict()
# try:
#     for license in licenses:
#         if len(subgraphs[license]) < min_subgraph_size:
#             continue
#         tic = time.time()
#         print(license)
#         print('domains:', len(subgraphs[license]))
#         comp = community.centrality.girvan_newman(subgraphs[license], most_valuable_edge=most_central_edge)
#         communities[license] = tuple(sorted(c) for c in next(comp))
#         toc = time.time()
#         print('elapsed:', toc-tic)
# except KeyboardInterrupt:
#     with open(OUTPUT_FILE, 'wb') as output:
#         pickle.dump(communities, output)

In [None]:
# with open(OUTPUT_FILE, 'wb') as output:
#     pickle.dump(communities, output)