In [1]:
import igraph as ig
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import powerlaw

from tqdm import trange, tqdm

from collections import defaultdict
from itertools import combinations

In [29]:
def measure_params(g, coms, drop_outliers=False, min_degree=5, min_com_size=10):
    """
    Measure parameters of a graph with known communities.

    input:
    - g: the graph (igraph). Each vertex must have it's a set of it's communities stored in a 'comms' attribute.
    - coms: a list of lists containing the vertex id's in each community
    - drop_outliers  False: drop the outliers before measuring any params (if true then n_out will be 0).
    - min_degree  5: override the minimum degree parameter if the actual min degree is too small.
        This value will also be used for computeing the degree powerlaw exponent.
    - min_com_size  10: override the minimum community size parameter if the actual min com size is too small.
        This value will also be used for computeing the community size powerlaw exponent.

    returns:
    - params: a dictionary with the measured parameters
    """
    params = dict()

    n_coms = np.array([len(comms) for comms in g.vs["comms"]])
    is_outlier = n_coms == 0
    if drop_outliers:
        g = g.subgraph(np.argwhere(~is_outlier).reshape(-1))
        params["n"] = int(g.vcount())
        params["n_out"] = 0
        n_coms = n_coms[~is_outlier]
    else:
        params["n"] = int(g.vcount())
        params["n_out"] = int(np.sum(is_outlier))

    params["eta"] = float(np.mean(n_coms))

    degrees = np.array(g.degree())
    d_min = np.maximum(min_degree, np.min(degrees))
    params["d_min"] = d_min
    params["d_max"] = int(np.max(degrees))
    params["t1"] = powerlaw.Fit(degrees, discrete=True, verbose=False, xmin=d_min).power_law.alpha

    com_sizes = np.array([len(com) for com in coms])
    c_min = np.maximum(min_com_size, np.min(com_sizes))
    params["c_min"] = c_min
    params["c_max"] = int(np.max(com_sizes))
    params["t2"] = powerlaw.Fit(com_sizes, discrete=True, verbose=False, xmin=c_min).power_law.alpha

    xi = sum([len(g.vs[e.source]["comms"].intersection(g.vs[e.target]["comms"]))==0 for e in g.es]) / g.ecount()
    params["xi"] = xi

    rho = np.corrcoef(degrees, [len(c) for c in g.vs["comms"]])[0, 1]
    params["rho"] = rho

    return params

In [26]:
# # Read Graph
graph_path = "../../SNAP/com-dblp.ungraph.txt"
com_path = "../../SNAP/com-dblp.all.cmty.txt"

graph_path = "../../SNAP/com-amazon.ungraph.txt"
com_path = "../../SNAP/com-amazon.all.dedup.cmty.txt"

graph_path = "../../SNAP/com-youtube.ungraph.txt"
com_path = "../../SNAP/com-youtube.all.cmty.txt"


edges = pd.read_csv(graph_path, sep='\t', comment="#")
coms = []
with open(com_path, "r") as infile:
    for line in infile:
        x = line[:-1]  # drop trailing newline
        x = x.split('\t')
        coms.append(frozenset([int(y) for y in x])) ## map to 0-based

g = ig.Graph.DataFrame(edges, directed=False)

c = [set() for _ in range(g.vcount())]
for i, com in enumerate(coms):
    for v in com:
        c[v].add(i)
        #print(f"Vertex {v} in community {i}")
        #print(f"Vetrex {v} is now in {len(c[v])} coms")
c = [frozenset(i) for i in c] 
g.vs["comms"] = c

g = g.subgraph(np.arange(g.vcount())[np.array(g.degree()) > 0])  # Vertex list is not continugous, degree 0 vertices get added


print(f"{g.vcount()} vertices")
print(f"{g.ecount()} edges")
print(f"{len(coms)} communities")

1134890 vertices
2987623 edges
16386 communities


In [30]:
measure_params(g, coms, drop_outliers=True)

{'n': 52675,
 'n_out': 0,
 'eta': 2.4528144280968203,
 'd_min': 5,
 'd_max': 1928,
 't1': 1.8702187087097446,
 'c_min': 10,
 'c_max': 3001,
 't2': 2.130965769664415,
 'xi': 0.5928066048845747,
 'rho': 0.3746343169285614}