In [None]:
%pylab inline
import json
import os
from matplotlib import pyplot as plt
import numpy as np
import logbook
import msgpack

In [None]:
# Experiment configuration: every input/output location used by the cells below.
exp_name = "23_plot_path"
# NOTE(review): `logbook.Experiment` looks like a project-local helper (it supplies
# get_logger() and get_filename() below) — confirm which `logbook` module this is.
exp = logbook.Experiment(exp_name)
log = exp.get_logger()
# Read line by line as JSON records carrying "project_id" and "project_name".
project_file = 'data/projects-2016-10-14-dedup.json'
# Read as tab-separated rows of project_id, node_count, edge_count.
coeditor_stats = "archive/17.1_coeditor_stats/2017-05-03 09:30:32 aa49f7f"
# Directories scanned for '*path_length.csv' and '*path_length-sampled.csv' files.
path_dirs = [
    "archive/20_find_path_length/2017-05-05 13:28:24 dbe068d",
    "archive/20_find_path_length/2017-05-12 16:42:41 3cbb13d",
    "archive/20_find_path_length/2017-05-23 11:24:22 17de948"
]
# Name of the CSV summary written through exp.get_filename().
stats_file = "path_stats.csv"

In [None]:
# Build the project_id -> project_name lookup from the line-delimited JSON file.
with open(project_file, "rb") as project_lines:
    records = (json.loads(line) for line in project_lines)
    project_names = {
        record["project_id"]: record["project_name"] for record in records
    }

In [None]:
# Map project id -> number of nodes, read from the tab-separated stats file
# (columns: project_id, node_count, edge_count).
#
# Keys and values are converted to int: the sampled path statistics look the
# count up with an integer project id and do arithmetic on it
# (`node_count - 1`), which fails if the raw strings are stored.
project_node_count = {}
# Text mode so rows are str (and can be split on "\t") on Python 3 as well.
with open(coeditor_stats, "r") as f:
    for row in f:
        row = row.strip()
        if not row:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            continue
        project_id, node_count, _edge_count = row.split("\t")
        project_node_count[int(project_id)] = int(node_count)

In [None]:
def get_mean_path_length(filename):
    """Return the arithmetic mean path length stored in a path-length CSV.

    Each row must have the form ``project_id,source,target,length``. Rows
    whose source equals its target are ignored.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The mean of ``length`` over all rows with ``source != target``,
        as a float.

    Raises:
        ZeroDivisionError: if the file has no row with ``source != target``.
        ValueError: if a row does not have four comma-separated fields or
            its length is not an integer.
    """
    total = 0
    count = 0
    # Text mode: rows must be str to be split on "," under Python 3.
    with open(filename, "r") as f:
        for row in f:
            _project_id, source, target, length = row.rstrip().split(",")
            if source == target:
                continue
            total += int(length)
            count += 1
    return float(total) / float(count)

In [None]:
def get_harmonic_path_length(filename):
    """Return the harmonic mean path length stored in a path-length CSV.

    Each row must have the form ``project_id,source,target,length``. With
    ``n`` the number of distinct node ids seen in rows where
    ``source != target``, the result is ``n * (n - 1) / sum(1 / length)``.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The harmonic mean path length as a float.

    Raises:
        ZeroDivisionError: if the file has no row with ``source != target``,
            or such a row has length 0.
        ValueError: if a row is malformed or a node id is not an integer.
    """
    all_nodes = set()
    total = 0.0
    # Text mode: rows must be str to be split on "," under Python 3.
    with open(filename, "r") as f:
        for row in f:
            _project_id, source, target, length = row.rstrip().split(",")
            if source == target:
                continue
            total += 1.0 / float(length)
            # NOTE(review): nodes that appear only in source == target rows are
            # not counted here, whereas get_connectivity counts them — confirm
            # which definition of the node set is intended.
            all_nodes.add(int(target))
            all_nodes.add(int(source))
    node_count = len(all_nodes)
    # Number of ordered (source, target) pairs with source != target.
    pair_count = node_count * (node_count - 1)
    return float(pair_count) / total

def get_harmonic_path_length_sampled(filename, project_id, strata, per_stratum,
                                     node_counts=None):
    """Return the harmonic mean path length for a sampled path-length CSV.

    Each row must have the form ``project_id,source,target,length``. The
    number of pairs is taken as ``strata * per_stratum * (node_count - 1)``,
    where ``node_count`` is looked up by ``project_id``, and the result is
    that pair count divided by ``sum(1 / length)`` over rows with
    ``source != target``.

    Args:
        filename: path of the CSV file to read.
        project_id: key used to look up the project's node count.
        strata: number of strata used when sampling.
        per_stratum: number of samples per stratum.
        node_counts: optional mapping of project id to node count; defaults
            to the module-level ``project_node_count``.

    Returns:
        The harmonic mean path length as a float.

    Raises:
        KeyError: if ``project_id`` is not in the node-count mapping.
        ZeroDivisionError: if the file has no row with ``source != target``,
            or such a row has length 0.
    """
    if node_counts is None:
        node_counts = project_node_count
    total = 0.0
    # Text mode: rows must be str to be split on "," under Python 3.
    with open(filename, "r") as f:
        for row in f:
            _pid, source, target, length = row.rstrip().split(",")
            if source == target:
                continue
            total += 1.0 / float(length)
    # int() tolerates counts stored as strings (e.g. read straight from a TSV).
    node_count = int(node_counts[project_id])
    sample_count = strata * per_stratum
    pair_count = sample_count * (node_count - 1)
    return float(pair_count) / total

In [None]:
def get_connectivity(filename):
    """Return the fraction of ordered node pairs that have a path in the CSV.

    Each row must have the form ``project_id,source,target,length``. Every
    source and target (including those in ``source == target`` rows) counts
    as a node; every row with ``source != target`` counts as one path. The
    result is ``paths / (n * (n - 1))`` for ``n`` distinct nodes.

    Args:
        filename: path of the CSV file to read.

    Returns:
        The connectivity as a float.

    Raises:
        ZeroDivisionError: if the file mentions fewer than two distinct nodes.
        ValueError: if a row does not have four comma-separated fields.
    """
    all_nodes = set()
    paths = 0
    # Text mode: rows must be str to be split on "," under Python 3.
    with open(filename, "r") as f:
        for row in f:
            _pid, source, target, _length = row.rstrip().split(",")
            all_nodes.add(source)
            all_nodes.add(target)
            if source == target:
                continue
            paths += 1
    node_count = len(all_nodes)
    return float(paths) / (node_count * (node_count - 1))

def get_connectivity_sampled(filename, project_id, strata, per_stratum,
                             node_counts=None):
    """Return the connectivity estimate for a sampled path-length CSV.

    Each row must have the form ``project_id,source,target,length``. Every
    row with ``source != target`` counts as one path; the result is that
    count divided by ``strata * per_stratum * (node_count - 1)``, where
    ``node_count`` is looked up by ``project_id``.

    Args:
        filename: path of the CSV file to read.
        project_id: key used to look up the project's node count.
        strata: number of strata used when sampling.
        per_stratum: number of samples per stratum.
        node_counts: optional mapping of project id to node count; defaults
            to the module-level ``project_node_count``.

    Returns:
        The connectivity as a float.

    Raises:
        KeyError: if ``project_id`` is not in the node-count mapping.
        ZeroDivisionError: if the computed pair count is 0.
    """
    if node_counts is None:
        node_counts = project_node_count
    paths = 0
    # Text mode: rows must be str to be split on "," under Python 3.
    with open(filename, "r") as f:
        for row in f:
            _pid, source, target, _length = row.rstrip().split(",")
            if source == target:
                continue
            paths += 1
    # int() tolerates counts stored as strings (e.g. read straight from a TSV).
    node_count = int(node_counts[project_id])
    sample_count = strata * per_stratum
    pair_count = sample_count * (node_count - 1)
    return float(paths) / float(pair_count)

In [None]:
# Compute per-project path statistics for every path-length file found in
# `path_dirs`, collect them in three parallel lists and write one CSV row per
# project to the experiment's stats file.
all_means = []
all_harmonic = []
all_connectivity = []
# Text mode: the rows written below are str.
with open(exp.get_filename(stats_file), "w") as out:
    out.write("project_id,path_harmonic,path_mean,connectivity\n")
    for path_dir in path_dirs:
        # Sorted so that the row order of the output is reproducible;
        # os.listdir returns entries in arbitrary order.
        for path_file in sorted(os.listdir(path_dir)):
            full_path = os.path.join(path_dir, path_file)
            if path_file.endswith('path_length.csv'):
                # Exhaustive path file: "<project_id>-...path_length.csv".
                project_id = path_file.split("-")[0]
                log.info("Loading path file: %s" % path_file)
                path_mean = get_mean_path_length(full_path)
                path_harmonic = get_harmonic_path_length(full_path)
                connectivity = get_connectivity(full_path)
            elif path_file.endswith('path_length-sampled.csv'):
                # Sampled path file:
                # "<project_id>-<strata>-<per_stratum>-...path_length-sampled.csv".
                parts = path_file.split("-")
                project_id = int(parts[0])
                strata = int(parts[1])
                per_stratum = int(parts[2])
                log.info("Loading path file: %s" % path_file)
                path_mean = get_mean_path_length(full_path)
                path_harmonic = get_harmonic_path_length_sampled(
                    full_path, project_id, strata, per_stratum)
                connectivity = get_connectivity_sampled(
                    full_path, project_id, strata, per_stratum)
            else:
                continue
            # Append only after all three statistics were computed so the
            # lists stay aligned even if one computation raises.
            all_means.append(path_mean)
            all_harmonic.append(path_harmonic)
            all_connectivity.append(connectivity)
            out.write("%s,%s,%s,%s\n" % (
                project_id, repr(path_harmonic), repr(path_mean), repr(connectivity)))
            out.flush()

In [None]:
# Histogram (60 bins) of the harmonic mean path lengths gathered in all_harmonic.
fig, ax = plt.subplots(figsize=(8, 8))
ax.hist(all_harmonic, bins=60)
ax.set_xlabel("Harmonic mean path length")
ax.set_ylabel("Frequency")
ax.set_title("editor_count < 5551")
fig.tight_layout()

In [None]:
# Leftover inspection cell: displays the last file name visited by the stats loop.
path_file