In [None]:
import csv
import json
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as spstats
import logbook

In [None]:
# Name handed to logbook.Experiment when the experiment handle is created.
exp_name = "21_plot_degree"

# --- Inputs ---------------------------------------------------------------
# One JSON document per line; each carries a "project_id" field.
project_file = "data/projects-2016-10-14-dedup.json"
# Per-project degree table; the %d placeholder is filled with a project id.
degree_file = "output/19_find_degree/2016-11-16 14:14:42 ab98fcc/%d-degree.csv"
# Transition counts (to_fa / to_ga columns) used to derive "performance".
performance_file = "output/11_find_performance/2016-11-01 16:40:13 11ba292/fa_ga_transitions.csv"
# Efficiency tables from separate runs; they are stacked into a single frame.
efficiency_files = [
    "output/10_find_efficiency/2016-11-02 22:03:53 f3da2a6/efficiency.csv",
    "output/10_find_efficiency/2016-11-03 18:32:16 36a9e0b/efficiency.csv",
]

# --- Output ---------------------------------------------------------------
# File name (resolved through exp.get_filename) for the per-project stats.
stats_file = "degree_stats.csv"

In [None]:
# Experiment handle built from exp_name. Further down it is used through
# exp.get_filename(...) to obtain the path the degree statistics are written to.
# NOTE(review): what logbook.Experiment does on construction is not visible
# here -- presumably it sets up an output location for this run; confirm.
exp = logbook.Experiment(exp_name)

In [None]:
# project_file holds one JSON document per line; collect the "project_id"
# of every line, in file order.
with open(project_file, "rb") as f:
    project_ids = [json.loads(row)["project_id"] for row in f]

In [None]:
# Performance table, indexed by its first CSV column.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(index_col=0, parse_dates=True) reproduces from_csv's defaults and
# works on both old and new pandas.
df_perf = pd.read_csv(performance_file, index_col=0, parse_dates=True)

In [None]:
# Stack the efficiency tables from every run into a single frame (rows are
# appended in the order the files are listed).
# - read_csv(index_col=0, parse_dates=True) is the documented replacement for
#   DataFrame.from_csv, which was removed in pandas 1.0.
# - A single concat over the list replaces the None-sentinel loop that
#   re-concatenated the accumulated frame on every iteration.
df_eff = pd.concat(
    [pd.read_csv(path, index_col=0, parse_dates=True) for path in efficiency_files]
)

In [None]:
# Column-wise inner join: keep only index labels present in both the
# efficiency and the performance tables.
df = pd.concat([df_eff, df_perf], axis=1, join="inner")

# Every one of these columns must be strictly positive for a row to be kept;
# they are later used as divisors and as arguments to np.log.
positive_columns = [
    "N_a", "N_b", "N_c",
    "Delta_a", "Delta_b", "Delta_c",
    "to_ga", "to_fa",
]
df_nz = df
for column in positive_columns:
    # Filters are applied one after another, in the listed order.
    df_nz = df_nz[df_nz[column] > 0]

In [None]:
# Raw per-row ratios Delta_x / N_x for the three components a, b and c.
a_eff, b_eff, c_eff = (
    df_nz["Delta_a"] / df_nz["N_a"],
    df_nz["Delta_b"] / df_nz["N_b"],
    df_nz["Delta_c"] / df_nz["N_c"],
)

# Mean of each raw ratio, computed as sum over row count.
mean_a, mean_b, mean_c = (
    a_eff.sum() / len(a_eff),
    b_eff.sum() / len(b_eff),
    c_eff.sum() / len(c_eff),
)

# Combined efficiency: product of the three mean-normalised ratios. This has
# to be evaluated while a_eff/b_eff/c_eff still hold the raw ratios, i.e.
# before they are rebound to their normalised versions just below.
eff = (a_eff/mean_a * b_eff/mean_b * c_eff/mean_c).to_frame(name="efficiency")

# From here on a_eff, b_eff and c_eff are the mean-normalised ratios.
a_eff, b_eff, c_eff = a_eff/mean_a, b_eff/mean_b, c_eff/mean_c

# Performance: share of "to_fa" among the to_fa + to_ga total.
perf = (df_nz["to_fa"] / (df_nz["to_ga"] + df_nz["to_fa"])).to_frame(name="performance")

In [None]:
def skew(x):
    """Return the skewness of the values in ``x``.

    The third central moment is averaged over ``n`` observations, while the
    variance in the denominator is averaged over ``n - 1`` before being
    raised to the power 1.5.

    Args:
        x: A container supporting ``len()``, ``.sum()`` and element-wise
            arithmetic (the notebook passes pandas Series).

    Returns:
        The ratio of the third central moment to the 1.5 power of the
        (n - 1)-normalised variance.
    """
    count = float(len(x))
    centered = x - x.sum() / count
    third_moment = (centered * centered * centered).sum() / count
    variance = (centered * centered).sum() / (count - 1.0)
    return third_moment / np.power(variance, 1.5)

In [None]:
# Per-project degree summaries, keyed by project id.
mean = {}
skew_in = {}
skew_out = {}
median_in = {}
median_out = {}
# Projects that contributed no statistics, mapped to the reason. Recording
# them replaces the former silent `except ...: pass`, so missing or
# unreadable degree files are visible after the cell has run.
skipped_projects = {}
for project_id in project_ids:
    try:
        # read_csv(index_col=0, parse_dates=True) is the documented
        # replacement for DataFrame.from_csv (removed in pandas 1.0).
        # A dedicated name is used so the joined frame `df` built earlier in
        # the notebook is not overwritten by a per-project table.
        degrees = pd.read_csv(degree_file % project_id, index_col=0, parse_dates=True)
        if len(degrees) < 1:
            skipped_projects[project_id] = "empty degree table"
            continue
        in_degree = degrees["in_degree"]
        out_degree = degrees["out_degree"]
        # Compute every statistic before storing any of them, so a failure
        # part-way through cannot leave a project in only some of the dicts.
        p_mean = in_degree.sum() / float(len(in_degree))
        p_median_in = in_degree.median()
        p_median_out = out_degree.median()
        p_skew_in = skew(in_degree)
        p_skew_out = skew(out_degree)
    except (IOError, ValueError) as err:
        # Same best-effort policy as before (the project is skipped), but the
        # cause is kept for inspection instead of being discarded.
        skipped_projects[project_id] = repr(err)
        continue
    mean[project_id] = p_mean
    median_in[project_id] = p_median_in
    median_out[project_id] = p_median_out
    skew_in[project_id] = p_skew_in
    skew_out[project_id] = p_skew_out

In [None]:
# One row per project id, one column per statistic gathered in the loop.
degree_columns = {
    "mean_degree": mean,
    "median_in": median_in,
    "median_out": median_out,
    "skew_in": skew_in,
    "skew_out": skew_out,
}
df_degree = pd.DataFrame(degree_columns)

# Rows with a strictly positive mean degree (used for the log-scale plots).
has_positive_mean = df_degree["mean_degree"] > 0
df_degree_nz = df_degree[has_positive_mean]

# The file on disk receives the unfiltered table.
stats_path = exp.get_filename(stats_file)
df_degree.to_csv(stats_path)

In [None]:
# Column-wise inner join of efficiency, performance and the positive-mean
# degree statistics: only index labels present in all three frames survive.
df_all = pd.concat([eff, perf, df_degree_nz], axis=1, join="inner")

In [None]:
# Histograms (60 bins each) of the log-transformed degree statistics.
# The panels are selected with plt.subplot: this notebook only imports
# `pyplot as plt`, so the bare name `subplot` raises NameError on a fresh
# kernel unless pylab mode happens to be active.
plt.figure(figsize=(8,10))

plt.subplot(3,1,1)
plt.title('Mean degree')
plt.xlabel('log(Mean degree)')
plt.ylabel('Freq')
plt.hist(np.log(df_all["mean_degree"]), 60)

plt.subplot(3,1,2)
plt.title('In-Degree Skewness')
plt.xlabel('log(Skewness)')
plt.ylabel('Freq')
# The explicit range restricts the bins to log-skewness values in [-2, 4].
plt.hist(np.log(df_all["skew_in"]), 60, range=(-2,4))

plt.subplot(3,1,3)
plt.title('Out-Degree Skewness')
plt.xlabel('log(Skewness)')
plt.ylabel('Freq')
plt.hist(np.log(df_all["skew_out"]), 60, range=(-2,4))

plt.tight_layout()

In [None]:
# Scatter plots of log(mean degree) against log(performance) and
# log(efficiency); each title reports the Pearson r and p-value of the pair.
# `ylabel` is called through plt: the bare name is undefined with this
# notebook's imports and raises NameError on a fresh kernel.
plt.figure(figsize=(8,10))
log_degree = np.log(df_all["mean_degree"])

plt.subplot(2,1,1)
log_performance = np.log(df_all["performance"])
r,p = spstats.pearsonr(log_degree, log_performance)
plt.title("Degree-Performance r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Mean degree)")
plt.ylabel("log(Performance)")
plt.plot(log_degree, log_performance, ".")

plt.subplot(2,1,2)
log_efficiency = np.log(df_all["efficiency"])
plt.plot(log_degree, log_efficiency, ".")
r,p = spstats.pearsonr(log_degree, log_efficiency)
plt.title("Degree-Efficiency r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Mean degree)")
plt.ylabel("log(Efficiency)")

plt.tight_layout()

In [None]:
# Same layout as the previous figure, but the lower panel looks at the
# mean-normalised component-a efficiency (a_eff) instead of the combined one.
# `ylabel` is called through plt: the bare name is undefined with this
# notebook's imports and raises NameError on a fresh kernel.
plt.figure(figsize=(8,10))
log_degree = np.log(df_all["mean_degree"])

plt.subplot(2,1,1)
log_performance = np.log(df_all["performance"])
r,p = spstats.pearsonr(log_degree, log_performance)
plt.title("Degree-Performance r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Mean degree)")
plt.ylabel("log(Performance)")
plt.plot(log_degree, log_performance, ".")

plt.subplot(2,1,2)
# a_eff is indexed by the rows of df_nz, whereas df_all only keeps the rows
# that also have degree statistics. pearsonr pairs values by position, so
# a_eff is first restricted to (and ordered like) df_all's index; otherwise
# the two inputs differ in length or pair up values of different projects.
log_a_eff = np.log(a_eff.loc[df_all.index])
# Plot the same series the correlation is computed on, so that the panel and
# the r/p values in its title describe the same data.
plt.plot(log_degree, log_a_eff, ".")
r,p = spstats.pearsonr(log_degree, log_a_eff)
plt.title("Degree-Efficiency r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Mean degree)")
plt.ylabel("log(Efficiency)")

plt.tight_layout()

In [None]:
# Scatter plots of log(out-degree skewness) against log(performance) and
# log(efficiency); each title reports the Pearson r and p-value of the pair.
# `ylabel` is called through plt: the bare name is undefined with this
# notebook's imports and raises NameError on a fresh kernel.
plt.figure(figsize=(8,10))
log_skew = np.log(df_all["skew_out"])
log_performance = np.log(df_all["performance"])
log_efficiency = np.log(df_all["efficiency"])
# np.log gives NaN for negative (or NaN) skewness and -inf for zero, and a
# single such value makes pearsonr return NaN. The correlation is therefore
# computed on the rows with a finite log-skewness; when every value is finite
# this is the same computation as before. The plots still receive the full
# series (matplotlib simply does not draw non-finite points).
finite = np.isfinite(log_skew)

plt.subplot(2,1,1)
r,p = spstats.pearsonr(log_skew[finite], log_performance[finite])
plt.title("Skew-Performance r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Out-degree skewness)")
plt.ylabel("log(Performance)")
plt.plot(log_skew, log_performance, ".")

plt.subplot(2,1,2)
plt.plot(log_skew, log_efficiency, ".")
r,p = spstats.pearsonr(log_skew[finite], log_efficiency[finite])
plt.title("Skew-Efficiency r=%0.2f p=%0.4f" % (r,p))
plt.xlabel("log(Out-degree skewness)")
plt.ylabel("log(Efficiency)")

plt.tight_layout()