In [None]:
%pylab inline
import calendar
import datetime
import json
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy.stats as spstats
from sqlalchemy import distinct, func, select
import database
from database.schema import Rating, revision_table
import logbook

# Notebook-wide plot defaults: base font size, inline figure resolution and
# marker size. plt.rcParams is the same settings object as matplotlib.rcParams.
plt.rcParams.update({
    'font.size': 9,
    'figure.dpi': 150,
    'lines.markersize': 9,
})

In [None]:
# Experiment handle; exp.get_filename() is used further down to name outputs.
exp_name = "26_combine_data"
exp = logbook.Experiment(exp_name)

# Input files. Each one is the output of an earlier numbered experiment.
# project_file is parsed line by line as JSON to collect project ids.
project_file = "data/projects-2016-10-14-dedup.json"
transition_file = "archive/11_find_performance/2017-05-03 08:54:42 b8d538c/fa_ga_transitions.csv"
bga_perf_file = "archive/11b_find_bga_perf/2017-06-09 16:36:32 2c197d3/project_bga.csv"
# A list: every file in it is loaded and the frames are stacked.
efficiency_files = [
    "output/10_find_efficiency/2017-04-25 23:09:13 3bd3e7d/efficiency.csv"]
stage_eff_file = "archive/10b_find_stage_efficiency/2017-08-24 15:11:01 de67c2b/efficiency.csv"
control_file = "archive/24_find_controls/2017-05-11 12:45:30 f7ca849/controls.csv"
importance_file = "archive/04b_find_importance/2017-10-03 13:54:30 14665f3/importance.utf8.tsv"
# NOTE(review): this directory name contains a space ("24c find_similarity")
# where every other experiment directory uses an underscore - confirm it
# matches the path on disk.
similarity_file = "archive/24c find_similarity/2017-10-02 11:17:41 14665f3/similarity_mean.csv"
article_talk_file = "archive/24d_find_article_talk/2017-11-20 11:55:04 992baf5/article_talk.csv"
# Template: the %d placeholder is filled with a project id when loading.
degree_file = "output/19_find_degree/2017-05-01 15:38:59 41cb865/%d-degree.csv"
mincut_file = "archive/22_plot_mincut/2017-05-15 14:24:53 030d8fd/mincut_stats.csv"
# A list: rows from every file are merged, later files overriding earlier ones.
path_files = [
    "output/23_plot_path/2017-06-04 10:14:06 b25b01f/path_stats.csv"
]
# Name of the combined table written through exp.get_filename().
out_file = "combined.csv"

In [None]:
# The project file holds one JSON document per line; keep each project_id
# in file order.
with open(project_file, "rb") as project_fh:
    project_ids = [json.loads(line)["project_id"] for line in project_fh]

In [None]:
# Load every efficiency table and stack them in one pass.
# pd.read_csv(path, index_col=0, parse_dates=True) is the documented
# replacement for DataFrame.from_csv, which was deprecated in pandas 0.21
# and removed in 1.0; it reads the first column as the index exactly as
# from_csv did.
df_eff = pd.concat(
    [pd.read_csv(f, index_col=0, parse_dates=True) for f in efficiency_files])

# Keep only rows where every count (N_*) and every delta (Delta_*) is
# strictly positive. Rows with a zero, negative or missing value in any of
# these columns are dropped, because NaN > 0 is False.
positive_cols = ["N_a", "N_b", "N_c", "Delta_a", "Delta_b", "Delta_c"]
df_nz = df_eff[(df_eff[positive_cols] > 0).all(axis=1)]

In [None]:
# Raw per-stage ratios Delta_* / N_* over the strictly-positive rows.
a_eff, b_eff, c_eff = (
    df_nz["Delta_" + stage] / df_nz["N_" + stage] for stage in "abc")

# Mean of each ratio, computed as sum over row count.
mean_a, mean_b, mean_c = (
    ratio.sum() / len(ratio) for ratio in (a_eff, b_eff, c_eff))

# Combined efficiency: product of the three mean-normalised ratios. The
# expression is kept in its left-to-right form so the floating-point result
# is unchanged.
df = (a_eff/mean_a * b_eff/mean_b * c_eff/mean_c).to_frame("efficiency")

# Replace each raw ratio with its mean-normalised version.
a_eff, b_eff, c_eff = a_eff/mean_a, b_eff/mean_b, c_eff/mean_c

# Attach the new column by index; rows that were filtered out of df_nz have
# no efficiency value and come out as NaN.
df_eff = pd.concat([df_eff, df], axis=1)

In [None]:
# Per-stage efficiency table, indexed by its first column.
# read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv, which
# was removed in pandas 1.0.
df_stage_eff = pd.read_csv(stage_eff_file, index_col=0, parse_dates=True)

In [None]:
# Transition counts, indexed by the first column.
# read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv, which
# was removed in pandas 1.0.
df_perf = pd.read_csv(transition_file, index_col=0, parse_dates=True)

# performance = share of to_ga among all (to_ga + to_fa) transitions.
# Rows where both counts are zero evaluate 0/0 and come out as NaN.
df = (df_perf["to_ga"] / (df_perf["to_ga"] + df_perf["to_fa"])).to_frame("performance")
df_perf = pd.concat([df_perf, df], axis=1)

In [None]:
# B/GA performance table, indexed by its first column.
# read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv, which
# was removed in pandas 1.0.
df_bga_perf = pd.read_csv(bga_perf_file, index_col=0, parse_dates=True)

In [None]:
# Control-variable table, indexed by its first column.
# read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv, which
# was removed in pandas 1.0.
df_con = pd.read_csv(control_file, index_col=0, parse_dates=True)

In [None]:
# Degree
def skew(x):
    """Return a skewness coefficient for the values in ``x``.

    The numerator is the mean cubed deviation from the mean (divided by n).
    The denominator is the variance estimate that divides by n - 1, raised
    to the power 1.5.

    ``x`` must support ``len``, ``.sum()`` and element-wise arithmetic
    (for example a NumPy array or a pandas Series).
    """
    count = float(len(x))
    centered = x - x.sum() / count
    third_moment = (centered * centered * centered).sum() / count
    variance = (centered * centered).sum() / (count - 1.0)
    return third_moment / np.power(variance, 1.5)

# Per-project degree statistics, keyed by project id.
mean = {}
skew_in = {}
skew_out = {}
for project_id in project_ids:
    try:
        # read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv,
        # which was removed in pandas 1.0.
        df = pd.read_csv(degree_file % project_id, index_col=0, parse_dates=True)
        if len(df) < 1:
            continue
        in_degree = df["in_degree"]
        # Compute all three statistics before storing any of them, so a
        # failure part-way cannot leave a project in one dict but not the
        # others.
        p_mean = in_degree.sum() / float(len(in_degree))
        p_skew_in = skew(in_degree)
        p_skew_out = skew(df["out_degree"])
    except (IOError, ValueError):
        # Deliberate best effort: a project whose degree file cannot be
        # opened, or whose load/summary raises ValueError, is left out of
        # the table instead of aborting the run.
        continue
    mean[project_id] = p_mean
    skew_in[project_id] = p_skew_in
    skew_out[project_id] = p_skew_out

# One row per project that had a usable degree file.
df_degree = pd.DataFrame({
    "degree_mean": mean,
    "in_degree_skew": skew_in,
    "out_degree_skew": skew_out})

In [None]:
# Min-cut statistics table, indexed by its first column.
# read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv, which
# was removed in pandas 1.0.
df_mincut = pd.read_csv(mincut_file, index_col=0, parse_dates=True)

In [None]:
# Path statistics keyed by the index label of each row. When the same label
# appears in more than one file (or more than once in a file) the last value
# read wins.
harmonic = {}
path_mean = {}
connectivity = {}
for f in path_files:
    # read_csv(index_col=0, parse_dates=True) replaces DataFrame.from_csv,
    # which was removed in pandas 1.0. A separate name is used for the
    # per-file frame so df_path only ever refers to the merged result.
    path_stats = pd.read_csv(f, index_col=0, parse_dates=True)
    # Whole-column dict updates replace the row-by-row iterrows() loop.
    harmonic.update(path_stats["path_harmonic"].to_dict())
    path_mean.update(path_stats["path_mean"].to_dict())
    connectivity.update(path_stats["connectivity"].to_dict())
df_path = pd.DataFrame({
    "path_harmonic": harmonic,
    "path_mean": path_mean,
    "connectivity": connectivity
})

In [None]:
# Similarity table.
# NOTE(review): this is read without index_col, so it normally gets a default
# integer index, whereas the inputs loaded with from_csv use their first
# column as the index. The later pd.concat(..., axis=1) aligns rows on the
# index - confirm these rows line up with the other tables (index_col=0 may
# be needed here).
df_similarity = pd.read_csv(similarity_file)

In [None]:
# Article/talk count table.
# NOTE(review): read without index_col, so it normally gets a default integer
# index, whereas the inputs loaded with from_csv use their first column as
# the index. The later pd.concat(..., axis=1) aligns rows on the index -
# confirm these rows line up with the other tables.
df_talk = pd.read_csv(article_talk_file)
# talk_fraction = talk_count as a share of article_count + talk_count.
df_talk["talk_fraction"] = df_talk["talk_count"] / (df_talk["article_count"] + df_talk["talk_count"])

In [None]:
# Place every table side by side. With axis=1 the rows are aligned on their
# index labels, and the outer join keeps a row for every label that occurs
# in any of the tables.
combined_parts = [
    df_perf, df_eff, df_stage_eff, df_con, df_degree,
    df_mincut, df_path, df_bga_perf, df_talk, df_similarity,
]
df_combined = pd.concat(combined_parts, axis=1, join="outer")

In [None]:
# Write the combined table to the path the experiment object returns for
# out_file.
combined_csv_path = exp.get_filename(out_file)
df_combined.to_csv(combined_csv_path)

In [None]:
# Scatter of degree_mean against flow_mean, one point per row.
plt.figure()
plt.plot(df_combined["degree_mean"], df_combined["flow_mean"], "o")
plt.xlabel("Mean In-degree")
plt.ylabel("Mean Min-cut")

# Compute the correlation from the plotted data instead of hard-coding a
# number in the title, so the label cannot go stale when inputs change.
# Only rows where both values are present are used.
degree_flow = df_combined[["degree_mean", "flow_mean"]].dropna()
if len(degree_flow) >= 2:
    degree_flow_r, degree_flow_p = spstats.pearsonr(
        degree_flow["degree_mean"], degree_flow["flow_mean"])
    plt.title("Pearson r = %.7f" % degree_flow_r)
else:
    # pearsonr needs at least two points.
    plt.title("Pearson r = n/a")

In [None]:
# Raises the default font size. plt.rc changes the global setting, so every
# figure drawn after this cell is affected as well.
plt.rc("font", size=16)
# Histogram of path_harmonic, restricted to values <= 6.
x = [d for d in df_combined["path_harmonic"].dropna() if d <=6]
plt.hist(x, 60)
plt.xlabel("Harmonic Mean Path Length")
plt.ylabel("Frequency")
# tight_layout runs last so that it accounts for the axis labels; it was
# previously called before they existed.
plt.tight_layout()

In [None]:
# Distribution of degree_mean over the rows that have a value, in 60 bins.
degree_mean_values = df_combined["degree_mean"].dropna()
plt.hist(degree_mean_values, bins=60)
plt.tight_layout()

In [None]:
# Number of rows that have a degree_mean value.
df_combined["degree_mean"].dropna().shape[0]

In [None]:
# Distribution of flow_mean over the rows that have a value, in 60 bins.
flow_mean_values = df_combined["flow_mean"].dropna()
plt.hist(flow_mean_values, bins=60)
plt.tight_layout()

In [None]:
# Display the column labels of the combined table.
df_combined.columns

In [None]:
# For each stage (a, b, c): the overall ratio Delta_* / N_* and the
# stage-level ratio stage_Delta_* / stage_N_*, taken from the combined table.
a_eff, a_stage = (
    df_combined["Delta_a"] / df_combined["N_a"],
    df_combined["stage_Delta_a"] / df_combined["stage_N_a"],
)
b_eff, b_stage = (
    df_combined["Delta_b"] / df_combined["N_b"],
    df_combined["stage_Delta_b"] / df_combined["stage_N_b"],
)
c_eff, c_stage = (
    df_combined["Delta_c"] / df_combined["N_c"],
    df_combined["stage_Delta_c"] / df_combined["stage_N_c"],
)

In [None]:
# Three log-log panels comparing the overall ratio with the stage-level
# ratio for stages a, b and c.
# plt.subplot is used explicitly: the bare name `subplot` exists only
# because %pylab injects it into the namespace.
plt.figure(figsize=(9,3))
plt.subplot(1,3,1)
plt.loglog(a_eff, a_stage, 'o')
plt.subplot(1,3,2)
plt.loglog(b_eff, b_stage, 'o')
plt.subplot(1,3,3)
plt.loglog(c_eff, c_stage, 'o')

# Correlation for stage a. pearsonr yields NaN as soon as either input
# holds a NaN or an infinity (missing rows after the outer join, or a
# division by zero), so only pairs where both values are finite are passed.
finite_pairs = np.isfinite(a_eff) & np.isfinite(a_stage)
spstats.pearsonr(a_eff[finite_pairs], a_stage[finite_pairs])

In [None]:
# Adds a b_life_eff column: Articles_b divided by stage_N_b, row by row.
# Rows with stage_N_b == 0 give inf or NaN here; later cells keep only rows
# with stage_N_b > 0 and b_life_eff > 0 before using the column.
df_combined["b_life_eff"] = df_combined["Articles_b"] / df_combined["stage_N_b"]

In [None]:
# Log-log scatter of Articles_b against stage_N_b, one dot per row.
stage_n_b = df_combined["stage_N_b"]
articles_b = df_combined["Articles_b"]
plt.loglog(stage_n_b, articles_b, '.')
plt.grid()

In [None]:
# comp_perf = (to_fa + to_ga) per article. The divisor comes from df_con, not
# from df_combined; pandas aligns the two operands on their index labels.
# NOTE(review): presumably df_con is used because df_talk also carries an
# article_count column, so df_combined would hold two columns of that name -
# confirm.
df_combined["comp_perf"] = (df_combined["to_fa"] + df_combined["to_ga"]) / df_con["article_count"]

In [None]:
# Keep the rows where stage_N_b, comp_perf and b_life_eff are all strictly
# positive; this is what makes the log10 transforms downstream well defined.
usable_rows = (
    (df_combined["stage_N_b"] > 0)
    & (df_combined["comp_perf"] > 0)
    & (df_combined["b_life_eff"] > 0)
)
df = df_combined[usable_rows]

In [None]:
import scipy.stats as spstats
# Work in log10 space: correlation and a degree-1 least-squares fit of
# log10(comp_perf) on log10(b_life_eff). m is the slope, b the intercept.
log_b_eff = np.log10(df["b_life_eff"])
log_comp_perf = np.log10(df["comp_perf"])
r, p = spstats.pearsonr(log_b_eff, log_comp_perf)
m, b = np.polyfit(log_b_eff, log_comp_perf, 1)
r,p

In [None]:
# End points of the fitted line: the x range of the data, and the fitted y at
# each end mapped back from log10 space with 10**(m*log10(x) + b).
xmin, xmax = df["b_life_eff"].min(), df["b_life_eff"].max()
ymin, ymax = (
    np.power(10, np.log10(x_end)*m+b) for x_end in (xmin, xmax))
xmin,xmax,ymin,ymax

In [None]:
# Log-log scatter of comp_perf against b_life_eff with the fitted line
# overlaid in green.
plt.figure(figsize=(3*15.0/7.0,2*15.0/7.0))
plt.loglog(df["b_life_eff"], df["comp_perf"], '.')
plt.loglog([xmin,xmax],[ymin,ymax],'g-', linewidth=2)
plt.ylabel("Performance", fontsize=16)
plt.xlabel("B-Efficiency", fontsize=16)
# Report the significance that was actually computed. The title previously
# claimed "p<0.001" whatever the value of p; it still reads that way when
# the claim holds.
p_label = "p<0.001" if p < 0.001 else "p = %0.3f" % p
plt.title("r = %0.2f, %s" % (r, p_label), fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=16)
plt.tight_layout()
# NOTE(review): saved relative to the working directory, unlike the outputs
# that go through exp.get_filename() - confirm this is intended.
plt.savefig('fig-perf-eff.pdf')

In [None]:
# Displays the fitted line evaluated at xmin.
# NOTE(review): m and b were fitted on log10 values and ymin is computed as
# np.log10(xmin)*m+b, but this expression uses xmin untransformed - confirm
# which form is intended.
xmin*m+b

In [None]:
# Larger, square version of the performance/productivity scatter with the
# fitted line overlaid in green.
plt.figure(figsize=(6,6))
plt.loglog(df["b_life_eff"], df["comp_perf"], '.')
plt.loglog([xmin,xmax],[ymin,ymax],'g-', linewidth=2)
plt.ylabel("Performance", fontsize=24)
plt.xlabel("B-Productivity", fontsize=24)
# Report the significance that was actually computed. The title previously
# claimed "p<0.001" whatever the value of p; it still reads that way when
# the claim holds.
p_label = "p<0.001" if p < 0.001 else "p = %0.3f" % p
plt.title("r = %0.2f, %s" % (r, p_label), fontsize=24)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.tight_layout()
# NOTE(review): saved relative to the working directory, unlike the outputs
# that go through exp.get_filename() - confirm this is intended.
plt.savefig('fig-perf-eff.png', dpi=600)
plt.savefig('fig-perf-eff.eps')

In [None]:
# Add a_life_eff and c_life_eff columns: Articles_* divided by stage_N_* for
# the a and c stages.
for stage in ("a", "c"):
    df_combined[stage + "_life_eff"] = (
        df_combined["Articles_" + stage] / df_combined["stage_N_" + stage])

In [None]:
def _log_hist_panel(position, values, bins, xlim, xlabel):
    """Draw one log10 histogram panel in a 2x2 grid and return its axes.

    position -- 1-based slot in the 2x2 grid
    values   -- strictly positive values; log10 is taken before binning
    bins     -- bin edges or bin count, passed straight to plt.hist
    xlim     -- [low, high] limits for the x axis
    xlabel   -- label for the x axis
    """
    axes = plt.subplot(2, 2, position)
    plt.hist(np.log10(values), bins, zorder=2)
    plt.xlim(xlim)
    plt.ylim([0, 300])
    plt.grid()
    plt.ylabel('Project Count')
    plt.xlabel(xlabel)
    return axes


plt.figure(figsize=(6,6))

# Panels 1-3: productivity of stages a, b and c. Only rows with a positive
# stage count and a positive productivity are plotted, all with the same
# bin edges.
productivity_panels = [
    (1, "stage_N_a", "a_life_eff", '$log_{10}$ A-Productivity'),
    (2, "stage_N_b", "b_life_eff", '$log_{10}$ B-Productivity'),
    (3, "stage_N_c", "c_life_eff", '$log_{10}$ C-Productivity'),
]
for position, count_col, eff_col, xlabel in productivity_panels:
    df = df_combined[df_combined[count_col] > 0]
    df = df[df[eff_col] > 0]
    _log_hist_panel(position, df[eff_col], np.linspace(-4, 2, 25), [-4, 2], xlabel)

# Panel 4: performance, 25 automatically placed bins over the positive rows.
df = df_combined[df_combined.comp_perf > 0]
ax = _log_hist_panel(4, df.comp_perf, 25, [-6, 0], '$log_{10}$ Performance')

plt.tight_layout()
plt.savefig(exp.get_filename('fig-eff-perf-hist.png'), dpi=600)
plt.savefig(exp.get_filename('fig-eff-perf-hist.eps'))