In [None]:
%pylab inline
import calendar
import datetime
import json
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy.stats as spstats
from sqlalchemy import distinct, func, select
import database
from database.schema import Rating, revision_table
import logbook

In [None]:
# Experiment handle; exp.get_filename() is used further down to resolve the
# location of the combined output file.
exp_name = "26i_combine_data_imp"
exp = logbook.Experiment(exp_name)
# Input: project list, read line by line with one JSON document per line.
project_file = "data/projects-2016-10-14-dedup.json"
# Inputs: CSV tables produced by earlier experiments (timestamped run folders).
transition_file = "archive/11c_find_performance_imp/2017-12-19 08:27:10 81807fc/fa_ga_transitions.csv"
stage_eff_file = "archive/10c_find_imp_stage_eff/2017-12-19 08:26:52 81807fc/efficiency.csv"
control_file = "archive/24i_find_imp_controls/2017-12-19 09:57:00 81807fc/controls.csv"
# NOTE(review): this folder name uses spaces ("24ci find similarity") while
# every other experiment folder uses underscores -- confirm the path exists.
similarity_file = "archive/24ci find similarity/2017-12-19 08:31:06 81807fc/similarity_mean.csv"
article_talk_file = "archive/24di_find_imp_article_talk/2017-12-19 09:57:25 81807fc/article_talk.csv"
# Template: "%d" is filled with a project id to get that project's degree file.
degree_file = "output/19_find_degree/2017-05-01 15:38:59 41cb865/%d-degree.csv"
mincut_file = "archive/22_plot_mincut/2017-05-15 14:24:53 030d8fd/mincut_stats.csv"
# List of path-statistics files; they are read in order.
path_files = [
    "output/23_plot_path/2017-06-04 10:14:06 b25b01f/path_stats.csv"
]
# File name of the combined table (resolved through exp.get_filename).
out_file = "combined.csv"

In [None]:
# Collect the "project_id" field of every record in the project file, which is
# parsed as one JSON document per line.
with open(project_file, "rb") as f:
    project_ids = [json.loads(row)["project_id"] for row in f]

In [None]:
# Stage-efficiency table, indexed by the first CSV column.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(index_col=0, parse_dates=True) is its documented equivalent.
df_stage_eff = pd.read_csv(stage_eff_file, index_col=0, parse_dates=True)

In [None]:
# Transition counts, indexed by the first CSV column.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(index_col=0, parse_dates=True) is its documented equivalent.
df_perf = pd.read_csv(transition_file, index_col=0, parse_dates=True)
# performance = to_ga / (to_ga + to_fa), appended as a new last column.
df_perf = df_perf.assign(
    performance=df_perf["to_ga"] / (df_perf["to_ga"] + df_perf["to_fa"]))

In [None]:
# Control-variable table, indexed by the first CSV column.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(index_col=0, parse_dates=True) is its documented equivalent.
df_con = pd.read_csv(control_file, index_col=0, parse_dates=True)

In [None]:
# Degree
def skew(x):
    """Return a skewness estimate for the sample ``x``.

    The result is the mean cubed deviation from the sample mean (normalised
    by ``n``) divided by the variance (normalised by ``n - 1``) raised to the
    power 1.5.

    ``x`` must support ``len()``, ``.sum()`` and elementwise arithmetic, as a
    numpy array or pandas Series does.
    """
    count = float(len(x))
    centered = x - x.sum() / count
    squared = centered * centered
    third_moment = (squared * centered).sum() / count
    variance = squared.sum() / (count - 1.0)
    return third_moment / np.power(variance, 1.5)

# Per-project degree statistics: mean in-degree plus the skew of the in- and
# out-degree columns. Projects whose degree file is missing, unparsable or
# empty are still skipped (best effort), but are now recorded in
# degree_skipped and reported instead of disappearing silently.
mean = {}
skew_in = {}
skew_out = {}
degree_skipped = {}
for project_id in project_ids:
    try:
        # read_csv(index_col=0, parse_dates=True) is the documented
        # equivalent of DataFrame.from_csv, which pandas 1.0 removed.
        df = pd.read_csv(degree_file % project_id, index_col=0, parse_dates=True)
        if len(df) < 1:
            degree_skipped[project_id] = "empty degree file"
            continue
        mean[project_id] = df["in_degree"].sum() / float(len(df["in_degree"]))
        skew_in[project_id] = skew(df["in_degree"])
        skew_out[project_id] = skew(df["out_degree"])
    except (IOError, ValueError) as err:
        # Same exception types the notebook always tolerated here.
        degree_skipped[project_id] = repr(err)
if degree_skipped:
    print("Skipped %d of %d projects without usable degree data"
          % (len(degree_skipped), len(project_ids)))
df_degree = pd.DataFrame({
    "degree_mean": mean,
    "in_degree_skew": skew_in,
    "out_degree_skew": skew_out})

In [None]:
# Min-cut statistics, indexed by the first CSV column.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(index_col=0, parse_dates=True) is its documented equivalent.
df_mincut = pd.read_csv(mincut_file, index_col=0, parse_dates=True)

In [None]:
# Path statistics keyed by the index of each path-stats file. Files are read
# in list order, so a later file overrides an earlier one for the same index.
harmonic = {}
path_mean = {}
connectivity = {}
for path_file in path_files:
    # read_csv(index_col=0, parse_dates=True) is the documented equivalent of
    # DataFrame.from_csv, which pandas 1.0 removed. The per-file frame gets its
    # own name so it is no longer confused with the combined df_path below.
    path_stats = pd.read_csv(path_file, index_col=0, parse_dates=True)
    for index, row in path_stats.iterrows():
        harmonic[index] = row["path_harmonic"]
        path_mean[index] = row["path_mean"]
        connectivity[index] = row["connectivity"]
df_path = pd.DataFrame({
    "path_harmonic": harmonic,
    "path_mean": path_mean,
    "connectivity": connectivity
})

In [None]:
# Similarity table, read with pandas' default positional index (no index_col).
# NOTE(review): this frame is later joined column-wise on its index with
# frames that are indexed by their first CSV column. Confirm the positional
# index is intended; if the first column holds the same keys, index_col=0 is
# needed for the rows to line up.
df_similarity = pd.read_csv(similarity_file)

In [None]:
# Article/talk counts with an added column
# talk_fraction = talk_count / (article_count + talk_count).
# NOTE(review): read with the default positional index, while the frames it is
# later joined with are indexed by their first CSV column -- confirm alignment.
df_talk = pd.read_csv(article_talk_file).assign(
    talk_fraction=lambda counts: counts["talk_count"]
    / (counts["article_count"] + counts["talk_count"]))

In [None]:
# Join every source frame column-wise on its index, keeping the union of all
# index values (outer join), then write the result to the experiment's file.
source_frames = [
    df_perf, df_stage_eff, df_con, df_degree,
    df_mincut, df_path, df_talk, df_similarity,
]
df_combined = pd.concat(source_frames, axis=1, join="outer")
combined_path = exp.get_filename(out_file)
df_combined.to_csv(combined_path)

In [None]:
# Scatter of degree_mean against flow_mean. The title reports the Pearson
# correlation computed from the plotted points; it used to be a hard-coded
# number that would silently go stale whenever the inputs change.
degree_flow = df_combined[["degree_mean", "flow_mean"]].dropna()
plt.figure()
plt.plot(degree_flow["degree_mean"], degree_flow["flow_mean"], "o")
plt.xlabel("Mean In-degree")
plt.ylabel("Mean Min-cut")
if len(degree_flow) >= 2:
    r_value = spstats.pearsonr(degree_flow["degree_mean"],
                               degree_flow["flow_mean"])[0]
    plt.title("Pearson r = %.7f" % r_value)
else:
    # pearsonr needs at least two complete pairs.
    plt.title("Pearson r = n/a")

In [None]:
# 60-bin histogram of the path_harmonic values that are present and <= 6.
# plt.rc changes the font size globally, so later figures are affected too.
plt.rc("font", size=16)
harmonic_lengths = df_combined["path_harmonic"].dropna()
x = list(harmonic_lengths[harmonic_lengths <= 6])
plt.hist(x, bins=60)
plt.tight_layout()
plt.xlabel("Harmonic Mean Path Length")
plt.ylabel("Frequency")

In [None]:
# 60-bin histogram of degree_mean over the rows that have a value.
degree_means = df_combined["degree_mean"].dropna()
plt.hist(degree_means, bins=60)
plt.tight_layout()

In [None]:
# Number of rows in the combined table that have a degree_mean value.
df_combined["degree_mean"].dropna().shape[0]

In [None]:
# 60-bin histogram of flow_mean over the rows that have a value.
flow_means = df_combined["flow_mean"].dropna()
plt.hist(flow_means, bins=60)
plt.tight_layout()

In [None]:
# Inspection cell: list the column labels of the combined table.
df_combined.columns

In [None]:
# Elementwise ratios Delta_x / N_x (*_eff) and stage_Delta_x / stage_N_x
# (*_stage) for the three column groups a, b and c.
def _column_ratio(numerator, denominator):
    """Return df_combined[numerator] / df_combined[denominator]."""
    return df_combined[numerator] / df_combined[denominator]

a_eff = _column_ratio("Delta_a", "N_a")
a_stage = _column_ratio("stage_Delta_a", "stage_N_a")
b_eff = _column_ratio("Delta_b", "N_b")
b_stage = _column_ratio("stage_Delta_b", "stage_N_b")
c_eff = _column_ratio("Delta_c", "N_c")
c_stage = _column_ratio("stage_Delta_c", "stage_N_c")

In [None]:
# Three side-by-side log-log scatters of the *_eff ratio against the matching
# *_stage ratio for groups a, b and c. Axes are created through plt.subplots,
# so the cell no longer relies on the bare `subplot` name that only exists
# because of the %pylab star import.
fig, axes = plt.subplots(1, 3, figsize=(9, 3))
ratio_pairs = [(a_eff, a_stage), (b_eff, b_stage), (c_eff, c_stage)]
for ax, (overall, stage) in zip(axes, ratio_pairs):
    ax.loglog(overall, stage, 'o')
# pearsonr propagates NaN/inf (missing rows from the outer join, zero
# denominators), so correlate only the rows where both ratios are finite.
finite = np.isfinite(a_eff) & np.isfinite(a_stage)
spstats.pearsonr(a_eff[finite], a_stage[finite])

In [None]:
# Inspection cell: display the stage-efficiency table loaded earlier.
df_stage_eff