In [None]:
import calendar
import datetime
import json
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
import scipy.stats as spstats
from sqlalchemy import distinct, func, select
import database
from database.schema import Rating, revision_table
import logbook

In [None]:
# --- Experiment handle ------------------------------------------------------
exp_name = "26_combine_data"
exp = logbook.Experiment(exp_name)

# --- Input files --------------------------------------------------------------
# Project list: one JSON document per line, each carrying a "project_id".
project_file = "data/projects-2016-10-14-dedup.json"

# CSV tables produced by earlier experiments.
transition_file = "output/11_find_performance/2017-05-03 08:54:42 b8d538c/fa_ga_transitions.csv"
efficiency_files = [
    "output/10_find_efficiency/2017-04-25 23:09:13 3bd3e7d/efficiency.csv",
]
control_file = "output/24_find_controls/2017-05-11 12:45:30 f7ca849/controls.csv"

# Path template: "%d" is substituted with a project id when the file is read.
degree_file = "output/19_find_degree/2017-05-01 15:38:59 41cb865/%d-degree.csv"
#mincut_file = "output/22_plot_mincut/2017-01-16 12:35:18 a473ea7/mincut_stats.csv"

# --- Output -------------------------------------------------------------------
# File name handed to exp.get_filename() when the combined table is written.
out_file = "combined.csv"

In [None]:
# Collect the "project_id" of every record in the project file, which is read
# as one JSON document per line.
project_ids = []
with open(project_file, "rb") as f:
    for row in f:
        if not row.strip():
            # A blank line (e.g. a trailing newline at the end of the file) is
            # not a JSON document; json.loads would raise ValueError on it.
            continue
        data = json.loads(row)
        project_ids.append(data["project_id"])

In [None]:
# Load every efficiency table and stack them row-wise into a single frame.
#
# pd.read_csv(path, index_col=0, parse_dates=True) is the documented
# equivalent of DataFrame.from_csv(path), which was deprecated in pandas 0.21
# and removed in pandas 1.0.  A single concat over all files also replaces the
# None-sentinel loop; with an empty `efficiency_files` list it fails right here
# with pandas' "No objects to concatenate" error instead of later on `None`.
df_eff = pd.concat(
    [pd.read_csv(path, index_col=0, parse_dates=True)
     for path in efficiency_files])

# Keep only the rows in which all three counts (N_*) and all three deltas
# (Delta_*) are strictly positive; df_nz is the frame the Delta/N ratios are
# computed from afterwards.
positive_columns = ["N_a", "N_b", "N_c", "Delta_a", "Delta_b", "Delta_c"]
df_nz = df_eff[(df_eff[positive_columns] > 0).all(axis=1)]

In [None]:
# Raw per-stage ratios Delta / N for the a, b and c columns of df_nz.
a_eff = df_nz["Delta_a"] / df_nz["N_a"]
b_eff = df_nz["Delta_b"] / df_nz["N_b"]
c_eff = df_nz["Delta_c"] / df_nz["N_c"]

# Mean of each ratio series, computed as sum over number of rows.
mean_a = a_eff.sum() / len(a_eff)
mean_b = b_eff.sum() / len(b_eff)
mean_c = c_eff.sum() / len(c_eff)

# Combined efficiency: product of the three mean-normalized ratios.  The
# expression is evaluated in the original left-to-right order so the
# floating-point results stay bit-for-bit the same.
df = pd.DataFrame(
    {"efficiency": a_eff/mean_a * b_eff/mean_b * c_eff/mean_c})

# Leave the individual ratios in their mean-normalized form.
a_eff = a_eff / mean_a
b_eff = b_eff / mean_b
c_eff = c_eff / mean_c

# Attach the new column, aligned on the index (rows that were filtered out of
# df_nz get NaN).  An "efficiency" column already present in df_eff is dropped
# first so that re-running this cell replaces the column instead of appending
# a duplicate one each time.
df_eff = pd.concat(
    [df_eff.drop("efficiency", axis=1, errors="ignore"), df], axis=1)

In [None]:
# Load the transition table.  read_csv(..., index_col=0, parse_dates=True) is
# the documented equivalent of DataFrame.from_csv, which was deprecated in
# pandas 0.21 and removed in pandas 1.0.
df_perf = pd.read_csv(transition_file, index_col=0, parse_dates=True)

# performance = to_ga / (to_ga + to_fa), computed row by row.
df = pd.DataFrame(
    {"performance": df_perf["to_ga"]/(df_perf["to_ga"]+df_perf["to_fa"])})
df_perf = pd.concat([df_perf, df], axis=1)

In [None]:
# Load the control table.  read_csv(..., index_col=0, parse_dates=True) is the
# documented equivalent of DataFrame.from_csv, which was deprecated in pandas
# 0.21 and removed in pandas 1.0.
df_con = pd.read_csv(control_file, index_col=0, parse_dates=True)

In [None]:
# Degree
def skew(x):
    """Return a skewness estimate for the values in ``x``.

    The third central moment (sum of cubed deviations from the mean divided
    by ``n``) is divided by the 1.5 power of the variance computed with an
    ``n - 1`` denominator.

    ``x`` must support ``len()``, ``.sum()`` and element-wise arithmetic,
    e.g. a pandas Series or a NumPy array.
    """
    count = float(len(x))
    centered = x - x.sum() / count
    squared = centered * centered
    third_moment = (squared * centered).sum() / count
    variance = squared.sum() / (count - 1.0)
    return third_moment / np.power(variance, 1.5)

# Per-project degree statistics, each dict keyed by project id.
mean = {}
skew_in = {}
skew_out = {}
for project_id in project_ids:
    try:
        # read_csv(..., index_col=0, parse_dates=True) is the documented
        # equivalent of DataFrame.from_csv, which was deprecated in pandas
        # 0.21 and removed in pandas 1.0.
        df = pd.read_csv(
            degree_file % project_id, index_col=0, parse_dates=True)
        if len(df) < 1:
            # No rows: nothing to summarize for this project.
            continue
        in_degree = df["in_degree"]
        # Mean in-degree, computed as sum over number of rows.
        mean[project_id] = in_degree.sum() / float(len(in_degree))
        skew_in[project_id] = skew(in_degree)
        skew_out[project_id] = skew(df["out_degree"])
    except (IOError, ValueError):
        # Deliberate best effort: a project for which reading the degree file
        # or computing the statistics raises IOError or ValueError is skipped,
        # keeping whatever entries were already stored for it.
        continue

# One row per project id; projects missing from a dict get NaN in that column.
df_degree = pd.DataFrame({
    "degree_mean": mean,
    "in_degree_skew": skew_in,
    "out_degree_skew": skew_out})

In [None]:
#df_mincut = pd.DataFrame.from_csv(mincut_file)

In [None]:
#df_combined = pd.concat([df_perf, df_eff, df_con, df_degree, df_mincut], axis=1, join="outer")
# Join the per-project tables column-wise on their index, keeping every index
# value that appears in any of them, and write the result to the experiment's
# output file.
frames = [df_perf, df_eff, df_con, df_degree]
df_combined = pd.concat(frames, axis=1, join="outer")
combined_path = exp.get_filename(out_file)
df_combined.to_csv(combined_path)