In [1]:
import os, sys, re
from functools import partial

import pandas as pd
import numpy as np

from bibtexparser.latexenc import string_to_latex

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import database
from snowballing.operations import load_work_map_all_years, work_to_bibtex, reload
from snowballing.operations import match_bibtex_to_work
from snowballing.approaches import get_approaches, name, wlatex_name, wcitea
# NOTE(review): reload() comes from snowballing.operations; presumably it
# refreshes the loaded database modules — confirm.
reload()

all_approaches = get_approaches()

# Split the (approach, metadata) pairs in one pass on the metadata "binary" flag.
script, binary = [], []
for approach, meta in all_approaches:
    (binary if meta["binary"] else script).append((approach, meta))

# Only the non-binary approaches are tabulated below.
approaches = [approach for approach, _ in script]

len(script)

27

In [2]:
# Distinct goals listed under "supports" across the non-binary approaches.
set().union(*(meta["supports"] for _, meta in script))

{Reproducibility, Management, Comprehension, Caching, Framework}

In [3]:
from database.groups import YES, NO
from database.groups import CACHE, COMPREHENSION, FRAMEWORK, MANAGEMENT, REPRODUCIBILITY, SECURITY
columns = [CACHE, COMPREHENSION, FRAMEWORK, MANAGEMENT, REPRODUCIBILITY, SECURITY]


# Rebinds `name`, replacing the helper imported from snowballing.approaches.
def name(x):
    """Return ``x.display`` with each double space stripped out."""
    return x.display.replace("  ", "")


# First table row: the two fixed labels followed by one label per goal column.
header = [["Approach", "Main goal"] + [goal.value for goal in columns]]

# One row per approach: display name, main goal, then a YES/NO mark per column
# depending on whether the column is listed in the approach's "supports".
category_table = list(header)
for approach in approaches:
    meta = approach._meta[0]
    row = [name(approach), meta["goal"].value]
    row.extend(
        (YES if goal in meta["supports"] else NO).value
        for goal in columns
    )
    category_table.append(row)

In [4]:
df = pd.DataFrame(category_table)

# Positional columns 0 and 1 hold the approach name and its main goal, so the
# per-goal flags start at position 2.
goal_positions = range(2, 2 + len(columns))

# Summary row: per goal column, the number of cells equal to YES.value.
df.loc[len(df)] = ["Total", ""] + [
    df.loc[df[position] == YES.value, position].count()
    for position in goal_positions
]

# Summary row: per goal column, the number of approaches whose main goal it is.
df.loc[len(df)] = ["Total main goal", ""] + [
    sum(1 for approach in approaches if approach._meta[0]["goal"] == goal)
    for goal in columns
]

# Row 0 still contains the header labels: promote it to the column names,
# index by approach name, then drop that header row from the data.
df.columns = df.iloc[0]
df = df.set_index('Approach')
df.index.name = None
df = df[1:]

df

Unnamed: 0,Main goal,Caching,Comprehension,Framework,Management,Reproducibility,Security
Astro-WISE,Framework,✓,✓,✓,✗,✓,✗
Becker and Chambers,Comprehension,✗,✓,✗,✗,✓,✗
"Bochner, Gude, and Schreiber",Comprehension,✗,✓,✓,✗,✗,✗
CPL,Framework,✗,✓,✓,✗,✗,✗
CXXR,Comprehension,✗,✓,✗,✗,✗,✗
Datatrack,Management,✗,✓,✗,✓,✓,✗
ES3,Comprehension,✗,✓,✗,✗,✗,✗
ESSW,Management,✗,✓,✗,✓,✗,✗
IncPy,Caching,✓,✗,✗,✗,✗,✗
Lancet,Reproducibility,✗,✓,✗,✓,✓,✗


In [5]:
# Export the goal table to an Excel workbook. The context manager saves and
# closes the workbook on exit, including when the export raises, so no explicit
# save call is needed.
with pd.ExcelWriter('output/goal.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

In [6]:
# Number of approaches whose metadata "binary" flag is truthy.
len(binary)

14

In [7]:
# Number of approaches whose metadata "binary" flag is falsy.
len(script)

27

In [8]:
# `work` attribute of the first non-binary approach (the first element of the
# first (approach, metadata) pair).
script[0][0].work

[Astro-wise: Tracing and using lineage for scientific data processing,
 Dynamic Pipeline Changes in Scientific Data Processing]

In [9]:
# Read the bibliography with an explicit encoding so the result does not depend
# on the platform default.
# NOTE(review): assumes bibliography.bib is UTF-8 encoded — confirm.
with open('../../csur/bibliography.bib', encoding='utf-8') as bibtex_file:
    bibtex_str = bibtex_file.read()

# Only the text after the last "%Entries" marker is matched against the database.
matched = match_bibtex_to_work(bibtex_str.split("%Entries")[-1])

# Swap each matched pair so that its second element becomes the dictionary key.
works = {second: first for first, second in matched}

# Pre-bind the lookup table so the helpers can be called with an approach only.
latex_name = partial(wlatex_name, works=works)
citea = partial(wcitea, works=works)

In [10]:
# Same goal columns as the earlier table, minus SECURITY.
columns = [
    CACHE,
    COMPREHENSION,
    FRAMEWORK,
    MANAGEMENT,
    REPRODUCIBILITY,
]
approaches = [approach for approach, _ in script]

# Distinct goals listed under "supports" across all approaches, binary included.
set().union(*(meta["supports"] for _, meta in all_approaches))

{Reproducibility, Management, Comprehension, Security, Caching, Framework}

In [11]:
# One LaTeX row per approach: citation, main goal, then a \true / \false macro
# per goal column depending on whether the column is in "supports".
category_table = []
for approach in approaches:
    meta = approach._meta[0]
    cells = [citea(approach), meta["goal"].value]
    for goal in columns:
        cells.append(r"\true" if goal in meta["supports"] else r"\false")
    category_table.append(cells)

In [12]:
# Per goal column: how many table rows are marked \true in that column.
# The first two cells of every row are the citation and the main goal, so the
# goal cells start at offset 2.
supported_counts = [
    sum(1 for row in category_table if row[offset] == r"\true")
    for offset in range(2, len(columns) + 2)
]

# Per goal column: how many approaches declare it as their main goal.
main_goal_counts = [
    sum(1 for approach in approaches if approach._meta[0]["goal"] == column)
    for column in columns
]

total = [r"\textbf{Total}", ""] + [str(count) for count in supported_counts]

total_main_goal = (
    [r"\textbf{Total Main Goal}", ""] +
    [str(count) for count in main_goal_counts]
)

slash = (
    [r"\textbf{Main Goal / Total}", ""] +
    ["{} / {}".format(main, supported)
     for main, supported in zip(main_goal_counts, supported_counts)]
)

# Only the combined "main goal / total" row goes into the table; `total` and
# `total_main_goal` are computed but not appended.
# The guard keeps the cell safe to re-run: a summary row that is already the
# last row is not appended a second time (its cells never equal \true, so the
# counts above are unaffected by its presence).
if not category_table or category_table[-1] != slash:
    category_table.append(slash)

In [13]:
# Short column titles used in the LaTeX table header.
RENAME = {
    CACHE: "Cache",
    COMPREHENSION: "Compr",
    FRAMEWORK: "Frame",
    MANAGEMENT: "Manag",
    REPRODUCIBILITY: "Repro",
    SECURITY: "Secur"
}

# Table preamble and the two header rows. Both the column specification (one
# "S?" per goal column) and the \multicolumn span follow len(columns), so the
# header stays consistent if the column list changes.
header = (
    r"\begin{tabular}{?>{\centering\arraybackslash} m{0.3\textwidth}?>{\centering\arraybackslash} m{0.15\textwidth}?"
    + "S?" * len(columns)
    + r""" }
  \arrayrulecolor{LightBlueLine}\hline
  \rowcolor{BlueRow}
  \textbf{Approach} & \textbf{Main goal} & \multicolumn{"""
    + str(len(columns))
    + r"""}{c}{\textbf{Secondary goals}}\\
  \arrayrulecolor{LightBlueLine}\hline
   &  & """
    + " & ".join("\\textbf{{{}}}".format(RENAME[column]) for column in columns)
    + r"""\\
  \arrayrulecolor{LightBlueLine}\hline
"""
)
footer = r"""
\end{tabular}
"""

In [14]:
# Render every table row as LaTeX lines, each followed by a horizontal rule.
result = []
for position, cells in enumerate(category_table):
    # Even-numbered rows (counting from 0) get the blue background.
    shaded = [r"  \rowcolor{BlueRow}"] if position % 2 == 0 else []
    result.extend(
        shaded
        + ["  " + " & ".join(cells) + r"\\", r"  \arrayrulecolor{LightBlueLine}\hline"]
    )

In [15]:
# Assemble header, body rows and footer, separated by newlines, and write the
# table to the paper's figures directory. The encoding is explicit so the
# written bytes do not depend on the platform default.
latex = "\n".join([header, "\n".join(result), footer])
with open("../../csur/figs/goals.tex", "w", encoding="utf-8") as goals:
    goals.write(latex)

In [16]:
# List every work of a binary approach whose category is not "binarysnowball".
# A work attached to several approaches is printed once per approach.
uncategorized = (
    work
    for approach, _ in binary
    for work in approach.work
    if work.category != "binarysnowball"
)
for work in uncategorized:
    print(work.display, work.year, work)

Guo's Thesis 2012 Software tools to facilitate research programming
Guo's Thesis 2012 Software tools to facilitate research programming
PASS 2010 Foundations for provenance-aware systems
ES3 2008 Automatic capture and reconstruction of computational provenance
ES3 2004 Earth System Science Server (ES3): Local Infrastructure for Earth Science Product Management
ES3 2008 Es3: A demonstration of transparent provenance for scientific computation
ES3 2005 Tracking the lineage of arbitrary processing sequences
ES3 2010 Automatic Provenance Collection and Publishing in a Science Data Production Environment -- Early Results
ES3 2011 Provenance-enabled automatic data publishing
