In [1]:
import os, sys, re
from functools import partial

import pandas as pd
import numpy as np

from bibtexparser.latexenc import string_to_latex

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import database
from snowballing.operations import load_work_map_all_years, work_to_bibtex, reload
from snowballing.operations import match_bibtex_to_work
from snowballing.approaches import get_approaches, name, wlatex_name, wcitea
reload()

# Load every (approach, metadata) pair once, then split the pairs on the
# metadata's "binary" flag.
all_approaches = get_approaches()
script = [
    (approach, meta)
    for approach, meta in all_approaches
    if not meta["binary"]
]
binary = [
    (approach, meta)
    for approach, meta in all_approaches
    if meta["binary"]
]
len(script)

27

In [2]:
# Distinct values found in the "storage" metadata of all approaches.
{option for _approach, meta in all_approaches for option in meta["storage"]}

{Memory,
 Graph Database (4store),
 Logic File (Datalog),
 Graph File (DDG),
 Interoperable Format (OPM),
 Key-Value Database (BerkeleyDB (v1)),
 Shadow Files,
 Log (Compressed with GZIP),
 Package (Self-Contained),
 File System (NILFS),
 Proprietary,
 Relational Database (SQLite),
 NoSQL (MongoDB),
 VCS}

In [3]:
from database.groups import NOSQL, SHADOW_FILES, LOG, GRAPH_FILE, RELATIONAL_DB, FILE
from database.groups import FILE_SYSTEM, PACKAGE, LOGIC_FILE, INTEROPERABLE, PROPRIETARY
from database.groups import MEMORY, SOURCE, KEY_VALUE_DB, REPOSITORY, GRAPH_DB, VCS, CONTENT_DATABASE

# Coarse category for each storage group. The groups are listed per category
# in the same order in which the lookup table is populated.
storage_categories = {
    group: category
    for category, members in (
        ('Database', (GRAPH_DB, RELATIONAL_DB, NOSQL)),
        ('Memory', (MEMORY,)),
        ('File', (CONTENT_DATABASE, LOG, INTEROPERABLE, LOGIC_FILE,
                  GRAPH_FILE, PROPRIETARY)),

        # Binary?
        ('Database', (KEY_VALUE_DB,)),
        ('File', (FILE_SYSTEM, PACKAGE, SHADOW_FILES)),
    )
    for group in members
}
# Coverage check: a storage value without a category appears in this set as
# the name of the approach that uses it.
set(
    storage_categories.get(option, name(approach))
    for approach, meta in all_approaches
    for option in meta["storage"]
)

{'Database', 'File', 'Memory'}

In [4]:
# Distinct values found in the "distribution" metadata of all approaches.
{option for _approach, meta in all_approaches for option in meta["distribution"]}

{Version Control System,
 Repository,
 Logic File (Prolog),
 Interoperable Format (PROV),
 Graph File (GraphML, GraphViz),
 Shadow Files,
 Log (Compressed with GZIP),
 Package (Self-Contained),
 Proprietary (VisTrails),
 Source,
 Content Database}

In [5]:
# Coarse category for each distribution group. The groups are listed per
# category in the same order in which the lookup table is populated.
distribution_categories = {
    group: category
    for category, members in (
        ('Memory', (MEMORY,)),
        ('File', (CONTENT_DATABASE, LOG, INTEROPERABLE, LOGIC_FILE,
                  GRAPH_FILE, PROPRIETARY, SOURCE)),

        ('Repository', (VCS, REPOSITORY)),

        # Binary?
        ('File', (FILE_SYSTEM, PACKAGE, SHADOW_FILES)),
    )
    for group in members
}
# Coverage check: a distribution value without a category appears in this set
# as the name of the approach that uses it.
set(
    distribution_categories.get(option, name(approach))
    for approach, meta in all_approaches
    for option in meta["distribution"]
)

{'File', 'Repository'}

In [6]:
from database.groups import REPRODUCIBILITY
# Names of the approaches whose "supports" metadata contains REPRODUCIBILITY.
{
    name(approach)
    for approach, meta in all_approaches
    if REPRODUCIBILITY in meta["supports"]
}

{'Arnold',
 'Astro-WISE',
 'Becker and Chambers',
 'Burrito',
 'CARE',
 'CDE',
 'Datatrack',
 'Lancet',
 'Magni',
 'Michaelides et al.',
 'PTU',
 'ReproZip',
 'Sacred',
 'SisGExp',
 'Sumatra',
 'VCR',
 'Variolite',
 'YW*NW',
 'noWorkflow',
 'versuchung'}

In [7]:
# Distinct values found in the "evolution" metadata of all approaches.
{meta["evolution"] for _approach, meta in all_approaches}

{Trial Sequence, Trial Identification, Uses VCS, ✗}

In [8]:
from database.groups import TRIAL_ID, SEQUENCE, INTENTION, NO


# Short label for each evolution group; NO is mapped to itself.
evolution_categories = dict((
    (TRIAL_ID, "Trial ID"),
    (SEQUENCE, "Sequence"),
    (INTENTION, "Intention"),
    (NO, NO),
))
# Coverage check: an evolution value without a label appears in this set as
# the name of the approach that uses it.
set(
    evolution_categories.get(meta["evolution"], name(approach))
    for approach, meta in all_approaches
)

{'Sequence', 'Trial ID', ✗, 'Intention'}

In [9]:
# Read the bibliography and keep only the text after the last "%Entries"
# marker (the whole file when the marker is absent).
with open('../../csur/bibliography.bib') as bibtex_file:
    bibtex_str = bibtex_file.read()
entries_text = bibtex_str.rpartition("%Entries")[2]

matched = match_bibtex_to_work(entries_text)
# Swap each matched pair so that its second item becomes the dictionary key.
works = {second: first for first, second in matched}
# Pre-bind the lookup table so the helpers can be called with an approach only.
latex_name = partial(wlatex_name, works=works)
citea = partial(wcitea, works=works)

In [10]:
from database.groups import YES, NO

def select(element, field):
    """Report whether ``element`` occurs in ``field``.

    Returns NO when ``field.index`` cannot find ``element``. Otherwise the
    matching item stored in ``field`` is returned when it has a non-None
    ``_star`` attribute, and YES is returned in every other case.
    """
    try:
        found = field[field.index(element)]
        starred = getattr(found, "_star", None) is not None
        return found if starred else YES
    except ValueError:
        return NO
    
def find(elements, abrevs, field):
    """Yield one display value for each of ``elements`` that occurs in ``field``.

    ``elements`` and ``abrevs`` are walked in parallel. For every element that
    ``field.index`` can locate, exactly one value is produced:

    * the matching item stored in ``field`` when that item has a non-None
      ``_star`` attribute, or when no abbreviation was supplied
      (``abrev is None``);
    * the abbreviation otherwise.

    Elements that are not present in ``field`` are skipped.
    """
    for element, abrev in zip(elements, abrevs):
        try:
            index = field.index(element)
        except ValueError:
            continue
        # Inspect the stored item (as ``select`` does): it is the object that
        # can carry a ``_star`` marker, not the value used for the lookup.
        found = field[index]
        starred = getattr(found, "_star", None) is not None
        if starred or abrev is None:
            yield found
        else:
            # Only fall through to the abbreviation when the item itself was
            # not emitted; previously both were yielded, including ``None``.
            yield abrev

def _section_frame(header, rows):
    """Build one table section indexed by its "Approach" column.

    ``header`` is prepended as the first record, so the returned frame carries
    its own sub-header row (whose index value is "Approach"). Every cell of
    ``rows`` is converted with ``str``.
    """
    records = [header] + [list(map(str, elements)) for elements in rows]
    frame = pd.DataFrame.from_records(records, columns=header)
    return frame.set_index("Approach")


def _presence_rows(mode, field, categories, labels):
    """Return, per approach, its name followed by one ``select`` result per label.

    The values of ``meta[field]`` are translated through ``categories`` once
    per approach and the translated list is then tested against each label.
    """
    rows = []
    for approach, meta in mode:
        kinds = [categories.get(item) for item in meta[field]]
        rows.append([name(approach)] + [select(label, kinds) for label in labels])
    return rows


dfs = {}

# The context manager saves and closes the workbook when the block exits;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('output/management.xlsx') as writer:
    for mode_name, mode in [("script", script), ("binary", binary)]:

        name_header = ["Approach", "Latex Name", "Artifacts"]
        name_extra = ["Approach"] + [""] * 2
        name_data = [
            [
                name(approach),
                latex_name(approach),
                string_to_latex(meta['management_text'])
            ]
            for approach, meta in mode
        ]

        storage_header = ["Approach", "Database", "Memory", "File"]
        storage_extra = ["Approach"] + ["Storage"] * 3
        storage_data = _presence_rows(
            mode, "storage", storage_categories, storage_header[1:]
        )

        distribution_header = ["Approach", "File", "Repository"]
        distribution_extra = ["Approach"] + ["Distribution"] * 2
        distribution_data = _presence_rows(
            mode, "distribution", distribution_categories, distribution_header[1:]
        )

        versioning_header = ["Approach", ""]
        versioning_extra = ["Approach"] + ["Versioning"] * 1
        versioning_data = [
            [
                name(approach),
                evolution_categories.get(meta["evolution"]),
            ]
            for approach, meta in mode
        ]

        names = _section_frame(name_header, name_data)
        storage = _section_frame(storage_header, storage_data)
        distribution = _section_frame(distribution_header, distribution_data)
        versioning = _section_frame(versioning_header, versioning_data)

        df = pd.concat([names, storage, distribution, versioning], axis=1)
        df.index.name = None
        # Record the position of "Latex Name" before the labels are replaced.
        # After the relabelling below no column is called "Latex Name", so
        # dropping it by label cannot work (silent no-op or KeyError depending
        # on the pandas version). The boolean mask is positional and also
        # copes with the duplicated group labels.
        keep_for_excel = [column != "Latex Name" for column in df.columns]
        df.columns = (
            name_extra[1:] + storage_extra[1:]
            + distribution_extra[1:] + versioning_extra[1:]
        )
        # The spreadsheet omits the LaTeX names; the frame kept in ``dfs``
        # retains them.
        edf = df.loc[:, keep_for_excel]
        edf.to_excel(writer, sheet_name=mode_name)
        dfs[mode_name] = df

In [11]:
# Display the script-mode table. Its first row repeats the per-column
# sub-headers, because each section's header list is inserted as a record.
dfs["script"]

Unnamed: 0,Unnamed: 1,Unnamed: 2,Storage,Storage.1,Storage.2,Distribution,Distribution.1,Versioning
Approach,Latex Name,Artifacts,Database,Memory,File,File,Repository,
Astro-WISE,Astro-WISE,Oracle,✓,✗,✗,✗,✗,Sequence
Becker and Chambers,\citet{becker1988a},"Proprietary, Source",✗,✗,✓,✓,✗,✗
"Bochner, Gude, and Schreiber",\citet{bochner2008a},XML Server,✓,✗,✗,✗,✗,✗
CPL,CPL,"MySQL, PostgreSQL, 4store",✓,✗,✗,✗,✗,Trial ID
CXXR,CXXR,Memory,✗,✓,✗,✗,✗,✗
Datatrack,Datatrack,"VCS, Proprietary (CSV)",✗,✗,✓,✓,✓,Intention
ES3,ES3,"XML Server, GraphML, Graphviz",✓,✗,✗,✓,✗,✗
ESSW,ESSW,"MySQL, Content DB, Graphviz",✓,✗,✓,✓,✗,Trial ID
IncPy,IncPy,Content DB,✗,✗,✓,✓,✗,✗


In [12]:
# LaTeX preamble of the generated longtable: caption and label, the column
# headings for the first page (up to \endfirsthead), the same headings for
# continuation pages (up to \endhead), a "Continued on next page" foot and an
# empty last foot. Kept as a raw string so the backslashes reach LaTeX as-is.
header = r"""\begin{longtable}[!htbp]{|N{0.25\textwidth}|L{0.35\textwidth}|T?T?T|T?T|L{0.12\textwidth}|}
  \caption{Provenance management classification.}
  \label{tab:script-management}
  \vspace{-4mm}
  \\
  \hline
  \rowcolor{BlueRow}
  \multicolumn{1}{|c|}{\textbf{Approach}} & \multicolumn{1}{c|}{\textbf{Artifacts}} & \multicolumn{3}{c|}{\textbf{Storage}} & \multicolumn{2}{c|}{\textbf{Dist.}} & \multicolumn{1}{c|}{\textbf{Versioning}}\\
  \hline
  
  & & \rot{\textbf{Database}} & \rot{\textbf{Memory}} & \rot{\textbf{File}} & \rot{\textbf{File}} & \rot{\textbf{Repository}} & \\
  \hline
  \endfirsthead
  \hline
  \rowcolor{BlueRow}
  \multicolumn{1}{|c|}{\textbf{Approach}} & \multicolumn{1}{c|}{\textbf{Artifacts}} & \multicolumn{3}{c|}{\textbf{Storage}} & \multicolumn{2}{c|}{\textbf{Dist.}} & \multicolumn{1}{c|}{\textbf{Versioning}}\\
  \hline
  
  & & \rot{\textbf{Database}} & \rot{\textbf{Memory}} & \rot{\textbf{File}} & \rot{\textbf{File}} & \rot{\textbf{Repository}} & \\
  \hline
  \endhead
  \hline \multicolumn{8}{|r|}{{Continued on next page}} \\ \hline
  \endfoot
  \endlastfoot
"""
# Closes the longtable environment opened by ``header``.
footer = r"""
\end{longtable}
"""


In [13]:
def condense(row, initial, size, func=None):
    r"""Render ``size`` consecutive cells of a table row as LaTeX columns.

    Parameters
    ----------
    row : pair
        An ``(index, values)`` pair such as those produced by
        ``DataFrame.iterrows()``; only ``row[1]`` is read and it is sliced
        with ``[initial:initial + size]``.
    initial : int
        Position of the first cell to render.
    size : int
        Number of cells to render.
    func : callable, optional
        Called as ``func(cell, position, size - 1)`` for every cell, after the
        check/cross marks have been replaced by ``\true`` / ``\false``. The
        default maps ``\false`` to ``\false`` and anything else to ``\true``.

    Returns
    -------
    str
        The rendered cells joined with `` & ``. When exactly one cell is
        rendered it is wrapped in ``\multicolumn{size}{c|}{...}``.
    """
    if func is None:
        def func(x, i, last):
            return r"\true" if x != r"\false" else r"\false"
    CONVERT = {
        "✗": r"\false",
        "✓": r"\true",
    }
    cells = row[1][initial:initial + size]
    result = [func(CONVERT.get(x, x), i, size - 1) for i, x in enumerate(cells)]
    if len(result) == 1:
        # Raw string: "\m" is not a valid escape sequence in a normal literal.
        result = [r"\multicolumn{{{}}}{{c|}}{{{}}}".format(size, result[0])]
    return " & ".join(result)

In [14]:
def versioning_func(x, i, last):
    r"""Format one versioning cell for the LaTeX table.

    ``\false`` and ``\true`` are returned unchanged. Any other text is split
    on commas, each piece is stripped, and the pieces are joined with single
    spaces inside ``\textcolor{GreenMark}{...}``. ``i`` and ``last`` are
    accepted for the ``condense`` callback signature but are not used.
    """
    if x in (r"\false", r"\true"):
        return x
    pieces = [piece.strip() for piece in x.split(',')]
    return "\\textcolor{{GreenMark}}{{{}}}".format(' '.join(pieces))

def storage(row):
    """Render the three cells at positions 2-4 of ``row`` (the storage group)."""
    return condense(row, 2, 3)


def distribution(row):
    """Render the two cells at positions 5-6 of ``row`` (the distribution group)."""
    return condense(row, 5, 2)


def versioning(row):
    """Render the cell at position 7 of ``row`` using ``versioning_func``."""
    return condense(row, 7, 1, func=versioning_func)


In [15]:
# Build the LaTeX body: one line per approach, with every other row shaded.
iterrows = dfs['script'].iterrows()
# Skip the first row: it holds the per-column sub-headers, not an approach.
next(iterrows)
result = []
for i, row in enumerate(iterrows):
    if i % 2 == 0:
        result.append(r"  \rowcolor{BlueRow}")
    cells = row[1]
    # Explicit positional access: the Series is labelled with strings, and
    # integer keys on such a Series are deprecated as positional lookups.
    result.append("  " + " & ".join([
        cells.iloc[0], cells.iloc[1], storage(row), distribution(row), versioning(row)
    ]) + r"\\")
# Raw string: "\h" is not a valid escape sequence in a normal literal.
result.append(r"  \hline")

In [16]:
# Assemble the complete longtable and write it next to the paper's figures.
latex = header + "\n" + "\n".join(result) + "\n" + footer
# An explicit encoding keeps the generated file identical across platforms
# instead of depending on the locale's default.
with open("../../csur/figs/script_management.tex", "w", encoding="utf-8") as tex_file:
    tex_file.write(latex)