In [1]:
import os, sys, re
from functools import partial

import pandas as pd
import numpy as np

from bibtexparser.latexenc import string_to_latex

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import database
from snowballing.operations import load_work_map_all_years, work_to_bibtex, reload
from snowballing.operations import match_bibtex_to_work
from snowballing.approaches import get_approaches, name, wlatex_name, wcitea
reload()

all_approaches = get_approaches()
script = [(a, m) for a, m in all_approaches if not m["binary"]]
binary = [(a, m) for a, m in all_approaches if m["binary"]]
len(script)

27

In [2]:
with open('../../csur/bibliography.bib') as bibtex_file:
    bibtex_str = bibtex_file.read()

matched = match_bibtex_to_work(bibtex_str.split("%Entries")[-1])
works = dict(map(reversed, matched))
latex_name = partial(wlatex_name, works=works)
citea = partial(wcitea, works=works)

In [3]:
from database.groups import YES, NO, PROCESS_VIEW, DATA_VIEW, COMBINED_VIEW, LOG_VIEW
from database.groups import CLUSTERING, FILTERING, DATA, PROVENANCE, INTEROPERABLE, PROPRIETARY
from database.groups import INTERNAL, EXTERNAL

def select(element, field):
    try:
        index = field.index(element)
        element = field[index]
        if hasattr(element, "_star") and element._star is not None:
            return element
        return YES
    except ValueError:
        return NO
    
def find(elements, abrevs, field):
    for element, abrev in zip(elements, abrevs):
        try:
            index = field.index(element)
            if (hasattr(element, "_star") and element._star is not None) or abrev is None:
                yield element
            yield abrev
        except ValueError:
            pass

writer = pd.ExcelWriter('output/management.xlsx')
dfs = {}

for mode_name, mode in [("script", script), ("binary", binary)]:

    name_header = ["Approach", "Latex Name"]
    name_extra = ["Approach"] + [""] * 1
    name_data = [
        [
            name(approach),
            latex_name(approach),
        ]
        for approach, meta in mode
    ]
    
    query_header = ["Approach", "Generic", "Specific"]
    query_extra = ["Approach"] + ["Query"] * 2
    query_data = [
        [
            name(approach),
            string_to_latex(meta['generic_query_text']),
            string_to_latex(meta['specific_query_text']),
        ]
        for approach, meta in mode
    ]
    
    visplace_header = ["Approach", "Internal", "External"]
    visplace_extra = ["Approach"] + ["Vis. Place"] * 2
    visplace_data = [
        [
            name(approach),
            select(INTERNAL, meta["visplace"]),
            select(EXTERNAL, meta["visplace"]),
        ]
        for approach, meta in mode
    ]
    
    visualization_header = ["Approach", "Process", "Data", "Combined", "Proprietary"]
    visualization_extra = ["Approach"] + ["Visualization"] * 4
    visualization_data = [
        [
            name(approach),
            select(LOG_VIEW, meta["visualization"]),
            select(PROCESS_VIEW, meta["visualization"]),
            select(DATA_VIEW, meta["visualization"]),
            select(COMBINED_VIEW, meta["visualization"]),
        ]
        for approach, meta in mode
    ]
    
    summarization_header = ["Approach", "Clustering", "Filtering"]
    summarization_extra = ["Approach"] + ["Summ."] * 2
    summarization_data = [
        [
            name(approach),
            select(CLUSTERING, meta["summarization"]),
            select(FILTERING, meta["summarization"]),
        ]
        for approach, meta in mode
    ]
    
    diff_header = ["Approach", "Data", "Provenance"]
    diff_extra = ["Approach"] + ["Diff"] * 2
    diff_data = [
        [
            name(approach),
            select(DATA, meta["diff"]),
            select(PROVENANCE, meta["diff"]),
        ]
        for approach, meta in mode
    ]
   
    
    names = pd.DataFrame.from_records(
        [name_header] +
        [list(map(str, elements)) for elements in name_data],
        columns=name_header,
    )
    names = names.set_index("Approach")

    query = pd.DataFrame.from_records(
        [query_header] +
        [list(map(str, elements)) for elements in query_data],
        columns=query_header,
    )
    query = query.set_index("Approach")

    visplace = pd.DataFrame.from_records(
        [visplace_header] +
        [list(map(str, elements)) for elements in visplace_data],
        columns=visplace_header,
    )
    visplace = visplace.set_index("Approach")
    
    visualization = pd.DataFrame.from_records(
        [visualization_header] +
        [list(map(str, elements)) for elements in visualization_data],
        columns=visualization_header,
    )
    visualization = visualization.set_index("Approach")

    summarization = pd.DataFrame.from_records(
        [summarization_header] +
        [list(map(str, elements)) for elements in summarization_data],
        columns=summarization_header,
    )
    summarization = summarization.set_index("Approach")
    
    diff = pd.DataFrame.from_records(
        [diff_header] +
        [list(map(str, elements)) for elements in diff_data],
        columns=diff_header,
    )
    diff = diff.set_index("Approach")

    df = pd.concat([names, query, visplace, visualization, summarization, diff], axis=1)
    df.index.name = None
    df.columns = name_extra[1:] + query_extra[1:] + visplace_extra[1:] + visualization_extra[1:] + summarization_extra[1:] + diff_extra[1:] 
    edf = df.drop('Latex Name', 1)
    edf.to_excel(writer,mode_name)
    dfs[mode_name] = df

    
writer.save()

In [4]:
dfs["script"]

Unnamed: 0,Unnamed: 1,Query,Query.1,Vis. Place,Vis. Place.1,Visualization,Visualization.1,Visualization.2,Visualization.3,Summ.,Summ..1,Diff,Diff.1
Approach,Latex Name,Generic,Specific,Internal,External,Process,Data,Combined,Proprietary,Clustering,Filtering,Data,Provenance
Astro-WISE,Astro-WISE,SQL,"Functions, Web",✓,✗,✗,✗,Tree,✗,✗,✗,✗,✓
Becker and Chambers,\citet{becker1988a},,Functions,✓,✗,✗,✓,✗,✗,✗,✗,✗,✗
"Bochner, Gude, and Schreiber",\citet{bochner2008a},"XQuery, XPath",Web,✗,✗,✗,✗,✗,✗,✗,✗,✗,✗
CPL,CPL,"SPARQL, SQL",Functions,✗,✗,✗,✗,✗,✗,✗,✗,✗,✗
CXXR,CXXR,,Functions,✗,✗,✗,✗,✗,✗,✗,✗,✗,✗
Datatrack,Datatrack,,,✓,✗,✗,✗,✓,✗,✓,✗,VCS,✗
ES3,ES3,XQuery,,✓,✗,✗,✗,✗,✓,✗,✗,✗,✗
ESSW,ESSW,SQL,Web,✓,✗,✗,✗,✗,✓,✗,✗,✗,✗
IncPy,IncPy,,,✗,✗,✗,✗,✗,✗,✗,✗,cache,✓


In [5]:
header = r"""\begin{longtable}[!htbp]{|N{0.25\textwidth}|N{0.15\textwidth}N{0.15\textwidth}|T?T|T?T?T?T|T?T|T?T|}
  \caption{Provenance analysis classification, based on Query, Visualization, and Diff. }
  \label{tab:script-analysis}
  \vspace{-4mm}
  \\
  \hline
  \rowcolor{BlueRow}
  \multicolumn{1}{|c|}{\textbf{Approach}} & \multicolumn{2}{c|}{\textbf{Query}} & \multicolumn{8}{c|}{\textbf{Visualization}} & \multicolumn{2}{c|}{\textbf{Diff}}\\
  \hline
  & & & \multicolumn{2}{c|}{\textbf{Place}} & \multicolumn{4}{c|}{\textbf{Type}} & \multicolumn{2}{c|}{\textbf{Sum.}} & & \\
  \cline{4-11}
  & \multicolumn{1}{c}{\textbf{Generic}} & \multicolumn{1}{c|}{\textbf{Specific}}  & \rot{\textbf{Internal}} & \rot{\textbf{External}} & \rot{\textbf{Log}} & \rot{\textbf{Process}} & \rot{\textbf{Data}} & \rot{\textbf{Combined}}& \rot{\textbf{Clustering}} & \rot{\textbf{Filtering}} & \rot{\textbf{Data}} & \rot{\textbf{Provenance}}\\ 
  \hline
  \endfirsthead
  \hline
  \rowcolor{BlueRow}
  \multicolumn{1}{|c|}{\textbf{Approach}} & \multicolumn{2}{c|}{\textbf{Query}} & \multicolumn{8}{c|}{\textbf{Visualization}} & \multicolumn{2}{c|}{\textbf{Diff}}\\
  \hline
  & & & \multicolumn{2}{c|}{\textbf{Place}} & \multicolumn{4}{c|}{\textbf{Type}} & \multicolumn{2}{c|}{\textbf{Sum.}} & & \\
  \cline{4-11}
  & \multicolumn{1}{c}{\textbf{Generic}} & \multicolumn{1}{c|}{\textbf{Specific}}  & \rot{\textbf{Internal}} & \rot{\textbf{External}} & \rot{\textbf{Log}} & \rot{\textbf{Process}} & \rot{\textbf{Data}} & \rot{\textbf{Combined}}& \rot{\textbf{Clustering}} & \rot{\textbf{Filtering}} & \rot{\textbf{Data}} & \rot{\textbf{Provenance}}\\
  \hline
  \endhead
  \hline \multicolumn{13}{|r|}{{Continued on next page}} \\ \hline
  \endfoot
  \endlastfoot
"""
footer = r"""
\end{longtable}
"""


In [6]:
def condense(row, initial, size, func=None, center=False, multicolumn=None):
    if func is None:
        func = lambda x, i, last: r"\true" if x != r"\false" and x else r"\false"
    if multicolumn is None:
        multicolumn = lambda size, value, sep: "\multicolumn{{{0}}}{{c{2}}}{{{1}}}".format(size, value, sep)
    CONVERT = {
        "✗": r"\false",
        "✓": r"\true",
    }
    result = [func(CONVERT.get(x, x), i, size - 1) for i, x in enumerate(row[1][initial:initial+size])]
    #if len(set(result)) == 1:
    if len(result) == 1:
        result = [multicolumn(size, result[0], '|')]
    elif center:
        result = [
            multicolumn(1, x, '|' if i == len(result) - 1 else '')
            for i, x in enumerate(result)
        ]
    return " & ".join(result)

In [7]:
def query_func(x, i, last):
    if not x:
        return r"\false"
    if x == r"\false":
        return x
    if x == r"\true":
        return x
    return "\\textcolor{{GreenMark}}{{{}}}".format(x)

def query_multicolumn(size, value, sep):
    return value

query = lambda row: condense(row, 1, 2, func=query_func, center=True, multicolumn=query_multicolumn)
visplace = lambda row: condense(row, 3, 2)
vis = lambda row: condense(row, 5, 4)
summarization = lambda row: condense(row, 9, 2)
diff = lambda row: condense(row, 11, 2)


In [8]:
iterrows = dfs['script'].iterrows()
next(iterrows)
result = []
for i, row in enumerate(iterrows):
    if i % 2 == 0:
        result.append(r"  \rowcolor{BlueRow}")
    result.append("  " + " & ".join([
        row[1][0], query(row), visplace(row), vis(row), summarization(row), diff(row)
    ]) + r"\\")
    #result.append(r"  \arrayrulecolor{LightBlueLine}\hline")
result.append("  \hline")

In [9]:
latex = header + "\n" + "\n".join(result) + "\n" + footer
with open("../../csur/figs/script_analysis.tex", "w") as goals:
    goals.write(latex)