In [1]:
import os, sys, re
from functools import partial

import pandas as pd
import numpy as np

from bibtexparser.latexenc import string_to_latex

sys.path.insert(1, os.path.join(sys.path[0], '..'))

import database
from snowballing.operations import load_work_map_all_years, work_to_bibtex, reload
from snowballing.operations import match_bibtex_to_work
from snowballing.approaches import get_approaches, name, wlatex_name, wcitea
reload()

# Fetch every (approach, metadata) pair and split the pairs on the
# metadata's "binary" flag.
all_approaches = get_approaches()
# Pairs whose metadata has a falsy "binary" entry.
script = [(a, m) for a, m in all_approaches if not m["binary"]]
# Pairs whose metadata has a truthy "binary" entry.
binary = [(a, m) for a, m in all_approaches if m["binary"]]
# NOTE(review): all_approaches is iterated twice, so get_approaches() is
# assumed to return a re-iterable collection rather than a one-shot
# generator — confirm.
# Cell output: number of non-binary approaches.
len(script)

27

In [2]:
# Read the whole BibTeX bibliography into memory.
with open('../../csur/bibliography.bib') as bibtex_file:
    bibtex_str = bibtex_file.read()

# Only the text after the last "%Entries" marker is matched against the
# works (the whole file when the marker is absent).
matched = match_bibtex_to_work(bibtex_str.rpartition("%Entries")[2])
# Swap each matched pair so that its second item becomes the lookup key.
works = {second: first for first, second in matched}
# Pre-bind the mapping so later calls only pass the remaining arguments.
citea = partial(wcitea, works=works)
latex_name = partial(wlatex_name, works=works)

In [3]:
# Plain-text e-mail asking the authors of an approach to confirm how the
# survey classified it. The text is rendered with TEMPLATE.format(**data),
# so every {placeholder} below must be a key of that dict. The "({key})"
# slots are the taxonomy check boxes, which load_data fills with either
# 'x' or blanks.
TEMPLATE = """
TITLE: {tname} – Could you check if the following information is correct?
TO: {emails}
CC: Juliana.freire@nyu.edu; leomurta@ic.uff.br; vanessa@ic.uff.br

Dear {authors},

We are currently working on a survey on provenance in scripts. In the survey, we propose a taxonomy for the 
state-of-the-art approaches in this research area and we characterize the approaches accordingly. We would 
also like to include {name} in this survey{middle}

To avoid any misunderstandings, we would kindly ask you to check and briefly comment on the following list 
of features. We got these features based on {papers} but want to make sure we got everything
correctly. We would highly appreciate if you could check our classification and make corrections when needed.
Please send your answer before {date}. This would allow us to include your comments in our paper.

Please, note that this is not a competition and we do not intend to rank the included systems.

Collection
        ({annotations}) Annotations
                Placement: ({internal}) Internal; ({external}) External
                Extraction: ({extract_parseable}) Parseable; ({extract_execution}) Execution 
                Inclusiveness: ({inclusive}) Inclusive; ({exclusive}) Exclusive
                Target: ({target_definition}) Definition; ({target_provenance}) Provenance
                Necessity: ({optional}) Optional; ({mandatory}) Mandatory
        ({execution}) Execution 
                ({passive_monitoring}) Passive Monitoring
                ({overriding}) Overriding
                ({post_mortem}) Post-Mortem
                ({instrumentation}) Instrumentation
        ({deployment}) Deployment 
                ({before}) Before
                ({during}) During
        ({definition}) Definition 
                How: ({reading}) Reading; ({parsing}) Parsing
                When: ({static}) Static; ({dynamic}) Dynamic
Management
        ({storage}) Storage:
                ({database}) Database. Specify: {database_specify}
                ({memory}) Transient Memory
                ({file}) File. Specify: {file_specify}
        ({distribution}) Distribution
                ({file_dist}) File. Specify: {file_dist_specify}
                ({repository}) Repository
        ({reproducibility}) Reproducibility
        ({versioning}) Versioning
                ({trialid}) Trial ID
                ({sequence}) Sequence
                ({intention}) Intention
Analysis
        ({query}) Query
                ({generic}) Generic. Specify: {generic_specify}
                ({specific}) Specific. Specify: {specific_specify}
        ({visualization}) Visualization
                Place: ({place_internal}) Internal; ({place_external}) External
                Type: ({log}) Log; ({data}) Data; ({process}) Process; ({combined}) Combined
                Completeness: ({complete}) No Summarization; ({clustering}) Clustering; ({filtering}) Filtering
        ({diff}) Comparison:
                ({diff_data}) Data
                ({diff_provenance}) Provenance

A glossary of the taxonomy describing each one of these categories can be found at https://joaofelipe.github.io/pins

Many thanks in advance!

All the best,
João Felipe Pimentel

{end}
"""

In [4]:
from database.groups import REPRODUCIBILITY
from database.groups import INTERNAL, EXTERNAL, PARSEABLE, EXECUTABLE, INCLUSIVE, EXCLUSIVE
from database.groups import MANDATORY, OPTIONAL, DEFINITION, PROVENANCE
from database.groups import PASSIVE_MONITORING, OVERRIDING, POST_MORTEM, INSTRUMENTATION
from database.groups import BEFORE_EXECUTION, DURING_EXECUTION
from database.groups import READING, PARSING, STATIC, DYNAMIC, ASKS
from database.groups import YES, NO
from database.groups import NOSQL, SHADOW_FILES, LOG, GRAPH_FILE, RELATIONAL_DB, FILE, WEB
from database.groups import FILE_SYSTEM, PACKAGE, LOGIC_FILE, INTEROPERABLE, PROPRIETARY
from database.groups import MEMORY, SOURCE, KEY_VALUE_DB, REPOSITORY, GRAPH_DB, VCS, CONTENT_DATABASE
from database.groups import TRIAL_ID, SEQUENCE, INTENTION, NO
from database.groups import YES, NO, PROCESS_VIEW, DATA_VIEW, COMBINED_VIEW, LOG_VIEW
from database.groups import CLUSTERING, FILTERING, DATA, PROVENANCE, INTEROPERABLE, PROPRIETARY
from database.groups import INTERNAL, EXTERNAL


# Maps each fine-grained storage kind to the coarse label ('Database',
# 'Memory' or 'File') that the storage check boxes are looked up by.
storage_categories = {
    GRAPH_DB: 'Database',
    RELATIONAL_DB: 'Database',
    NOSQL: 'Database',
    MEMORY: 'Memory',
    CONTENT_DATABASE: 'File',
    LOG: 'File',
    INTEROPERABLE: 'File',
    LOGIC_FILE: 'File',
    GRAPH_FILE: 'File',
    PROPRIETARY: 'File',

    # NOTE(review): originally marked "Binary?" — presumably these kinds
    # only occur for binary (non-script) approaches; confirm.
    KEY_VALUE_DB: 'Database',
    FILE_SYSTEM: 'File',
    PACKAGE: 'File',
    SHADOW_FILES: 'File',
}

# Maps each fine-grained distribution kind to a coarse label. The only
# labels produced are 'Memory', 'File' and 'Remote'.
distribution_categories = {
    MEMORY: 'Memory',
    CONTENT_DATABASE: 'File',
    LOG: 'File',
    INTEROPERABLE: 'File',
    LOGIC_FILE: 'File',
    GRAPH_FILE: 'File',
    PROPRIETARY: 'File',
    SOURCE: 'File',
    
    # Kinds that are shared through a remote location.
    VCS: 'Remote',
    REPOSITORY: 'Remote',
    WEB: 'Remote',

    
    # NOTE(review): originally marked "Binary?" — presumably these kinds
    # only occur for binary (non-script) approaches; confirm.
    FILE_SYSTEM: 'File',
    PACKAGE: 'File',
    SHADOW_FILES: 'File',  
}


def select(element, field):
    """Return the check-box mark for ``element``.

    Args:
        element: Value to look for.
        field: Sequence that is searched with ``field.index(element)``.

    Returns:
        str: ``"x"`` when ``element`` occurs in ``field``, otherwise two
        spaces (an empty check box).
    """
    try:
        # Only membership matters; the position itself is not needed.
        field.index(element)
    except ValueError:
        return "  "
    return "x"
    
def select_rev(element, categories, field):
    """Yield the values of ``field`` that belong to category ``element``.

    Args:
        element: Category label to match (e.g. ``"File"``).
        categories: Mapping from a value to its category label.
        field: Iterable of values to filter.

    Yields:
        str: ``str(value)`` for every value whose category equals ``element``.
        Values that are missing from ``categories`` are skipped instead of
        raising ``KeyError``, which mirrors the tolerant ``categories.get``
        lookups used to tick the corresponding check boxes.
    """
    for value in field:
        if value in categories and categories[value] == element:
            yield str(value)
    
def find(elements, abrevs, field):
    """Yield the abbreviations of the elements that occur in ``field``.

    ``elements`` and ``abrevs`` are walked in parallel. Elements that are
    not in ``field`` are skipped. For each remaining pair, the element
    itself is yielded first when it has a non-None ``_star`` attribute or
    when its abbreviation is None; the abbreviation is then yielded.

    NOTE(review): the abbreviation is yielded even when it is None, so a
    None can appear in the output — confirm this is intended.
    """
    for candidate, short in zip(elements, abrevs):
        try:
            field.index(candidate)
        except ValueError:
            # Not present in field: nothing to emit for this pair.
            continue
        starred = getattr(candidate, "_star", None) is not None
        if starred or short is None:
            yield candidate
        yield short


def write_authors(authors):
    """Join author names into an English enumeration.

    Up to two names are joined with " and "; three or more are separated by
    commas, with ", and " before the last name. Any sized iterable of
    strings is accepted (a dict contributes its keys).
    """
    if len(authors) > 2:
        *leading, last = authors
        return ", ".join(leading) + ", and " + last
    return " and ".join(authors)

def load_data(approach, meta=None):
    """Build the substitution dict used to render TEMPLATE for one approach.

    Args:
        approach: Approach object. Its ``work``, ``approach_name`` and
            ``emails`` attributes are read, plus the optional ``to``
            attribute (an " and "-separated recipient list that overrides
            the paper authors).
        meta: Metadata mapping of the approach ("annotations", "execution",
            "storage", ...). When omitted, the module-level ``meta``
            variable is used, which is what this function previously read
            implicitly.

    Returns:
        dict: One entry per TEMPLATE placeholder (plus ``refs``). Check-box
        entries are ``'x'`` when ticked and two spaces otherwise.

    Raises:
        NameError: If ``meta`` is omitted and no module-level ``meta`` exists.
    """
    if meta is None:
        # Backward compatibility with callers that rely on the global.
        try:
            meta = globals()["meta"]
        except KeyError:
            raise NameError("name 'meta' is not defined") from None

    def mark(flag):
        # Check-box mark for a plain truthiness test.
        return 'x' if flag else '  '

    work = approach.work
    plural = "s" if len(work) > 1 else ""

    # Numbered reference list, e.g. "[1] Authors. Title. Venue, Year".
    refs = ", ".join(map(str, range(1, len(work) + 1)))
    references = [
        "[{}] {}. {}. {}, {}".format(
            number, w.authors, w.name, w.place.name, w.year
        )
        for number, w in enumerate(work, start=1)
    ]

    end = ""
    middle = "."
    tname = aname = name(approach)
    pname = "the {} paper".format(tname) + plural
    if approach.approach_name == "-":
        # Unnamed approach: refer to it through its papers, listed inline.
        tname = "Provenance in Scripts"
        aname = pname = "your paper" + plural
        aname = "{} [{}]".format(aname, refs)
        middle = ":\n\n{}".format("\n".join(references))
    else:
        end = "References:\n{}".format("\n".join(references))
        pname = "{} [{}]".format(pname, refs)

    if getattr(approach, "to", None):
        raw_authors = approach.to.split(' and ')
    else:
        raw_authors = [
            author for w in work for author in w.authors.split(' and ')
        ]
    # "Last, First" -> "First Last"; the dict drops duplicates while keeping
    # first-seen order.
    authors = {
        " ".join(reversed(author.split(", "))): "" for author in raw_authors
    }

    annotations = meta["annotations"]
    execution = meta["execution"]
    deployment = meta["deployment"]
    definition = meta["definition"]
    storage = meta["storage"]
    distribution = meta["distribution"]
    evolution = meta["evolution"]
    generic_query = meta["generic_query_text"]
    specific_query = meta["specific_query_text"]
    visplace = meta["visplace"]
    visualization = meta["visualization"]
    summarization = meta["summarization"]
    diff = meta["diff"]

    # Coarse labels of the stored / distributed kinds (None when unmapped).
    storage_kinds = [storage_categories.get(kind) for kind in storage]
    distribution_kinds = [
        distribution_categories.get(kind) for kind in distribution
    ]
    has_visualization = visplace or visualization

    return {
        'emails': approach.emails,
        'authors': write_authors(authors),
        'name': aname,
        'tname': tname,
        'papers': pname,
        'refs': refs,
        'end': end,
        'middle': middle,
        'date': 'December 14, 2018',
        # Collection
        ##
        'annotations': mark(annotations),
        ### placement
        'internal': select(INTERNAL, annotations),
        'external': select(EXTERNAL, annotations),
        ### extraction
        'extract_parseable': select(PARSEABLE, annotations),
        'extract_execution': select(EXECUTABLE, annotations),
        ### inclusiveness
        'inclusive': select(INCLUSIVE, annotations),
        'exclusive': select(EXCLUSIVE, annotations),
        ### target
        'target_definition': select(DEFINITION, annotations),
        'target_provenance': select(PROVENANCE, annotations),
        ### necessity
        'optional': select(OPTIONAL, annotations),
        'mandatory': select(MANDATORY, annotations),
        ##
        'execution': mark(execution),
        ###
        'passive_monitoring': select(PASSIVE_MONITORING, execution),
        'overriding': select(OVERRIDING, execution),
        'post_mortem': select(POST_MORTEM, execution),
        'instrumentation': select(INSTRUMENTATION, execution),
        ##
        'deployment': mark(deployment),
        ###
        'before': select(BEFORE_EXECUTION, deployment),
        'during': select(DURING_EXECUTION, deployment),
        ##
        'definition': mark(definition),
        ### how
        'reading': select(READING, definition),
        'parsing': select(PARSING, definition),
        ### when
        'static': select(STATIC, definition),
        'dynamic': select(DYNAMIC, definition),
        # Management
        ##
        'storage': mark(storage),
        ###
        'database': select("Database", storage_kinds),
        'database_specify': ", ".join(select_rev("Database", storage_categories, storage)) or "__________",
        'memory': select("Memory", storage_kinds),
        'file': select("File", storage_kinds),
        'file_specify': ", ".join(select_rev("File", storage_categories, storage)) or "__________",
        ##
        'distribution': mark(distribution),
        ###
        'file_dist': select("File", distribution_kinds),
        'file_dist_specify': ", ".join(select_rev("File", distribution_categories, distribution)) or "__________",
        # distribution_categories labels VCS/REPOSITORY/WEB as 'Remote'; it
        # never yields 'Repository', so looking that label up could never
        # tick this box.
        'repository': select("Remote", distribution_kinds),
        ##
        'reproducibility': select(REPRODUCIBILITY, meta["supports"]),
        ##
        'versioning': mark(evolution != NO),
        ###
        'trialid': mark(evolution == TRIAL_ID),
        'sequence': mark(evolution == SEQUENCE),
        'intention': mark(evolution == INTENTION),
        # Analysis
        ##
        'query': mark(generic_query or specific_query),
        ###
        'generic': mark(generic_query),
        'generic_specify': generic_query or "__________",
        'specific': mark(specific_query),
        'specific_specify': specific_query or "__________",
        ##
        'visualization': mark(has_visualization),
        ### Place
        'place_internal': select(INTERNAL, visplace),
        'place_external': select(EXTERNAL, visplace),
        ### Type
        'log': select(LOG_VIEW, visualization),
        'data': select(DATA_VIEW, visualization),
        'process': select(PROCESS_VIEW, visualization),
        'combined': select(COMBINED_VIEW, visualization),
        ### Completeness
        'complete': mark(has_visualization and not summarization),
        'clustering': select(CLUSTERING, summarization),
        'filtering': select(FILTERING, summarization),
        ##
        'diff': mark(diff),
        ###
        'diff_data': select(DATA, diff),
        'diff_provenance': select(PROVENANCE, diff),
    }




# Render one e-mail per non-binary approach, echo it in the notebook and
# save it under ../../emails.
# NOTE: load_data(approach) reads the module-level `meta`, so the loop
# variable must keep that name.
for i, (approach, meta) in enumerate(script):
    data = load_data(approach)
    # Render once and reuse the text for both the echo and the file.
    email = TEMPLATE.format(**data)
    print(email)
    # "*" in the path is replaced by "-" before the file is opened.
    path = "../../emails/{}-{}.txt".format(i, name(approach)).replace("*", "-")
    # The template contains non-ASCII characters, so do not depend on the
    # platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.write(email)


TITLE: Astro-WISE – Could you check if the following information is correct?
TO: Jmwebaze@gmail.com; d.r.boxhoorn@astro.rug.nl; valentyn@astro.rug.nl
CC: Juliana.freire@nyu.edu; leomurta@ic.uff.br; vanessa@ic.uff.br

Dear Johnson Mwebaze, Danny Boxhoorn, and Edwin Valentijn,

We are currently working on a survey on provenance in scripts. In the survey, we propose a taxonomy for the 
state-of-the-art approaches in this research area and we characterize the approaches accordingly. We would 
also like to include Astro-WISE in this survey.

To avoid any misunderstandings, we would kindly ask you to check and briefly comment on the following list 
of features. We got these features based on the Astro-WISE papers [1, 2] but want to make sure we got everything
correctly. We would highly appreciate if you could check our classification and make corrections when needed.
Please send your answer before December 14, 2018. This would allow us to include your comments in our paper.

Please, note th

In [5]:
# Ad-hoc check that relies on `meta` still being bound from the preceding
# loop: True when the annotations are non-empty but PROVENANCE is not
# among them.
(select(PROVENANCE, meta["annotations"]) == '  ') and bool(meta["annotations"])

True