In [52]:
import json # for reading files
import re # for parsing the library strings
import os
from collections import OrderedDict, Counter

In [2]:
def get_prov_jsons(path_to_datasets):
    """Collect prov for all datasets in the given directory

    Entries of path_to_datasets that have no 'prov_data' subdirectory (stray
    files, datasets without provenance) are skipped, as are '.json' files
    whose contents cannot be parsed as JSON.

    Parameters
    ----------
    path_to_datasets : string 
                       path to the directory containing processed datasets
    Returns
    -------
    prov_jsons : dict of dict
                 dict mapping doi to dict mapping json_file strings to parsed JSON
                 (parsed with object_pairs_hook=OrderedDict). A doi only appears
                 if its prov_data directory holds at least one '.json' file.
    """
    # initialize empty dict to store JSONs, keyed by doi
    prov_jsons = {}

    # iterate through directories and collect JSONs
    for my_doi in os.listdir(path_to_datasets):
        # ignore macOS directory metadata file (if present)
        if my_doi == '.DS_Store':
            continue
        prov_dir = os.path.join(path_to_datasets, my_doi, 'prov_data')
        # not a dataset directory, or a dataset with no provenance collected
        if not os.path.isdir(prov_dir):
            continue
        # require a true '.json' suffix so 'x.json.bak' / 'x.jsonl' are not picked up
        json_files = [my_file for my_file in os.listdir(prov_dir)
                      if my_file.endswith('.json')]
        if not json_files:
            continue
        prov_jsons[my_doi] = {}
        for json_file in json_files:
            with open(os.path.join(prov_dir, json_file), 'r') as infile:
                try:
                    prov_jsons[my_doi][json_file] = json.load(
                        infile, object_pairs_hook=OrderedDict)
                except ValueError:
                    # malformed JSON (decode errors are ValueError subclasses):
                    # best-effort collection, so skip this file only
                    continue
    return prov_jsons

In [49]:
def get_io_from_prov_json(prov_json):
    """Identify input and output files from provenance JSON

    A provenance document that has no 'entity', 'used' or 'wasGeneratedBy'
    section is treated as having no entries of that kind, and entities
    without an 'rdt:type' are treated as non-file entities.

    Parameters
    ----------
    prov_json : OrderedDict
                ordered dictionary generated from json prov file using python's json module
                i.e. json.load(path_to_json_file, object_pairs_hook=OrderedDict)
                
    Returns
    -------
    (input_files, output_files, file_locs) : tuple of (list, list, dict)
                                             input files and output files (empty lists if none) and 
                                             dict mapping files to location (empty if none)
    """
    # initializing data structures
    entity_to_file = {}
    file_locs = {}

    # get file entity names and locations
    for key, value in prov_json.get('entity', {}).items():
        if value.get('rdt:type') == 'File':
            filename = value['rdt:name']
            entity_to_file[key] = filename
            file_locs[filename] = value['rdt:location']

    def files_referenced_by(section):
        # file names of the file entities referenced by the edges of one
        # relation section; a missing section contributes nothing
        return [entity_to_file[edge['prov:entity']]
                for edge in prov_json.get(section, {}).values()
                if edge.get('prov:entity') in entity_to_file]

    # if a file entity is used by an activity, it must be an input file
    input_files = files_referenced_by('used')

    # if a file entity was generated by an activity, it must be an output file
    output_files = files_referenced_by('wasGeneratedBy')

    return input_files, output_files, file_locs

In [43]:
def get_pkgs_from_prov_json(prov_json):
    """Identify packages used from provenance JSON

    A package counts as used when an activity's 'rdt:name' (the recorded line
    of code) starts with a library(...) or require(...) call. The package name
    may be quoted or unquoted, and may be passed positionally or as package=.
    Only names that also appear in the recorded installed-package list are
    returned, which is where the version comes from.

    Parameters
    ----------
    prov_json : OrderedDict
                ordered dictionary generated from json prov file using python's json module
                i.e. json.load(path_to_json_file, object_pairs_hook=OrderedDict)
                
    Returns
    -------
    packages : list of tuple of (string, string)
               list of (package_name, version) tuples, in the order of the
               installed-package list (empty if nothing matched or if the
               provenance records no installed packages)
    """
    # leading library(...) / require(...) call; the name may follow 'package ='
    # and may or may not be quoted. The name class excludes quotes so the
    # capture stops at the end of the name, not at a later quote on the line.
    package_regex = re.compile(
        r"^\s*(?:library|require)\s*\("
        r"(?:.*?package\s*=\s*|\s*)"
        r"[\"']?([A-Za-z][A-Za-z0-9.]*)[\"']?"
    )

    activities = prov_json.get('activity', {})

    # set of used libraries
    used_packages = set()

    # Identify libraries being used in script and add them to set
    for command in activities.values():
        code_line = command.get('rdt:name') or ''
        package_match = package_regex.match(code_line)
        # if a package name was found, add to the set
        if package_match:
            used_packages.add(package_match.group(1))

    # filter out pre-installed packages
    used_packages -= set(['datasets', 'utils', 'graphics', 'grDevices', 
                          'methods', 'stats', 'provR', 'devtools'])

    # Filter packages in user's environment by which ones were used
    installed = activities.get('environment', {}).get('rdt:installedPackages', [])
    return [(package_dict["package"], package_dict["version"])
            for package_dict in installed
            if package_dict["package"] in used_packages]

In [3]:
# collect the parsed provenance JSONs for every dataset directory under "rdata_odyc"
prov_jsons = get_prov_jsons("rdata_odyc")

In [7]:
# location of a single provenance JSON, loaded below as a standalone example
messycode_path = "encap_ex/messycode_prov/messycode.json"

In [8]:
# read in JSON file; OrderedDict keeps each object's keys in the order they appear in the file
with open(messycode_path, 'r') as infile:
    messycode_json = json.load(infile, object_pairs_hook=OrderedDict)

In [61]:
# show the (package, version) pairs detected in the example provenance
get_pkgs_from_prov_json(messycode_json)

[(u'gdata', u'2.18.0'), (u'txtplot', u'1.0-3'), (u'vegan', u'2.4-4')]

In [60]:
# count, for each package, how many provenance files use it;
# the Counter is created here so the cell works on a fresh kernel and
# re-running it does not double-count
package_usages = Counter()
for json_dict in prov_jsons.values():
    for prov_json in json_dict.values():
        for package, _ in get_pkgs_from_prov_json(prov_json):
            package_usages[package] += 1
        # result is discarded; the call only checks that input/output
        # extraction runs on every provenance file
        get_io_from_prov_json(prov_json)

In [58]:
# display the per-package counts accumulated by the loop above
package_usages

Counter({u'broom': 6,
         u'car': 3,
         u'dplyr': 6,
         u'foreign': 6,
         u'knitr': 6,
         u'magrittr': 6,
         u'stringr': 6})

In [66]:
def build_docker_package_install(package, version):
    """Build the Dockerfile instruction that installs one pinned R package

    The instruction runs R non-interactively, loads devtools and calls
    install_version against the CRAN mirror at http://cran.rstudio.com.

    Parameters
    ----------
    package : string
              Name of the R package to be installed
    version : string
              Version number of the desired package

    Returns
    -------
    command : string
              a single 'RUN R -e "..."' line terminated by a newline
    """
    # fixed text surrounding the two substituted values, in output order
    pieces = [
        'RUN R -e "require(\'devtools\');install_version(\'',
        package,
        "', version='",
        version,
        "', repos='http://cran.rstudio.com')\"\n",
    ]
    return ''.join(pieces)

In [67]:
# example: the Dockerfile line for one pinned package
# (parenthesised call form prints the same text on Python 2 and also runs on Python 3)
print(build_docker_package_install("vegan", "2.4-4"))

RUN R -e "require('devtools');install_version('vegan', version='2.4-4', repos='http://cran.rstudio.com')"

