In [None]:
%%javascript
console.log("Teste");
require(['notebook/js/codecell'], function(codecell) {
  codecell.CodeCell.options_default.highlight_modes['text/x-rustsrc'] = {'reg':[/^%%prov/]} ;
  Jupyter.notebook.events.one('kernel_ready.Kernel', function(){
  Jupyter.notebook.get_cells().map(function(cell){
      if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;
  });
});


In [None]:
ip = get_ipython()

In [None]:
ip.run_cell??

In [None]:
from xml.dom import minidom
from IPython.display import SVG

from IPython.core.magic import Magics, magics_class, cell_magic
from IPython.core.magic_arguments import  magic_arguments, argument, parse_argstring
from IPython.display import Image
from IPython.utils.text import DollarFormatter

@magics_class
class ProvMagic(Magics):
    """Cell magics that render PROV-N diagrams and post-process their SVG output.

    Both magics finish by asking Inkscape (through the IPython shell) to
    convert the produced ``.svg`` file into a 300-dpi ``.png``, which is
    returned as an ``Image`` so the notebook displays it.
    """

    def _export_png(self, path):
        """Convert ``<path>.svg`` to ``<path>.png`` with Inkscape and return the PNG as an Image."""
        self.shell.system(f"inkscape -f {path}.svg -e {path}.png -d 300")
        return Image(f"{path}.png")

    @magic_arguments()
    @argument('-s', '--source', type=str, required=True, help="Source svg file")
    @argument('-t', '--target', type=str, required=True, help="Target files")
    @cell_magic
    def edit_svg(self, line, cell):
        """Edit an SVG file with the Python code in the cell body.

        The source SVG is parsed into a minidom document and exposed to the
        cell as the user-namespace variable ``svg``. After the cell body has
        run, the (possibly modified) document is written to the target SVG
        and exported to PNG. Both paths are given without the ".svg"
        extension. Returns the resulting PNG as an Image.
        """
        # Expand $name references in the argument line from the user namespace.
        formatter = DollarFormatter()
        line = formatter.vformat(
            line, args=[], kwargs=self.shell.user_ns.copy()
        )
        args = parse_argstring(self.edit_svg, line)

        # Parsing from the path reads the file as bytes, so the XML parser
        # decodes it itself instead of relying on the locale's default
        # encoding; the file is closed again before the cell body runs.
        svg = minidom.parse(args.source + ".svg")
        self.shell.user_ns["svg"] = svg
        self.shell.run_cell(cell)

        # toxml() emits no encoding declaration, which XML readers interpret
        # as UTF-8, so the file must be written as UTF-8 on every platform.
        with open(args.target + ".svg", "w", encoding="utf-8") as target:
            target.write(svg.toxml())
        return self._export_png(args.target)

    @cell_magic
    def prov(self, line, cell):
        """Render the cell through the ``provn`` cell magic and return the PNG.

        The output path is the text after the first space of the magic line,
        or the whole line when it contains no space. The ``provn`` magic is
        not defined in this class (presumably it comes from the
        extensible_provn extension -- confirm); it is invoked with the
        pdf, svg, provn and dot formats and the "paper4" style.
        """
        path = line.split(" ", 1)[-1]
        self.shell.run_cell_magic("provn", f"-o {path} -e pdf svg provn dot -s paper4", cell)
        return self._export_png(path)
get_ipython().register_magics(ProvMagic)


def remove(svg, text, index=0):
    """Remove the element that owns the ``index``-th ``<title>`` equal to ``text``.

    Every ``<title>`` element in ``svg`` whose first child carries character
    data equal to ``text`` is a match. The parent of the ``index``-th match
    is detached from its own parent; the call is a no-op when there are not
    enough matches.

    Args:
        svg: A minidom document (or element) to search and modify in place.
        text: Title text to look for.
        index: Which match to remove when several titles share the text.

    Returns:
        None.
    """
    matches = []
    for title in svg.getElementsByTagName("title"):
        first = title.firstChild
        # An empty <title/> has no child, and an element child has no
        # ``data`` attribute; neither can match, and neither may raise.
        if first is not None and hasattr(first, "data") and first.data == text:
            matches.append(title)

    if len(matches) <= index:
        return

    group = matches[index].parentNode
    container = group.parentNode
    if container is None:
        # The title sits at the document root: there is no enclosing element
        # that could be detached.
        return
    container.removeChild(group)
    
def remove_node_with_attrs(svg, node):
    """Remove a graph node together with its attribute box and their connecting edge.

    Calls ``remove`` three times, in order, for the titles ``node``,
    ``<node>-attrs`` and ``<node>-attrs-><node>``.
    """
    attrs = f"{node}-attrs"
    for title in (node, attrs, f"{attrs}->{node}"):
        remove(svg, title)

In [None]:
# Load (or reload) the magics extension. NOTE(review): presumably this is what
# provides the `provn` cell magic that ProvMagic.prov delegates to -- confirm.
%reload_ext extensible_provn.prov_magics
# The imported name is not referenced in the visible cells; presumably it is
# imported for its side effects -- TODO confirm.
import extensible_provn.view.versioned_prov
# Path prefix for the generated figures; cells reference it as $BASE/<name>.
BASE = "../generated/presentation"

# Versioned-PROV: A PROV extension to support mutable data entities

Authors:
- João Felipe Pimentel
- Paolo Missier
- Leonardo Murta
- Vanessa Braganholo

# Agenda

- Motivation
  - PROV limitations for fine-grained provenance from scripts
- Approach
  - Versioned-PROV
- Related Work
  - PROV-Dictionary
- Evaluation
  - Running example and general analysis
- Conclusion
  - Limitations and future work

# PROV

- Extensible model
- Domain-agnostic
  - Operating Systems
  - Workflow Systems
  - Scripts
  - ...
 
![PROV key concepts overview. The figure shows an entity, an agent, and an activity](https://www.w3.org/TR/2013/NOTE-prov-primer-20130430/images/key-concepts.png)
Image from PROV Model Primer
https://www.w3.org/TR/2013/NOTE-prov-primer-20130430/
![image.png](attachment:image.png)

# PROV for Scripts

- Existing approaches describe coarse-grained provenance

```python
i = 534
j = 237
k = 789
min(i, j, k)
```

In [None]:
%%prov $BASE/call
entity(i, [value="534", type="script:name", label="i"])
entity(j, [value="237", type="script:name", label="j"])
entity(k, [value="789", type="script:name", label="k"])

entity(result, [value="237", type="script:evaluation"])
activity(min, [type="script:call"])
wasDerivedFrom(result, j, min, g1, u1)
wasGeneratedBy(g1; result, min)
used(u1; min, j, [dot:dist="0"])
used(min, i)
used(min, k, [dot:dist="2"])

# Fine-Grained Provenance

- Assignment

```python
m = 10000
```

In [None]:
%%prov $BASE/assign
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>
    
entity(10000, [value="10000", type="script:literal"])
entity(m, [value="10000", type="script:name", label="m"])

activity(assign1, [type="script:assign"])
wasDerivedFrom(m, 10000, assign1, -, u1, [dot:dist="1.5"])
wasGeneratedBy(g1; m, assign1, [dot:dist="0"])
used(u1; assign1, 10000, [dot:dist="1.5"])

# Challenges

- PROV entities are immutable
  - But some variables can be mutable
- Two main problems
  - P1: assigning a collection to a variable requires duplicating its membership in PROV
  - P2: changing a collection also requires duplicating its representation in PROV


# P1

When an assignment to a collection entity occurs, the new entity should have all the members of the original entity.

- N `hadMember` statements

```python
a = [2, 6, 0]

b = a
```

In [None]:
%%prov $BASE/p1_1
newrank=true;
splines = "spline"
rankdir=TB
subgraph cluster_1 {
    labeljust="r"
    labelloc="t"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Members";
    "g/2"; "g/6";  "g/0";
    "g/2-attrs"; "g/6-attrs";  "g/0-attrs"; 

}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(2, [value="2", type="script:literal"])
entity(6, [value="6", type="script:literal"])
entity(0, [value="0", type="script:literal"])

entity(list, [value="[2, 6, 0]", type="script:list"])

hadMember(list, 2, [dot:specific="true"])
hadMember(list, 6, [dot:specific="true"])
hadMember(list, 0, [dot:specific="true"])
endDocument
##F##
{rank=same "g/2" "g/6" "g/0"}
{rank=same "g/list" "g/list-attrs"}

"g/2" -> "g/6" -> "g/0" [style=invis]
"g/2" -> "g/list"  [style=invis]


In [None]:
%%prov $BASE/p1_3
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    label = "Assign";
    fontcolor="white"
    "g/list";
    "g/a#1"; "g/assign2"
    "g/list-attrs";
    "g/a#1-attrs"; "g/assign2-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    label = "Members";
    fontcolor="white"
    "g/2"; "g/6";  "g/0"; 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])

entity(list, [value="[2, 6, 0]", type="script:list"])
entity(a#1, [value="[2, 6, 0]", type="script:name", label="a"])

hadMember(list, 2, [dot:hide2="true"])
hadMember(list, 6, [dot:hide2="true"])
hadMember(list, 0, [dot:hide2="true"])

hadMember(a#1, 2, [dot:specific="true"])
hadMember(a#1, 6, [dot:specific="true"])
hadMember(a#1, 0, [dot:specific="true"])

activity(assign2, [type="script:assign"])
wasDerivedFrom(a#1, list, assign2, u2, g2, [dot:dist="2"])
used(u2; assign2, list, -, [dot:dist="0"])
wasGeneratedBy(g2; a#1, assign2, -, [dot:dist="0"])
endDocument
##F##
{rank=same "g/list" "g/list-attrs"}
{rank=same "g/2" "g/6" "g/0"}
"g/0" -> "g/6" -> "g/2" [style=invis]


In [None]:
%%edit_svg -s $BASE/p1_3 -t $BASE/p1_2b
remove(svg, "g/a#1->g/2")
remove(svg, "g/a#1->g/6")
remove(svg, "g/a#1->g/0")

In [None]:
%%edit_svg -s $BASE/p1_2b -t $BASE/p1_2a
remove_node_with_attrs(svg, "g/a#1")
remove_node_with_attrs(svg, "g/assign2")
remove(svg, "cluster_0")
remove(svg, "g/a#1->g/assign2")
remove(svg, "g/a#1->g/list")
remove(svg, "g/assign2->g/list")

In [None]:
%%prov $BASE/p1_5
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    label = "Assign";
    fontcolor="white"

    "g/a#1";
    "g/b#1"; "g/assign3"
    "g/a#1-attrs";
    "g/b#1-attrs"; "g/assign3-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"

    label = "Members";
    "g/2"; "g/6";  "g/0"; 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])

entity(a#1, [value="[2, 6, 0]", type="script:name", label="a"])
entity(b#1, [value="[2, 6, 0]", type="script:name", label="b"])

hadMember(a#1, 2, [dot:hide2="true"])
hadMember(a#1, 6, [dot:hide2="true"])
hadMember(a#1, 0, [dot:hide2="true"])

hadMember(b#1, 2, [dot:specific="true"])
hadMember(b#1, 6, [dot:specific="true"])
hadMember(b#1, 0, [dot:specific="true"])

activity(assign3, [type="script:assign"])
wasDerivedFrom(b#1, a#1, assign3, u3, g3, [dot:dist="2"])
used(u3; assign3, a#1, -, [dot:dist="0"])
wasGeneratedBy(g3; b#1, assign3, -, [dot:dist="0"])
endDocument
##F##
{rank=same "g/a#1" "g/a#1-attrs"}
{rank=same "g/2" "g/6" "g/0"}
"g/0" -> "g/6" -> "g/2" [style=invis]


In [None]:
%%edit_svg -s $BASE/p1_5 -t $BASE/p1_4b
remove(svg, "g/b#1->g/2")
remove(svg, "g/b#1->g/6")
remove(svg, "g/b#1->g/0")

In [None]:
%%edit_svg -s $BASE/p1_4b -t $BASE/p1_4a
remove_node_with_attrs(svg, "g/b#1")
remove_node_with_attrs(svg, "g/assign3")
remove(svg, "cluster_0")
remove(svg, "g/b#1->g/assign3")
remove(svg, "g/b#1->g/a#1")
remove(svg, "g/assign3->g/a#1")

# P2-simplified

When an entity that represents a collection is changed, a new entity should be created, together with multiple new relationships.

- 1 `entity` + 2 `wasDerivedFrom` + N `hadMember` = 3 + N statements

```python
a = [2, 6, 0]

a[1] = 3
```

In [None]:
%%prov $BASE/p2simple
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Part Assign";
    "g/a@1"; "g/1"; "g/3"
    "g/a#1"; "g/assign4"
    "g/a@1-attrs"; "g/1-attrs"; "g/3-attrs"
    "g/a#1-attrs"; "g/assign4-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Members";
    "g/0"; "g/6"; "g/2";
}

subgraph cluster_2 {
    labeljust="r"
    labelloc="t"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Overhead";
    "g/a#2"; "g/a#2-attrs" 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])

entity(3, [value="3", type="script:literal"])
entity(1, [value="1", type="script:literal"])
entity(a#2, [value="[2, 3, 0]", type="script:name", label="a", dot:specific="true"])
entity(a#1, [value="[2, 6, 0]", type="script:name", label="a"])

entity(a@1, [value="3", type="script:access", label="a[1]"])
       
hadMember(a#1, 2, [dot:hide2="true"])
hadMember(a#1, 6, [dot:hide2="true"])
hadMember(a#1, 0, [dot:hide2="true"])

hadMember(a#2, 2, [dot:specific="true"])
hadMember(a#2, a@1, [dot:specific="true"])
hadMember(a#2, 0, [dot:specific="true"])

activity(assign4, [type="script:assign"])
wasDerivedFrom(a#2, a#1, assign4, g4, u4, [dot:specific="true"])
used(u4; assign4, a#1, -)
wasDerivedFrom(a#2, 3, assign4, g4, u5, [dot:specific="true"])
wasDerivedFrom(a@1, 3, assign4, g5, u5)
used(assign4, 1, -)
endDocument
##F##
{rank=same "g/2" "g/6" "g/0"}
{rank=same "g/a@1" "g/a#2"}
"g/0" -> "g/6" -> "g/2" [style=invis]
"g/1" -> "g/2" [style=invis]
"g/a#2" -> "g/6" [style=invis]


In [None]:
%%edit_svg -s $BASE/p2simple -t $BASE/p2simple_2
remove_node_with_attrs(svg, "g/a#2")
remove(svg, "cluster_2")
remove(svg, "g/a#2->g/a@1")
remove(svg, "g/a#2->g/assign4")
remove(svg, "g/a#2->g/3")
remove(svg, "g/a#2->g/a#1")
remove(svg, "g/a#2->g/2")
remove(svg, "g/a#2->g/0")

In [None]:
%%edit_svg -s $BASE/p2simple_2 -t $BASE/p2simple_1
remove_node_with_attrs(svg, "g/a@1")
remove(svg, "g/a@1->g/assign4")
remove(svg, "g/a@1->g/3")
remove_node_with_attrs(svg, "g/assign4")
remove(svg, "g/assign4->g/1")
remove(svg, "g/assign4->g/3")
remove(svg, "g/assign4->g/a#1")
remove_node_with_attrs(svg, "g/1")
remove_node_with_attrs(svg, "g/3")
remove(svg, "cluster_0")


# P2

When more than one variable is assigned to the same collection, and one of the variables changes, all other variables should also change, as they refer to the same memory area.

- 3 + N statements **for each** reference

```python
a = [2, 6, 0]
b = a

a[1] = 3
```

In [None]:
%%prov $BASE/p2_2
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    fontcolor = "white"
    label = "Assignment";
    "g/a@1"; "g/1"; "g/3"
    "g/a#1"; "g/assign5"
    
    "g/a@1-attrs"; "g/1-attrs"; "g/3-attrs"
    "g/a#1-attrs"; "g/assign5-attrs"
    
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    fontcolor = "white"
    label = "Members";
    "g/2"; "g/6"; "g/0"; "g/b#1";
}

subgraph cluster_2 {
    labeljust="r"
    labelloc="t"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    fontcolor = "white"
    label = "Overhead";
    "g/a#2";
    "g/b#2"; "g/b#2-attrs" 
}

##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(a#1, [value="[2, 6, 0]", type="script:name", label="a"])
entity(b#1, [value="[2, 6, 0]", type="script:name", label="b", dot:hide2="true"])
entity(3, [value="3", type="script:literal"])
entity(1, [value="1", type="script:literal"])
entity(a#2, [value="[2, 3, 0]", type="script:name", label="a", dot:hide2="true"])
entity(b#2, [value="[2, 3, 0]", type="script:name", label="b", dot:specific="true"])

       
       
entity(a@1, [value="3", type="script:access", label="a[1]"])
       
hadMember(a#1, 2, [dot:hide2="true"])
hadMember(a#1, 6, [dot:hide2="true"])
hadMember(a#1, 0, [dot:hide2="true"])
          
hadMember(b#1, 2, [dot:hide2="true"])
hadMember(b#1, 6, [dot:hide2="true"])
hadMember(b#1, 0, [dot:hide2="true"])

hadMember(a#2, 2, [dot:hide2="true"])
hadMember(a#2, a@1, [dot:hide2="true"])
hadMember(a#2, 0, [dot:hide2="true"])

hadMember(b#2, 2, [dot:specific="true"])
hadMember(b#2, a@1, [dot:specific="true"])
hadMember(b#2, 0, [dot:specific="true"])

          
activity(assign5, [type="script:assign"])
wasDerivedFrom(a#2, a#1, assign5, g6, u6, [dot:hide2="true"])
wasDerivedFrom(a#2, 3, assign5, g6, u6, [dot:hide2="true", dot:dist="0.5", dot:angle="270.0"])
used(u6; assign5, a#1, -, [dot:dist="2", dot:angle="-30.0"])
wasGeneratedBy(g6; a#2, assign5, -, [dot:hide2="true", dot:dist="1", dot:angle="270.0"])
wasDerivedFrom(b#2, b#1, assign5, g8, u8, [dot:specific="true"])
wasDerivedFrom(b#2, 3, assign5, g8, u7, [dot:specific="true"])
wasDerivedFrom(a@1, 3, assign5, g7, u7)

used(assign5, 1, -)
endDocument
##F##
{rank=same "g/2" "g/6" "g/0"}
{rank=same "g/a@1" "g/a#2" "g/b#2"}
//{rank=same "g/a#1" "g/a#1-attrs"}
{rank=same "g/1" "g/3"}
"g/0" -> "g/6" -> "g/2" [style=invis]
"g/1" -> "g/2" [style=invis]
//"g/2" -> "g/3" [style=invis]
"g/a#2" -> "g/6" [style=invis]
//"g/b#2" -> "g/6" [style=invis]


In [None]:
%%edit_svg -s $BASE/p2_2 -t $BASE/p2_1b
remove_node_with_attrs(svg, "g/b#2")
remove(svg, "g/assign5->g/b#1")
remove(svg, "g/b#2->g/a@1")
remove(svg, "g/b#2->g/assign5")
remove(svg, "g/b#2->g/3")
remove(svg, "g/b#2->g/b#1")
remove(svg, "g/b#2->g/2")
remove(svg, "g/b#2->g/0")

In [None]:
%%edit_svg -s $BASE/p2_1b -t $BASE/p2_1a
remove_node_with_attrs(svg, "g/a#2")
remove(svg, "cluster_2")
remove(svg, "g/a#2->g/a@1")
remove(svg, "g/a#2->g/assign5")
remove(svg, "g/a#2->g/3")
remove(svg, "g/a#2->g/a#1")
remove(svg, "g/a#2->g/2")
remove(svg, "g/a#2->g/0")

remove_node_with_attrs(svg, "g/a@1")
remove(svg, "g/a@1->g/assign5")
remove(svg, "g/a@1->g/3")
remove_node_with_attrs(svg, "g/assign5")
remove(svg, "g/assign5->g/1")
remove(svg, "g/assign5->g/3")
remove(svg, "g/assign5->g/a#1")
remove_node_with_attrs(svg, "g/1")
remove_node_with_attrs(svg, "g/3")
remove(svg, "cluster_0")


# Challenges

- PROV entities are immutable
- Two main problems
  - P1: $O(N)$ new elements in collection assignments
  - P2: $\Omega(R\times N)$ new elements in collection changes
- N: number of elements
- R: number of variables referring to the same collection

# Goal

Reduce the storage overhead in collection assignments and changes to $O(1)$

# Versioned-PROV

- PROV extension
  - Checkpoints, Reference Sharing, Accesses
- An entity may represent multiple versions of a data object
  - Checkpoints identify these versions

In [None]:
%%prov $BASE/versioned
ranksep=0.1; 

subgraph cluster_1 {
    color=white
    "g/modify" [shape=box color=white fixedsize=true width=1.5]
    "g/remove" [shape=box color=white fixedsize=true width=1.5]
    "g/insert" [shape=box color=white fixedsize=true width=1.5]

    edge [minlen=6]
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(a)
activity(insert)
activity(remove)
activity(modify)
used(insert, a, [dot:dist="2", dot:angle="-30"])
used(remove, a)
used(modify, a, [dot:dist="2", dot:angle="30"])
endDocument
##F##
}

subgraph cluster_0 {
    color=white
    node [shape=box color=white fixedsize=true width=1.5]
    chk1 [label="checkpoint: 1" ]
    chk10 [label="checkpoint: 10"]
    chk4 [label="checkpoint: 4" ]
    chk1 -> "g/insert" [style=invis]
    chk10 -> "g/remove" [style=invis]
    chk4 -> "g/modify" [style=invis]
}

In [None]:
%%prov $BASE/timeline
ranksep=0.05
color=white
"g/modify" [shape=box color=white fixedsize=true width=1.5]
"g/remove" [shape=box color=white fixedsize=true width=1.5]
"g/insert" [shape=box color=white fixedsize=true width=1.5]
edge [minlen=2]
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(a)
activity(insert)
activity(remove)
activity(modify)
endDocument
##F##

    color=white
    node [shape=box color=white fixedsize=true width=1.5]
    chk1 [label="checkpoint: 1" ]
    chk10 [label="checkpoint: 10"]
    chk4 [label="checkpoint: 4" ]
    "g/insert" -> chk1  [style=invis minlen=1]
    "g/remove" -> chk10 [style=invis minlen=1]
    "g/modify" -> chk4  [style=invis minlen=1]
    
    "g/a" -> "g/insert" -> "g/modify" -> "g/remove" [arrowhead=none]
    end [label="" shape=box color=white fixedsize=true width=1.5]
    "g/remove" -> end

    
{rank=same "g/a" "g/insert"  "g/modify"  "g/remove" end}

# PROV Constraint

- Unique-generation constraint:
  - A PROV entity can only be generated once
- Versioned-PROV follows PROV semantics
  - Mutability occurs only in collection entities
  - A Versioned-PROV collection may have different **members** at different moments

# Membership

Source
```python
a = [2,6,0]
a.append(4) # insert 
a[1] = 3    # modify
a.pop()     # remove
```

Modify operation
```Rust
hadMember(a, 3, [type="version:Put", version:key="1", version:checkpoint="4"])
```

In [None]:
%%prov $BASE/incremental
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(6, [value="6", type="script:literal", dot:hide3="true"])
entity(2, [value="2", type="script:literal", dot:hide3="true"])
entity(0, [value="0", type="script:literal", dot:hide3="true"])
entity(4, [value="4", type="script:literal", dot:hide3="true"])
entity(3, [value="3", type="script:literal", dot:hide3="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide3="true"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0"])
hadMember(list, 4, [type="version:Put", version:key="3", version:checkpoint="1"])
hadMember(list, 3, [type="version:Put", version:key="1", version:checkpoint="4"])
hadMember(list, 4, [type="version:Del", version:key="3", version:checkpoint="10"])

endDocument
##F##
"g/2"->"g/6"->"g/0"->"g/4"->"g/3" [style=invis]
{rank=same "g/2" "g/6" "g/0" "g/3" "g/4" }


In [None]:
%%edit_svg -s $BASE/incremental -t $BASE/incremental_1
remove_node_with_attrs(svg, "g/4")
remove_node_with_attrs(svg, "g/3")
remove(svg, "g/list->g/4")
remove(svg, "g/list->g/4")
remove(svg, "g/list->g/3")


# Derivation By Reference

In [None]:
%%prov $BASE/by_reference
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor = "white"
    label = "Assign";
    "g/list";
    "g/a"; "g/assign6"
    "g/list-attrs";
    "g/a-attrs"; "g/assign6-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor = "white"
    label = "Member";
    "g/2"; "g/6";  "g/0"; 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(list, [value="[2, 6, 0]", type="script:list"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0", dot:hide2="true"])
entity(a, [value="[2, 6, 0]", type="script:name", label="a"])

activity(assign6, [type="script:assign"])
wasDerivedFrom(a, list, assign6, g9, u9, [type="version:Reference", version:checkpoint="0"])
used(u1; assign6, list, [version:checkpoint="0"])
endDocument
##F##
{rank=same "g/a" "g/a-attrs"}

In [None]:
%%edit_svg -s $BASE/by_reference -t $BASE/by_refence_a
remove_node_with_attrs(svg, "g/a")
remove_node_with_attrs(svg, "g/assign6")
remove(svg, "g/a->g/assign6")
remove(svg, "g/a->g/list")
remove(svg, "g/assign6->g/list")
remove(svg, "cluster_0")


In [None]:
%%prov $BASE/incremental_2x
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(4, [value="4", type="script:literal", dot:hide3="true"])
entity(3, [value="3", type="script:literal", dot:hide2="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide3="true"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 4, [type="version:Put", version:key="3", version:checkpoint="1"])
hadMember(list, 3, [type="version:Put", version:key="1", version:checkpoint="4", dot:hide2="true"])
hadMember(list, 4, [type="version:Del", version:key="3", version:checkpoint="10", dot:hide2="true"])

endDocument
##F##
"g/2"->"g/6"->"g/0"->"g/4"->"g/3" [style=invis]
{rank=same "g/2" "g/6" "g/0" "g/3" "g/4" }


In [None]:
%%edit_svg -s $BASE/incremental_2x -t $BASE/incremental_2
remove_node_with_attrs(svg, "g/3")
remove(svg, "g/list->g/4", 1)
remove(svg, "g/list->g/3")


In [None]:
%%prov $BASE/incremental_3x
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(4, [value="4", type="script:literal", dot:hide2="true"])
entity(3, [value="3", type="script:literal", dot:hide3="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide3="true"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 4, [type="version:Put", version:key="3", version:checkpoint="1", dot:hide2="true"])
hadMember(list, 3, [type="version:Put", version:key="1", version:checkpoint="4"])
hadMember(list, 4, [type="version:Del", version:key="3", version:checkpoint="10", dot:hide2="true"])

endDocument
##F##
"g/2"->"g/6"->"g/0"->"g/4"->"g/3" [style=invis]
{rank=same "g/2" "g/6" "g/0" "g/3" "g/4" }


In [None]:
%%edit_svg -s $BASE/incremental_3x -t $BASE/incremental_3
remove(svg, "g/list->g/4", 1)


In [None]:
%%prov $BASE/incremental_4
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(4, [value="4", type="script:literal", dot:hide3="true"])
entity(3, [value="3", type="script:literal", dot:hide2="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide3="true"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 4, [type="version:Put", version:key="3", version:checkpoint="1", dot:hide2="true"])
hadMember(list, 3, [type="version:Put", version:key="1", version:checkpoint="4", dot:hide2="true"])
hadMember(list, 4, [type="version:Del", version:key="3", version:checkpoint="10"])

endDocument
##F##
"g/2"->"g/6"->"g/0"->"g/4"->"g/3" [style=invis]
{rank=same "g/2" "g/6" "g/0" "g/3" "g/4" }


In [None]:
%%prov $BASE/incremental_5
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>


entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide3="true"])
entity(0, [value="0", type="script:literal", dot:hide3="true"])
entity(4, [value="4", type="script:literal", dot:hide3="true"])
entity(3, [value="3", type="script:literal", dot:hide3="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide3="true"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0"])
hadMember(list, 4, [type="version:Put", version:key="3", version:checkpoint="1"])
hadMember(list, 3, [type="version:Put", version:key="1", version:checkpoint="4"])
hadMember(list, 4, [type="version:Del", version:key="3", version:checkpoint="10", dot:hide2="true"])

endDocument
##F##
"g/2"->"g/6"->"g/0"->"g/4"->"g/3" [style=invis]
{rank=same "g/2" "g/6" "g/0" "g/3" "g/4" }


# Events

- Events associated to Versioned-PROV entities should indicate the checkpoint
  - used(u1; a1, e1, t1, [version:checkpoint="c1"])
  - wasGeneratedBy(g1; e2, a1, t2, [version:checkpoint="c2"])
  - wasDerivedFrom(e2, e1, a1, g1, u1, [version:checkpoint="c3"])
    - Derivation occurs after use: `c1 < c3`
    - Derivation may occur during generation: `c2 ≤ c3`


# P1 Revisited

(P1) When an assignment to a collection entity occurs, Versioned-PROV does not recreate the
membership.

```python
a = [2,6,0]

b = a
```

PROV-N
```Rust
wasDerivedFrom(
  b, a, assign2, 
  g1, u1, [
    type="version:Reference",
    version:checkpoint="1"
])
```

In [None]:
%%prov $BASE/versioned_p1_1
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor = "white"
    label = "Assign";
    "g/list";
    "g/a"; "g/assign2"
    "g/list-attrs";
    "g/a-attrs"; "g/assign2-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Members";
    "g/2"; "g/6";  "g/0"; 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(list, [value="[2, 6, 0]", type="script:list"])
hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0"])
entity(a, [value="[2, 6, 0]", type="script:name", label="a"])

activity(assign2, [type="script:assign"])
wasDerivedFrom(a, list, assign2, g1, u1, [type="version:Reference", version:checkpoint="1"])
used(u1; assign2, list, [version:checkpoint="0"])
endDocument
##F##
{rank=same "g/list" "g/list-attrs"}


In [None]:
%%edit_svg -s $BASE/versioned_p1_1 -t $BASE/versioned_p1_1a
remove_node_with_attrs(svg, "g/a")
remove_node_with_attrs(svg, "g/assign2")
remove(svg, "g/assign2->g/list")
remove(svg, "g/a->g/list")
remove(svg, "g/a->g/assign2")
remove(svg, "cluster_0")


In [None]:
%%prov $BASE/versioned_p1_2
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    label = "A";
    "g/a";
    "g/b"; "g/assign3"
    "g/a-attrs";
    "g/b-attrs"; "g/assign3-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    label = "B";
    "g/list";   "g/2"; "g/6";  "g/0"; 
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(a, [value="[2, 6, 0]", type="script:name", label="a"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide2="true"])

hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0"])
entity(b, [value="[2, 6, 0]", type="script:name", label="b"])

activity(assign3, [type="script:assign"])
wasDerivedFrom(b, a, assign3, g2, u2, [type="version:Reference", version:checkpoint="2"])
wasDerivedFrom(a, list, -, -, -, [type="version:Reference", version:checkpoint="2"])
used(u2; assign3, a, [version:checkpoint="1"])
endDocument
##F##
{rank=same "g/b" "g/b-attrs"}
{rank=same "g/a" "g/a-attrs"}

In [None]:
%%prov $BASE/versioned_p1_2

##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(a, [value="[2, 6, 0]", type="script:name", label="a"])
entity(b, [value="[2, 6, 0]", type="script:name", label="b"])

activity(assign3, [type="script:assign"])
wasDerivedFrom(b, a, assign3, g2, u2, [type="version:Reference", version:checkpoint="2"])
used(u2; assign3, a, [version:checkpoint="1"])
endDocument
##F##
{rank=same "g/b" "g/b-attrs"}
{rank=same "g/a" "g/a-attrs"}


# P2 Revisited

When an entity that represents a collection changes, Versioned-PROV does not create new collection entities.

```python
0: a = [2,6,0]
1: b = a

4: a[1] = 3
```

PROV-N
```Rust
wasDerivedFrom(a@1, 3, assign3, g5, u5, [type="version:Reference", version:checkpoint="4", version:collection="a", version:key="1", version:access="w"])
```

- b derives by reference
- Access Representation

In [None]:
%%prov $BASE/versioned_p2
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
     fontcolor = "white"
    label = "Assign";
    "g/a@1"; "g/1"; "g/3"
    "g/a"; "g/assign3"
    
    "g/a@1-attrs"; "g/1-attrs"; "g/3-attrs"
    "g/a-attrs"; "g/assign3-attrs"; "bn0"
    
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor = "white"
    label = "Members";
    "g/2"; "g/6"; "g/0"; "g/list"; "g/b"
}

##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(list, [value="[2, 6, 0]", type="script:list", dot:hide2="true"])
entity(b, [value="[2, 6, 0]", type="script:name", label="b", dot:hide2="true"])

hadMember(list, 2, [type="version:Put", version:key="0", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 6, [type="version:Put", version:key="1", version:checkpoint="0", dot:hide2="true"])
hadMember(list, 0, [type="version:Put", version:key="2", version:checkpoint="0", dot:hide2="true"])
entity(a, [value="[2, 6, 0]", type="script:name", label="a"])
entity(a@1, [value="3", type="script:access", label="a[1]"])
hadMember(list, a@1, [type="version:Put", version:key="1", version:checkpoint="4"])
entity(3, [value="3", type="script:literal"])
entity(1, [value="1", type="script:literal"])

          
activity(assign3, [type="script:assign"])
used(u4; assign3, a, -, [dot:dist="2", dot:angle="-30.0"])
wasDerivedFrom(a@1, 3, assign3, g5, u5, [type="version:Reference", version:checkpoint="4", version:collection="a", version:key="1", version:access="w"])
wasDerivedFrom(a, list, -, -, -, [type="version:Reference", version:checkpoint="1", dot:hide2="true"])
wasDerivedFrom(b, a, -, -, -, [type="version:Reference", version:checkpoint="2", dot:hide2="true"])

used(assign3, 1, -)

endDocument
##F##
{rank=same "g/2" "g/6" "g/0" "g/b"}
{rank=same "g/1" "g/3" "g/a"}
{rank=same "g/list" "g/a"}

{rank=same "g/1-attrs" "g/assign3-attrs" "g/a@1-attrs"}
"g/0" -> "g/6" -> "g/2" [style=invis]
"g/3" -> "g/1" -> "g/a" [style=invis]

In [None]:
%%edit_svg -s $BASE/versioned_p2 -t $BASE/versioned_p2_1a
# `svg` is the minidom document that %%edit_svg parsed from the source SVG and
# injected into the namespace; once this cell body has run, the magic writes
# the modified document to the target SVG and renders it to PNG.
#
# Strip the nodes that belong to the `a[1] = 3` assignment.
# NOTE(review): judging by its name, remove_node_with_attrs also drops the
# node's "-attrs" box -- confirm against the helper's definition.
remove_node_with_attrs(svg, "g/3")
remove_node_with_attrs(svg, "g/1")
remove_node_with_attrs(svg, "g/a@1")
remove_node_with_attrs(svg, "g/assign3")
# Presumably the edge elements that touched the removed nodes (ids follow a
# "source->target" pattern) -- verify against the generated SVG.
remove(svg, "g/assign3->g/a")
remove(svg, "g/assign3->g/3")
remove(svg, "g/assign3->g/1")
remove(svg, "g/list->g/a@1")
remove(svg, "g/a@1->g/assign3")
# "bn0" and the edges that reference it.
remove(svg, "g/a@1->bn0")
remove(svg, "bn0->g/a")
remove(svg, "bn0->g/3")
remove(svg, "bn0")
# `a` itself stays in the figure; only its attribute box and the edge from
# that box are removed here.
remove(svg, "g/a-attrs")
remove(svg, "g/a-attrs->g/a")
# Finally remove the element identified as cluster_0.
remove(svg, "cluster_0")


In [None]:
%%prov $BASE/versioned_p2_2
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Assign";
    "g/a@1"; "g/1"; "g/3"
    "g/a"; "g/assign3"
    
    "g/a@1-attrs"; "g/1-attrs"; "g/3-attrs"
    "g/a-attrs"; "g/assign3-attrs"; "bn0"
    
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Members";
    "g/list";
}

##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(list, [value="[2, 6, 0]", type="script:list", dot:hide2="true"])

entity(a, [value="[2, 6, 0]", type="script:name", label="a"])
entity(a@1, [value="3", type="script:access", label="a[1]"])
hadMember(list, a@1, [type="version:Put", version:key="1", version:checkpoint="4"])
entity(3, [value="3", type="script:literal"])
entity(1, [value="1", type="script:literal"])

          
activity(assign3, [type="script:assign"])
used(u4; assign3, a, -, [dot:dist="2", dot:angle="-30.0"])
wasDerivedFrom(a@1, 3, assign3, g5, u5, [type="version:Reference", version:checkpoint="4", version:collection="a", version:key="1", version:access="w"])
wasDerivedFrom(a, list, -, -, -, [type="version:Reference", version:checkpoint="1", dot:hide2="true"])

used(assign3, 1, -)

endDocument
##F##
{rank=same "g/1" "g/3" "g/a"}
{rank=same "g/list" "g/a"}

{rank=same "g/1-attrs" "g/assign3-attrs" "g/a@1-attrs"}
"g/3" -> "g/1" -> "g/a" [style=invis]

# Related Work

- PROV-Dictionary extension
  - Adds Insertion and Removal derivations to PROV
  - Reduces storage requirements
  - Still assumes entity immutability
    - High overhead in comparison to Versioned-PROV
- Most of the other PROV extensions are domain specific
  - Do not improve the PROV support for data structures

- StarFlow and Tariq et al. (2012) export provenance from scripts to OPM
  - Coarse-grained provenance with no data structures
- RDataTracker, noWorkflow, and CXXR collect fine-grained provenance
  - Use non-interoperable formats
  - Statement-level provenance with no support for data structure changes
- Michaelides et al. (2016) collect provenance from Blockly variables and export it to plain PROV
  - $O(N)$ collection assignments, $\Omega(N \times R)$ collection changes

# Evaluation

https://dew-uff.github.io/versioned-prov/comparison.html

In [None]:
import pandas as pd
import json
%matplotlib inline

def _read_json(path):
    """Return the parsed contents of the JSON file at ``path``."""
    with open(path, "r") as handle:
        return json.load(handle)


# One floydwarshall.json per approach, read in the same order as before.
prov = _read_json("../generated/plain_prov/floydwarshall.json")
prov_dictionary = _read_json("../generated/prov_dictionary/floydwarshall.json")
versioned_prov = _read_json("../generated/versioned_prov/floydwarshall.json")

In [None]:
# Column layout of the comparison tables: the approach label first, then one
# column per statement type.
order = [
    'approach',
    'entity',
    'activity',
    'used',
    'wasDerivedFrom',
    'wasGeneratedBy',
    'hadMember',
    'derivedByInsertionFrom',
]
# Statement types that are summed into the "nodes" total.
nodes = ['entity', 'activity', 'value']
# Every remaining column (neither the label nor a node type) is summed into
# the "edges" total.
relationships = [
    column for column in order
    if not (column == "approach" or column in nodes)
]

In [None]:
# One row per approach, built from the ["all"]["global"] statistics of each
# loaded JSON document; columns follow `order`.
df = pd.DataFrame(
    [
        dict(approach=label, **dict(stats["all"]["global"]))
        for label, stats in (
            ('PROV', prov),
            ('PROV-Dictionary', prov_dictionary),
            ('Versioned-PROV', versioned_prov),
        )
    ],
    columns=order,
)

# Statement types an approach does not report come back as NaN; count them as 0.
count_columns = order[1:]
df[count_columns] = df[count_columns].fillna(0.0).astype(int)

# Totals per row: node-type columns and relationship-type columns.
df['nodes'] = sum(df[name] for name in nodes if name in df.columns)
df['edges'] = sum(df[name] for name in relationships if name in df.columns)
df

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Stacked bar chart: one bar per approach, one segment per statement type.
# The last two columns of `df` are the derived 'nodes'/'edges' totals, so they
# are left out of the stack.
gdf = df.set_index(["approach"])[df.columns[1:-2]]

# Number of statement-type columns (everything in `order` except 'approach').
parts = len(order) - 1

# One colour family per group of columns.
colors1 = plt.cm.GnBu(np.linspace(0.5, 1, 2))
colors2 = plt.cm.Purples(np.linspace(0.2, 0.7, 3))
# Whatever is left after the two groups above is drawn in reds.  The size is
# derived from the actual group lengths: the previous hard-coded
# `parts - 3 - 3` produced a 6-colour palette for 7 stacked columns.
colors3 = plt.cm.Reds(
    np.linspace(0.2, 0.7, parts - len(colors1) - len(colors2))
)

# combine them and build a new colormap
colors = np.vstack((colors1, colors2, colors3))
assert len(colors) == len(gdf.columns), \
    "palette must provide exactly one colour per stacked column"

f = plt.figure()
ax = f.gca()
gdf.plot(kind='bar', stacked=True, ax=ax, color=colors)
# Legend is placed outside the axes, to the right of the plot.
lgd = plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
ax.set_xlabel("")
ax.set_ylabel("Count")
plt.xticks(rotation=0)
f.set_size_inches(10, 6)
#plt.savefig("../generated/graphs/comparison.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
#plt.savefig("../generated/graphs/comparison.svg", bbox_extra_artists=(lgd,), bbox_inches='tight')
#plt.savefig("../generated/graphs/comparison.pdf", bbox_extra_artists=(lgd,), bbox_inches='tight')


In [None]:
# Same table as `df`, but built from the ["specific"]["global"] statistics.
# It reuses df's columns, so the 'nodes'/'edges' columns exist up front and
# are recomputed below.
sdf = pd.DataFrame(
    [
        dict(approach=label, **dict(stats["specific"]["global"]))
        for label, stats in (
            ('PROV', prov),
            ('PROV-Dictionary', prov_dictionary),
            ('Versioned-PROV', versioned_prov),
        )
    ],
    columns=df.columns,
)

# Missing statement types come back as NaN; count them as 0.
sdf[df.columns[1:]] = sdf[df.columns[1:]].fillna(0.0).astype(int)
sdf['nodes'] = sum(sdf[name] for name in nodes if name in sdf.columns)
sdf['edges'] = sum(sdf[name] for name in relationships if name in sdf.columns)

# Short approach labels used as tick labels in the comparison plots.
short_names = {
    "PROV": "Plain",
    "PROV-Dictionary": "Dict",
    "Versioned-PROV": "Ver",
}
sdf = sdf.set_index(["approach"]).rename(index=short_names)
gdf = df.set_index(["approach"]).rename(index=short_names)

In [None]:
def plot_comparison(gdf, name):
    """Plot node, edge and combined totals side by side and save the figure.

    Parameters
    ----------
    gdf : DataFrame
        Must provide "nodes" and "edges" columns; each row becomes one bar
        per panel.
    name : str
        Output path without extension. The figure is written to
        ``name + ".png"``, ``name + ".svg"`` and ``name + ".pdf"``.
    """
    from collections import Counter
    import matplotlib.patches as mpatches

    def annotate(ax):
        """Label every bar position with its total height and hide the y axis."""
        half_width = ax.containers[0].get_children()[0].get_width() / 2.0
        # Stacked segments share the same x, so summing per x gives the
        # height of the whole bar.
        totals = Counter()
        for bar in ax.patches:
            totals[bar.get_x()] += int(bar.get_height())

        for left, total in totals.items():
            # Centre the label horizontally, 5% above the top of the bar.
            ax.text(left + half_width, total * 1.05, str(total), ha='center')
        ax.yaxis.set_ticklabels([])

        # Only the bottom spine stays visible.
        for side in ('top', 'right', 'left'):
            ax.spines[side].set_visible(False)
        ax.tick_params(
            top=False, bottom=False, left=False, right=False,
            labelleft=False, labelbottom=True,
        )

    colors = plt.cm.PuOr(np.linspace(0.2, 0.8, 2))

    f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=False)

    # First two panels: a single series each.
    single_panels = (
        ("nodes", colors[0], ax1, "(Nodes)", "Count"),
        ("edges", colors[1], ax2, "(Edges)", ""),
    )
    for column, color, axis, xlabel, ylabel in single_panels:
        gdf[column].plot(kind='bar', color=color, ax=axis, rot=0)
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        annotate(axis)

    # Third panel: nodes and edges stacked on top of each other.
    gdf[["nodes", "edges"]].plot(kind='bar', stacked=True, ax=ax3, color=colors)
    ax3.set_xlabel("(Both)")
    ax3.set_ylabel("")
    ax3.legend().set_visible(False)
    annotate(ax3)

    # A single shared legend, anchored above the middle panel.
    legend_handles = [
        mpatches.Patch(color=colors[0], label='Nodes'),
        mpatches.Patch(color=colors[1], label='Edges'),
    ]
    lgd = ax2.legend(
        handles=legend_handles,
        loc='center', ncol=2,
        bbox_to_anchor=(0.5, 1.15))

    plt.xticks(rotation=0)
    f.set_size_inches(4.8, 2.5)
    # The legend is passed as an extra artist to each tight-bbox save.
    for extension in (".png", ".svg", ".pdf"):
        plt.savefig(name + extension, bbox_extra_artists=(lgd,), bbox_inches='tight')

plot_comparison(gdf, "../generated/presentation/comparison")

In [None]:
# Same three-panel figure, this time for `sdf` (the table built from the
# "specific" statistics); saved under a separate file name.
plot_comparison(sdf, "../generated/presentation/specific_comparison")

# Conclusion

# Limitations

- Introduces overhead for querying
  - Reconstruct versions
- Dictionary-like structure to represent lists
  - Indexes mapped to keys
  - Inserting an element at the beginning of the list requires shifting most elements
- Optional PROV-N attributes
  - Disk-space overhead due to name repetition
- Parallelism
  - Hard to keep a total order of checkpoints


# Future Work

- Unfolding algorithm
  - Convert Versioned-PROV into plain PROV
- Efficient querying algorithm for Versioned-PROV
- Incremental membership definition of lists
  - Add an operation for inserting elements and shifting the existing ones without explicit Put operations
- Adopt the proposed model in noWorkflow


# PROV-Dictionary

In [None]:
import extensible_provn.view.prov_dictionary

In [None]:
%%prov $BASE/dictionary_p1
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Assign";
    "g/a#1";
    "g/b#1"; "g/assign2"
    "g/a#1-attrs";
    "g/b#1-attrs"; "g/assign2-attrs"
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Members";
    "g/2"; "g/6";  "g/0"; 
}

subgraph cluster_2 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "24"
    fontcolor="white"
    label = "Overhead";
    "g/empty";
}
##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(empty, [value="[]", type="EmptyDictionary", dot:specific="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])

entity(a#1, [value="[2, 6, 0]", type="Dictionary", label="a"])
entity(b#1, [value="[2, 6, 0]", type="Dictionary", label="b"])

derivedByInsertionFrom(
  a#1, empty, {
    ("0", 2),
    ("1", 6),
    ("2", 0)
}, [dot:specific="true"])

derivedByInsertionFrom(
  b#1, empty, {
    ("0", 2),
    ("1", 6),
    ("2", 0)
}, [dot:specific="true"])


activity(assign2, [type="script:assign"])
wasDerivedFrom(b#1, a#1, assign2, g2, u2, [dot:dist="2"])
used(u2; assign2, a#1, -, [dot:dist="0"])
wasGeneratedBy(g2; b#1, assign2, -, [dot:dist="0"])
endDocument
##F##
{rank=same "g/a#1" "g/a#1-attrs"}
{rank=same "g/2" "g/6" "g/0"}
"g/0" -> "g/6" -> "g/2" [style=invis]


In [None]:
%%prov $BASE/dictionary_p2
newrank=true;
splines = "spline"

subgraph cluster_0 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    fontcolor="white"
    label = "Assign";
    "g/a@1"; "g/1"; "g/3"
    "g/a#1"; "g/assign3"
    
    "g/a@1-attrs"; "g/1-attrs"; "g/3-attrs"
    "g/a#1-attrs"; "g/assign3-attrs"
    
    
}
subgraph cluster_1 {
    labeljust="r"
    labelloc="b"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    
    fontcolor="white"
    label = "Members";
    "g/2"; "g/6"; "g/0"; "g/b#1"; "g/empty"
}

subgraph cluster_2 {
    labeljust="r"
    labelloc="t"
    color = "#333333"
    fontcolor = "#333333"
    fontsize = "36"
    
    fontcolor="white"
    label = "Overhead";
    "g/a#2";
    "g/b#2"; "g/b#2-attrs" 
}

##H##
document
default <g>
prefix script <https://dew-uff.github.io/versioned-prov/ns/script#>
prefix version <https://dew-uff.github.io/versioned-prov/ns#>

entity(empty, [value="[]", type="EmptyDictionary", dot:hide2="true"])
entity(2, [value="2", type="script:literal", dot:hide2="true"])
entity(6, [value="6", type="script:literal", dot:hide2="true"])
entity(0, [value="0", type="script:literal", dot:hide2="true"])
entity(a#1, [value="[2, 6, 0]", type="script:name", label="a"])
entity(b#1, [value="[2, 6, 0]", type="script:name", label="b", dot:hide2="true"])
entity(3, [value="3", type="script:literal"])
entity(1, [value="1", type="script:literal"])
entity(a#2, [value="[2, 3, 0]", type="script:name", label="a", dot:specific="true"])
entity(b#2, [value="[2, 3, 0]", type="script:name", label="b", dot:specific="true"])

       
       
entity(a@1, [value="3", type="script:access", label="a[1]"])
       
derivedByInsertionFrom(
  a#1, empty, {
    ("0", 2),
    ("1", 6),
    ("2", 0)
}, [dot:hide2="true"])

derivedByInsertionFrom(
  b#1, empty, {
    ("0", 2),
    ("1", 6),
    ("2", 0)
}, [dot:hide2="true"])

derivedByInsertionFrom(
  a#2, empty, {
    ("1", a@1)
}, [dot:specific="true"])

derivedByInsertionFrom(
  b#2, empty, {
    ("1", a@1)
}, [dot:specific="true"])

activity(assign3, [type="script:assign"])
wasDerivedFrom(a#2, a#1, assign3, g4, u4, [dot:specific="true"])
wasDerivedFrom(a#2, 3, assign3, g4, u4, [dot:specific="true", dot:dist="0.5", dot:angle="270.0"])
used(u4; assign3, a#1, -, [dot:dist="2", dot:angle="-30.0"])
wasGeneratedBy(g4; a#2, assign3, -, [dot:specific="true", dot:dist="1", dot:angle="270.0"])
wasDerivedFrom(b#2, b#1, assign3, g6, u6, [dot:specific="true"])
wasDerivedFrom(b#2, 3, assign3, g6, u5, [dot:specific="true"])
wasDerivedFrom(a@1, 3, assign3, g5, u5)

used(assign3, 1, -)
endDocument
##F##
{rank=same "g/2" "g/6" "g/0"}
{rank=same "g/a@1" "g/a#2" "g/b#2"}
//{rank=same "g/a#1" "g/a#1-attrs"}
{rank=same "g/1" "g/3"}
"g/0" -> "g/6" -> "g/2" [style=invis]
"g/1" -> "g/2" [style=invis]
//"g/2" -> "g/3" [style=invis]
"g/a#2" -> "g/6" [style=invis]
//"g/b#2" -> "g/6" [style=invis]
