# Visualizing Multiprocess Dataflows

Scientific applications can be the composition of many different services often spread across smaller processes and building blocks.
While this streamlines software engineering and recomposition it can be much harder to analyse I/O behavior in a network of independent applications.
This demonstration walks through the process of reconstructing data flows between different darshan instrumented applications using networkx and holoviews.

In [25]:
import darshan
darshan.enable_experimental()

In [26]:
# find relevant darshan logs
import glob
darshan_logs = glob.glob("darshan-graph/*.darshan")
darshan_logs

['darshan-graph/pq_app_read_id71317_7-31-5657-2037904274838284930_55623.darshan',
 'darshan-graph/pq_app_read_id71344_7-31-5658-2037904274838284930_55623.darshan',
 'darshan-graph/pq_app_write_id71303_7-31-5657-2037904274838284930_55623.darshan',
 'darshan-graph/pq_app_write_id71310_7-31-5657-2037904274838284930_55623.darshan',
 'darshan-graph/pq_app_write_id71296_7-31-5657-2037904274838284930_55623.darshan',
 'darshan-graph/pq_app_readAB_writeC_id71326_7-31-5658-2037904274838284930_55623.darshan']

In [27]:
# Optionally, get an overview of what is in the logs
def print_infos(darshan_logs):
    for log in darshan_logs:
        report = darshan.DarshanReport(log, read_all=True)  # Default behavior
        report.info()
        print(report.name_records)
        print()
        
#print_infos(darshan_logs) # uncomment, to show

### Aggregating and Populating a Graph Representation

A suitable data structure to represent and analyse these relations are graphs. For this example we will use networkx, a powerfull python graph library to manage the data flow graph.

In [28]:
import networkx as nx

# Empty graph to hold relationships
G = nx.DiGraph()

# Example Usage:
# G.add_node(nodeid, attr1="", attr2=42)
# G.add_edge(src_nodeid, tgt_nodeid, attr5=82, attr32="")

In [29]:
# Determine largest dataflow, later used to normalize edge thickness
rwmax = 0

# A color map used to color by ntype
node_color_map = {
    "app": "orange",
    "file": "silver"
}



# Loop through log files and populate graph with files and dataflows
# Add a node for each log, and nodes for every file
for i, log in enumerate(darshan_logs):  
    
    
    print(log)
    name = log
    ntype = "app"

    aidx = name
    G.add_node(aidx, ntype=ntype, fill_color=node_color_map[ntype])
        
    r = darshan.DarshanReport(log, read_all=True)
    #r.info()
    
    
    # Aggregate records but preserve distinct name_records and modules
    flows = r.reduce(mods="unique", name_records="unique").records
    #print(flows)
    
    
    # for each modules and filename add dataflows to graph as edges
    for mod in flows:    
        for rec in flows[mod]:
            
            name = r.name_records[rec['id']]
            ntype = "file"

            print(name)
            bytesr = 0
            bytesw = 0

            # counters as dictionary:
            cdic = dict(zip(r.counters[mod]['counters'], rec['counters']))
            #print(cdic)
            
            
            # Aggregate custom counters: Here total bytes read/written across levels
            # Note: This may be inaccurate
            if "POSIX_BYTES_WRITTEN" in cdic:
                bytesw += cdic["POSIX_BYTES_WRITTEN"]
            if "POSIX_BYTES_READ" in cdic:
                bytesr += cdic["POSIX_BYTES_READ"]
                
            if "STDIO_BYTES_WRITTEN" in cdic:
                bytesw += cdic["STDIO_BYTES_WRITTEN"]
            if "STDIO_BYTES_WRITTEN" in cdic:
                bytesw += cdic["STDIO_BYTES_WRITTEN"]
                
            if "MPIIO_BYTES_WRITTEN" in cdic:
                bytesw += cdic["MPIIO_BYTES_WRITTEN"]
            if "MPIIO_BYTES_READ" in cdic:
                bytesr += cdic["MPIIO_BYTES_READ"]
                
                
            print("r", bytesr, "w", bytesw)

            rwmax = max(rwmax, bytesr, bytesw)
                
            
            
            if name in ['<STDIN>', '<STDOUT>', '<STDERR>']:
                # prefix stdX with logname, to prevent undesired relations
                fidx = aidx + "/" + name
            else:
                fidx = name

                
            # add/update node which represents file
            G.add_node(fidx, ntype=ntype, fill_color=node_color_map[ntype])

            
            # Add edges for read/write flows (also for zero-flows to associate lockfiles)
            G.add_edge(aidx, fidx, op="w", weight=bytesw+0.1)
            G.add_edge(fidx, aidx, op="r", weight=bytesr+0.1)
            
            # NOTE: Depending on the analysis: Do not add dataflows if 0 bytes read/written
            #if bytesw > 0:
            #    G.add_edge(aidx, fidx, op="w", weight=bytesw+0.1)   
            #if bytesr > 0:
            #    G.add_edge(fidx, aidx, op="r", weight=bytesr+0.1)

            print()
        
    print()
    print()

        
print("rwmax:", rwmax)

darshan-graph/pq_app_read_id71317_7-31-5657-2037904274838284930_55623.darshan
/home/pq/p/software/darshan-pydarshan/darshan-util/pydarshan/examples/darshan-graph/A
r 10000 w 0

<STDOUT>
r 0 w 696



darshan-graph/pq_app_read_id71344_7-31-5658-2037904274838284930_55623.darshan
/home/pq/p/software/darshan-pydarshan/darshan-util/pydarshan/examples/darshan-graph/C
r 2300 w 0

<STDOUT>
r 0 w 696



darshan-graph/pq_app_write_id71303_7-31-5657-2037904274838284930_55623.darshan
/home/pq/p/software/darshan-pydarshan/darshan-util/pydarshan/examples/darshan-graph/B
r 0 w 10000

<STDOUT>
r 0 w 756



darshan-graph/pq_app_write_id71310_7-31-5657-2037904274838284930_55623.darshan
/home/pq/p/software/darshan-pydarshan/darshan-util/pydarshan/examples/darshan-graph/Z
r 0 w 10000

<STDOUT>
r 0 w 756



darshan-graph/pq_app_write_id71296_7-31-5657-2037904274838284930_55623.darshan
/home/pq/p/software/darshan-pydarshan/darshan-util/pydarshan/examples/darshan-graph/A
r 0 w 10000

<STDOUT>
r 0 w 756



dar

### Visualize and Explore

Graph representations are especially useful for visualization. With holoviews fairly interactive graphs with tooltips on mouseover can be generated:

In [30]:
import holoviews as hv
import networkx as nx
from holoviews import opts
import hvplot.networkx as hvnx

try:
    import pygraphviz  # noqa
    from networkx.drawing.nx_agraph import graphviz_layout
except ImportError:
    try:
        import pydot  # noqa
        from networkx.drawing.nx_pydot import graphviz_layout
    except ImportError:
        raise ImportError("This example needs Graphviz and either PyGraphviz or pydot.")

        
# for a a more complex example for this layout to shine:       
#G = nx.balanced_tree(3, 5)

pos = graphviz_layout(G, prog='sfdp')
hvnx.draw(G, 
          pos, 
          node_size=250, 
          node_color=hv.dim('fill_color'), 
          edge_width=hv.dim('weight')/rwmax * 5 + 0.1, 
          alpha=0.66, 
          with_labels=False, 
          width=600, height=800)

# NOTE: Be sure to hover on nodes to display filename and other information ;)