In [1]:
import os
import json
import sympy
import requests

import pyciemss
import pyciemss.visuals.plots as plots
import pyciemss.visuals.vega as vega
import pyciemss.visuals.trajectories as trajectories

from mira.metamodel import *
from mira.modeling.amr.petrinet import AMRPetriNetModel, template_model_to_petrinet_json
from mira.sources.amr.petrinet import template_model_from_amr_json

In [2]:
%load_ext autoreload
%autoreload 1

In [3]:
MODEL_PATH = "https://raw.githubusercontent.com/DARPA-ASKEM/simulation-integration/main/data/models/"

model1 = os.path.join(MODEL_PATH, "SEIRD_base_model01_petrinet.json")
model2 = os.path.join(MODEL_PATH, "SEIRHD_base_model01_petrinet.json")
model3 = os.path.join(MODEL_PATH, "LV_sheep_foxes.json")

In [4]:
start_time = 0.0
end_time = 10.0
logging_step_size = 1
num_samples = 100
n = 10

In [5]:
result1 = pyciemss.sample(model1, end_time, logging_step_size, num_samples, start_time=start_time)
display(result1['data'].head())
result1['data'][['timepoint_id', 'sample_id', 'S_state', 'I_state', "R_state"]].to_csv('sir.csv', index= False) # 


Unnamed: 0,timepoint_id,sample_id,timepoint_unknown,persistent_beta_param,persistent_death_param,persistent_gamma_param,persistent_I0_param,S_state,I_state,E_state,R_state,D_state,infected_observable_state,dead_observable_state
0,0,0,0.0,0.757549,0.002142,0.441039,6.052447,19339994.0,6.052447,40.0,0.0,0.0,6.052447,0.0
1,1,0,1.0,0.757549,0.002142,0.441039,6.052447,19339988.0,11.608709,37.320126,3.999501,0.008584,11.608709,0.008584
2,2,0,2.0,0.757549,0.002142,0.441039,6.052447,19339978.0,15.060238,38.123539,9.915862,0.021281,15.060238,0.021281
3,3,0,3.0,0.757549,0.002142,0.441039,6.052447,19339966.0,17.655766,40.718361,17.132839,0.03677,17.655766,0.03677
4,4,0,4.0,0.757549,0.002142,0.441039,6.052447,19339952.0,19.971563,44.360088,25.416597,0.054548,19.971563,0.054548


### Plot histogram per state combination


In [6]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from itertools import combinations
import networkx as nx
from pyciemss.visuals import plots, vega
bin_outliers = False
log = False

# loading SIR data
sir_dataset = pd.read_csv("sir.csv")


def process_dataset(sir_dataset, bin_outliers, log):
    """
    This function bins the columns in the dataset that includes "_state" and plots histograms for each column.
    
    Inputs:
    sir_dataset (pd.DataFrame): The dataset with states, timepoint_id and sample_id
    bin_outliers (bool): A boolean value indicating whether to bin outliers separately in their own bin.
    log (bool): A boolean value indicating whether to use a log scale for the bins.
    
    Outputs:
    sir_dataset (pd.DataFrame): The processed dataset with added bin columns.
    bin_dict (dict): A dictionary storing the min and max values of each bin per column.
    """
    # Determine the number of bins using Sturges' rule and the nu
    num_bins =  math.ceil(math.log2(sir_dataset.shape[0]))

    sir_dataset_state = sir_dataset.loc[:, sir_dataset.columns.str.contains('_state')]

    # dictionary to store the min and max values of each bin per column
    bin_dict = {}

    sir_dataset_output = sir_dataset.copy()

    for col in sir_dataset_state.columns:
        max_val = sir_dataset_output[col].max()
        min_val = sir_dataset_output[col].min()

        # Calculate the 5th and 95th percentiles
        p5, p95 = sir_dataset_output[col].quantile([0.05, 0.95])

        # get histogram in the logspace
        if log:
            if bin_outliers:
                # put all outliers in their own bin at the end (trying to avoid the massive histogram bin for the S_state)
                p5_log = np.log10(p5)
                p95_log = np.log10(p95)

                bin_edges = np.logspace(np.max([0.0, p5_log]), p95_log, num=num_bins)
                bin_edges = np.concatenate(([min_val], bin_edges, [max_val]))
            else:
                min_log= np.log10(min_val)
                max_log = np.log10(max_val)
                bin_edges = np.logspace(np.max([0.0, min_log]), max_log, num=num_bins)
        else:
            if bin_outliers:
                bin_edges = np.linspace(p5, p95, num=num_bins)
                bin_edges = np.concatenate(([min_val], bin_edges, [max_val]))
            else: 
                bin_edges = np.linspace(min_val, max_val, num=num_bins)

        # saving bins edges in dictionary by state and bin number
        bin_dict[col] = [{i:(bin_edges[i], bin_edges[i+1])} for i in range(len(bin_edges)-1)]
        # add column with the bin number per histogram
        sir_dataset_output[col + '_bin'] = sir_dataset_output[col].apply(lambda x: np.digitize(x, bin_edges))

    # create a new column that combines the bin information for each state
    sir_dataset_output['combined_bin'] = sir_dataset_output.apply(lambda row: '_'.join([f"{col.replace('_state_bin','')}_{int(row[col])}" for col in sir_dataset_output.columns if '_bin' in col]), axis=1)

    # create a mapping from original labels to shorter labels
    unique_bins = sir_dataset_output['combined_bin'].unique()
    label_mapping = {bin: f'bin_{i}' for i, bin in enumerate(unique_bins)}
    sir_dataset_output['short_bin'] = sir_dataset_output['combined_bin'].map(label_mapping)

    # save the mapping to a CSV file
    pd.DataFrame(list(label_mapping.items()), columns=['original_label', 'short_label']).to_csv('label_mapping.csv', index=False)

    return sir_dataset_output, bin_dict




In [7]:

def remove_consecutive_duplicates(lst):
    if not lst:  # if the list is empty, return it
        return lst
    new_lst = [lst[0]]  # add the first item of lst to new_lst
    for item in lst[1:]:  # iterate over the rest of lst
        if item != new_lst[-1]:  # if the current item is not the same as the last item in new_lst
            new_lst.append(item)  # add it to new_lst
    return new_lst

def plot_paths(binned_data, *, n=None, fig_width=10):
    bins_list = []
    for sample_id in binned_data['sample_id'].unique()[:n]:
        sample_df = binned_data[binned_data['sample_id'] == sample_id]
        bins = sample_df['combined_bin'].values
        bins_list.append(' '.join(bins.tolist()))

    return bins_list



In [8]:
sir_dataset_output, bin_dict= process_dataset(sir_dataset, bin_outliers = False, log = True)
bins_list =  plot_paths(sir_dataset_output)[:10]


  min_log= np.log10(min_val)


In [9]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import sententree
import sententree_vega
import networkx as nx
import vega
import vl_convert

%aimport sententree
%aimport sententree_vega
%aimport vega

In [11]:
logging_keywords = ["TRACE", "DEBUG", "INFO", "LOG"]
def contains_any(line, keywords):
    return any([w in line for w in keywords])

def clean_line(l):
    no_date = " ".join(l.split(" ")[1:]).strip()
    return no_date.replace("::", "-").replace(":", "-")


In [12]:
# Install Graphviz (run this in a terminal if 'brew' is not accessible from Jupyter)
# !brew install graphviz

# Update the PATH environment variable
import os
os.environ['PATH'] += os.pathsep + '/usr/local/bin'

# Verify PATH update and executable
# The following should print the path to the 'dot' executable
!which dot

# Confirm that the 'dot' command works
# The following should print the version of Graphviz installed
!dot -V

/usr/local/bin/dot
dot - graphviz version 12.0.0 (20240704.0754)


In [26]:
# %%prun -D build_sentence.profile

cluster = [clean_line(l) for l in bins_list if contains_any(l, logging_keywords)][:200]

G = sententree.build_sententree(bins_list,
                                min_support = 1,
                                num_exemplars=3,
                                tag_with=sententree.tag_words_with_index)



In [24]:
edges = G.edges(data=True)
edges

OutEdgeDataView([('S_10_I_5_R_0_0', 'S_10_I_6_R_5_1', {'weight': 1}), ('S_10_I_5_R_0_0', 'S_10_I_5_R_4_1', {'weight': 1}), ('S_10_I_5_R_0_0', 'S_10_I_6_R_4_1', {'weight': 1}), ('S_10_I_5_R_0_0', 'S_10_I_6_R_3_1', {'weight': 2}), ('S_10_I_6_R_5_1', 'S_10_I_6_R_6_2', {'weight': 1}), ('S_10_I_6_R_6_2', 'S_9_I_6_R_7_3', {'weight': 3}), ('S_9_I_6_R_7_3', 'S_9_I_6_R_8_4', {'weight': 2}), ('S_9_I_6_R_7_3', 'S_9_I_6_R_7_4', {'weight': 1}), ('S_9_I_6_R_8_4', 'S_9_I_6_R_9_5', {'weight': 1}), ('S_9_I_6_R_8_4', 'S_9_I_6_R_8_5', {'weight': 1}), ('S_9_I_6_R_9_5', 'S_9_I_6_R_9_6', {'weight': 1}), ('S_9_I_6_R_9_6', 'S_8_I_7_R_9_7', {'weight': 2}), ('S_8_I_7_R_9_7', 'S_8_I_7_R_10_8', {'weight': 2}), ('S_8_I_7_R_9_7', 'S_8_I_7_R_9_8', {'weight': 1}), ('S_8_I_7_R_10_8', 'S_8_I_7_R_10_9', {'weight': 2}), ('S_8_I_7_R_10_9', 'S_7_I_7_R_10_10', {'weight': 2}), ('S_8_I_7_R_10_9', 'S_8_I_7_R_10_10', {'weight': 1}), ('S_10_I_5_R_4_1', 'S_10_I_6_R_6_2', {'weight': 1}), ('S_9_I_6_R_8_5', 'S_9_I_6_R_9_6', {'weight

In [25]:
nodes = G.nodes(data =True)
nodes

NodeDataView({'S_10_I_5_R_0_0': {'count': 1}, 'S_10_I_6_R_5_1': {'count': 1}, 'S_10_I_6_R_6_2': {'count': 1}, 'S_9_I_6_R_7_3': {'count': 1}, 'S_9_I_6_R_8_4': {'count': 1}, 'S_9_I_6_R_9_5': {'count': 1}, 'S_9_I_6_R_9_6': {'count': 1}, 'S_8_I_7_R_9_7': {'count': 1}, 'S_8_I_7_R_10_8': {'count': 1}, 'S_8_I_7_R_10_9': {'count': 1}, 'S_7_I_7_R_10_10': {'count': 1}, 'S_10_I_5_R_4_1': {'count': 1}, 'S_9_I_6_R_8_5': {'count': 1}, 'S_10_I_6_R_4_1': {'count': 1}, 'S_9_I_6_R_7_4': {'count': 1}, 'S_9_I_7_R_8_5': {'count': 1}, 'S_9_I_7_R_9_6': {'count': 1}, 'S_8_I_7_R_9_8': {'count': 1}, 'S_7_I_8_R_10_9': {'count': 1}, 'S_7_I_8_R_10_10': {'count': 1}, 'S_10_I_6_R_3_1': {'count': 1}, 'S_10_I_6_R_5_2': {'count': 1}, 'S_10_I_6_R_6_3': {'count': 1}, 'S_9_I_7_R_7_4': {'count': 1}, 'S_9_I_7_R_7_5': {'count': 1}, 'S_9_I_7_R_8_6': {'count': 1}, 'S_9_I_7_R_8_7': {'count': 1}, 'S_9_I_7_R_9_8': {'count': 1}, 'S_8_I_7_R_9_9': {'count': 1}, 'S_8_I_7_R_9_10': {'count': 1}, 'S_10_I_6_R_4_2': {'count': 1}, 'S_9_I_7

In [32]:
import networkx as nx
from typing import List, Dict, Tuple, Any, Optional
import random
import json

def prepare_vega_data(
    graph: nx.Graph,
    y_axis_attributes
) -> Dict[str, List[Dict[str, Any]]]:
    """Prepare the graph data in a format suitable for Vega plotting."""

    def extract_position(node_id: str, coord: str) -> Tuple[float, float]:
        components = node_id.split('_')
        x_base = int(components[-1])
        y_base = int(components[components.index(coord) + 1])
        return x_base, y_base

    graph = nx.convert_node_labels_to_integers(graph, label_attribute="original_label")
    gjson = nx.json_graph.node_link_data(graph)

    nodes = []
    for attr_id, y_axis_attr in enumerate(y_axis_attributes):
        for index, item in enumerate(gjson["nodes"]):
            node_id = item["original_label"]
            x, y = extract_position(node_id, y_axis_attr)
            nodes.append({
                "id": node_id,
                "x": x,
                "y": y,
                "count": item.get("count", 1),
                "exemplar": item.get("exemplar", False),
                "graph": attr_id + 1,
                "label_id": y_axis_attr + "_" + str(x)
            })

    # Prepare edges
    edges = [
        {
            "source": gjson["nodes"][item["source"]]["original_label"],
            "target": gjson["nodes"][item["target"]]["original_label"],
            "weight": item.get("weight", 1)
        }
        for item in gjson["links"]
    ]

    return {"nodes": nodes, "edges": edges}

# Example usage
import random


# Function to convert the graph
def convert_graph(G) -> Tuple[Dict[str, Dict[str, Any]], List[Tuple[str, str, int]]]:
    # Prepare the nodes dictionary
    nodes = {
        node: {
            "count": data.get("count", 1),
            "exemplar": data.get("exemplar", False),
            "graph": data.get("graph", 1)
        }
        for node, data in G.nodes(data=True)
    }

    # Prepare the edges list
    edges = [
        (u, v, data.get("weight", 1))
        for u, v, data in G.edges(data=True)
    ]

    return nodes, edges


# List of y-axis attributes to cycle through
y_axis_attributes = ['I', 'S', "R"]

# Prepare the data using the list of attributes
vega_data = prepare_vega_data(G, y_axis_attributes)
print("Data prepared:")
print(json.dumps(vega_data, indent=2))


Data prepared:
{
  "nodes": [
    {
      "id": "S_10_I_5_R_0_0",
      "x": 0,
      "y": 5,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_0"
    },
    {
      "id": "S_10_I_6_R_5_1",
      "x": 1,
      "y": 6,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_1"
    },
    {
      "id": "S_10_I_6_R_6_2",
      "x": 2,
      "y": 6,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_2"
    },
    {
      "id": "S_9_I_6_R_7_3",
      "x": 3,
      "y": 6,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_3"
    },
    {
      "id": "S_9_I_6_R_8_4",
      "x": 4,
      "y": 6,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_4"
    },
    {
      "id": "S_9_I_6_R_9_5",
      "x": 5,
      "y": 6,
      "count": 1,
      "exemplar": false,
      "graph": 1,
      "label_id": "I_5"
    },
    {
      "id": "S_9_I_6_R_9_6",
  

In [33]:
with open("/Users/oost464/Library/CloudStorage/OneDrive-PNNL/Desktop/projects/pyciemss/SentenTree_for_Logs/code/bins_sententree.vg.json", "r") as f:
    schema = json.load(f)


schema["data"] = vega.replace_named_with(
    schema["data"], "nodes", ["values"], vega_data['nodes']
)

schema["data"] = vega.replace_named_with(
    schema["data"], "edges", ["values"], vega_data['edges']
    )
print(schema)

with open("bin_example.png", "wb") as f:
    png = vl_convert.vega_to_png(schema, scale=4)
    f.write(png)

vega.display(schema, format="interactive")


{'$schema': 'https://vega.github.io/schema/vega/v5.json', 'width': 1200, 'height': 800, 'padding': 5, 'signals': [{'name': 'numGraphs', 'value': 3}, {'name': 'graphWidth', 'update': 'width / numGraphs'}, {'name': 'graphHeight', 'value': 400}, {'name': 'hoveredNodeID', 'value': None, 'on': [{'events': 'symbol:mouseover', 'update': 'datum.id'}, {'events': 'symbol:mouseout', 'update': 'null'}]}], 'scales': [{'name': 'x', 'domain': [0, 10], 'range': [0, {'signal': 'graphWidth'}]}, {'name': 'y', 'domain': [0, 10], 'range': [0, {'signal': 'graphHeight'}]}, {'name': 'color', 'type': 'ordinal', 'domain': [True, False], 'range': ['red', 'blue']}], 'data': [{'name': 'nodes', 'values': [{'id': 'S_10_I_5_R_0_0', 'x': 0, 'y': 5, 'count': 1, 'exemplar': False, 'graph': 1, 'label_id': 'I_0'}, {'id': 'S_10_I_6_R_5_1', 'x': 1, 'y': 6, 'count': 1, 'exemplar': False, 'graph': 1, 'label_id': 'I_1'}, {'id': 'S_10_I_6_R_6_2', 'x': 2, 'y': 6, 'count': 1, 'exemplar': False, 'graph': 1, 'label_id': 'I_2'}, {'i