In [42]:
import os
import json
import sympy
import requests

import pyciemss
import pyciemss.visuals.plots as plots
import pyciemss.visuals.vega as vega
import pyciemss.visuals.trajectories as trajectories

from mira.metamodel import *
from mira.modeling.amr.petrinet import AMRPetriNetModel, template_model_to_petrinet_json
from mira.sources.amr.petrinet import template_model_from_amr_json

In [43]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
MODEL_PATH = "https://raw.githubusercontent.com/DARPA-ASKEM/simulation-integration/main/data/models/"

model1 = os.path.join(MODEL_PATH, "SEIRD_base_model01_petrinet.json")
model2 = os.path.join(MODEL_PATH, "SEIRHD_base_model01_petrinet.json")
model3 = os.path.join(MODEL_PATH, "LV_sheep_foxes.json")

In [45]:
start_time = 0.0
end_time = 20.0
logging_step_size = 1
num_samples = 100

In [46]:
result1 = pyciemss.sample(model1, end_time, logging_step_size, num_samples, start_time=start_time)
display(result1['data'].head())
result1['data'][['timepoint_id', 'sample_id', 'S_state', 'I_state', 'R_state']].to_csv('sir.csv', index= False)


Unnamed: 0,timepoint_id,sample_id,timepoint_unknown,persistent_beta_param,persistent_death_param,persistent_gamma_param,persistent_I0_param,S_state,I_state,E_state,R_state,D_state,infected_observable_state,dead_observable_state
0,0,0,0.0,0.123341,0.00189,0.168237,8.854443,19339992.0,8.854443,40.0,0.0,0.0,8.854443,0.0
1,1,0,1.0,0.123341,0.00189,0.168237,8.854443,19339992.0,15.752962,32.537804,2.108264,0.003991,15.752962,0.003991
2,2,0,2.0,0.123341,0.00189,0.168237,8.854443,19339992.0,20.149252,27.327955,5.151094,0.009752,20.149252,0.009752
3,3,0,3.0,0.123341,0.00189,0.168237,8.854443,19339992.0,22.858267,23.648386,8.78119,0.016625,22.858267,0.016625
4,4,0,4.0,0.123341,0.00189,0.168237,8.854443,19339986.0,24.431896,21.00972,12.76456,0.024167,24.431896,0.024167


### Plot histogram per state combination


In [47]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from itertools import combinations
import networkx as nx
from pyciemss.visuals import plots, vega
bin_outliers = False
log = False

# loading SIR data
sir_dataset = pd.read_csv("sir.csv")


def process_dataset(sir_dataset, bin_outliers, log):
    """
    This function bins the columns in the dataset that includes "_state" and plots histograms for each column.
    
    Inputs:
    sir_dataset (pd.DataFrame): The dataset with states, timepoint_id and sample_id
    bin_outliers (bool): A boolean value indicating whether to bin outliers separately in their own bin.
    log (bool): A boolean value indicating whether to use a log scale for the bins.
    
    Outputs:
    sir_dataset (pd.DataFrame): The processed dataset with added bin columns.
    bin_dict (dict): A dictionary storing the min and max values of each bin per column.
    """
    # Determine the number of bins using Sturges' rule and the nu
    num_bins =  math.ceil(math.log2(sir_dataset.shape[0]))

    sir_dataset_state = sir_dataset.loc[:, sir_dataset.columns.str.contains('_state')]

    # dictionary to store the min and max values of each bin per column
    bin_dict = {}

    sir_dataset_output = sir_dataset.copy()

    for col in sir_dataset_state.columns:
        max_val = sir_dataset_output[col].max()
        min_val = sir_dataset_output[col].min()

        # Calculate the 5th and 95th percentiles
        p5, p95 = sir_dataset_output[col].quantile([0.05, 0.95])

        # get histogram in the logspace
        if log:
            if bin_outliers:
                # put all outliers in their own bin at the end (trying to avoid the massive histogram bin for the S_state)
                p5_log = np.log10(p5)
                p95_log = np.log10(p95)

                bin_edges = np.logspace(np.max([0.0, p5_log]), p95_log, num=num_bins)
                bin_edges = np.concatenate(([min_val], bin_edges, [max_val]))
            else:
                min_log= np.log10(min_val)
                max_log = np.log10(max_val)
                bin_edges = np.logspace(np.max([0.0, min_log]), max_log, num=num_bins)
        else:
            if bin_outliers:
                bin_edges = np.linspace(p5, p95, num=num_bins)
                bin_edges = np.concatenate(([min_val], bin_edges, [max_val]))
            else: 
                bin_edges = np.linspace(min_val, max_val, num=num_bins)

        # saving bins edges in dictionary by state and bin number
        bin_dict[col] = [{i:(bin_edges[i], bin_edges[i+1])} for i in range(len(bin_edges)-1)]
        # add column with the bin number per histogram
        sir_dataset_output[col + '_bin'] = sir_dataset_output[col].apply(lambda x: np.digitize(x, bin_edges))

    # create a new column that combines the bin information for each state
    sir_dataset_output['combined_bin'] = sir_dataset_output.apply(lambda row: ''.join([f"{col.replace('_state_bin','')}{int(row[col])}" for col in sir_dataset_output.columns if '_bin' in col]), axis=1)

    # create a mapping from original labels to shorter labels
    unique_bins = sir_dataset_output['combined_bin'].unique()
    label_mapping = {bin: f'bin_{i}' for i, bin in enumerate(unique_bins)}
    sir_dataset_output['short_bin'] = sir_dataset_output['combined_bin'].map(label_mapping)

    # save the mapping to a CSV file
    pd.DataFrame(list(label_mapping.items()), columns=['original_label', 'short_label']).to_csv('label_mapping.csv', index=False)

    return sir_dataset_output, bin_dict




In [48]:

def remove_consecutive_duplicates(lst):
    if not lst:  # if the list is empty, return it
        return lst
    new_lst = [lst[0]]  # add the first item of lst to new_lst
    for item in lst[1:]:  # iterate over the rest of lst
        if item != new_lst[-1]:  # if the current item is not the same as the last item in new_lst
            new_lst.append(item)  # add it to new_lst
    return new_lst

def plot_paths(binned_data, *, n=None, fig_width=10):
    """
    Turn the dataset column into a list based on sample_id
    Remove duplicates
    Input: dataframe with sample_id and bins
    Outputs: List of bins as words
    """
    bins_list = []
    for sample_id in binned_data['sample_id'].unique()[:n]:
        sample_df = binned_data[binned_data['sample_id'] == sample_id]
        bins = sample_df['combined_bin'].values
        bins_list.append(' '.join(remove_consecutive_duplicates(bins.tolist())))

    return bins_list



In [49]:
sir_dataset_output, bin_dict= process_dataset(sir_dataset, bin_outliers = False, log = True)
bins_list =  plot_paths(sir_dataset_output)
len(bins_list)
print(bins_list)

['S11I4R0 S11I4R2 S11I5R3 S11I5R4 S11I5R5 S11I5R6 S11I5R7', 'S11I4R0 S11I5R2 S11I5R3 S11I5R4 S11I6R4 S11I6R5 S11I6R6 S11I7R6 S11I7R7 S11I8R8 S10I8R8 S10I9R9 S9I9R9 S8I10R10 S7I10R10', 'S11I3R0 S11I4R1 S11I5R2 S11I5R3 S11I6R4 S11I6R5 S11I7R6 S11I7R7 S11I8R7 S11I8R8 S10I8R8 S10I9R9 S9I9R9', 'S11I4R0 S11I5R2 S11I5R3 S11I6R4 S11I6R5 S11I6R6 S11I7R6 S11I7R7 S11I8R7 S11I8R8 S10I8R8 S10I9R8 S10I9R9 S9I9R9 S8I10R10 S7I10R10 S6I11R11', 'S11I4R0 S11I4R3 S11I5R4 S11I5R5 S11I5R6 S11I6R7 S11I6R8 S11I7R8 S11I7R9 S10I8R9 S10I8R10', 'S11I4R0 S11I5R2 S11I5R4 S11I6R5 S11I6R6 S11I7R7 S11I7R8 S11I8R8 S10I8R8 S10I8R9 S10I9R9 S9I9R10 S8I9R10 S8I10R10 S7I10R11 S5I10R11', 'S11I1R0 S11I4R1 S11I4R3 S11I4R4 S11I5R5 S11I4R5 S11I4R6 S11I3R7', 'S11I2R0 S11I4R1 S11I4R3 S11I4R4 S11I4R5 S11I4R6 S11I3R7', 'S11I4R0 S11I4R2 S11I5R3 S11I5R4 S11I6R5 S11I6R6 S11I7R6 S11I7R7 S11I8R8 S10I8R8 S10I8R9 S10I9R9 S9I9R9 S9I9R10 S8I10R10 S7I10R10 S6I10R11 S5I11R11', 'S11I4R0 S11I5R3 S11I5R4 S11I5R5 S11I5R6 S11I6R6 S11I6R7 S11I6R8 S1

  min_log= np.log10(min_val)


In [50]:
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
import sententree
import sententree_vega
import networkx as nx
import vega
import vl_convert

%aimport sententree
%aimport sententree_vega
%aimport vega

In [52]:
logging_keywords = ["TRACE", "DEBUG", "INFO", "LOG"]
def contains_any(line, keywords):
    return any([w in line for w in keywords])

def clean_line(l):
    no_date = " ".join(l.split(" ")[1:]).strip()
    return no_date.replace("::", "-").replace(":", "-")


In [53]:
# %%prun -D build_sentence.profile


G = sententree.build_sententree(bins_list,
                                min_support = 3,
                                num_exemplars=3,
                                tag_with=sententree.tag_numbers_with_words_words_with_occurance)
schema = sententree_vega.vega_sententree(G, w=800, h=300)

with open("../figures/bin_example.png", "wb") as f:
    png = vl_convert.vega_to_png(schema, scale=4)
    f.write(png)

vega.display(schema, format="interactive")


ExecutableNotFound: failed to execute PosixPath('dot'), make sure the Graphviz executables are on your systems' PATH