In [1]:
import json
import random
from itertools import chain
from pathlib import Path
import os
import networkx as nx
import numpy as np
import pandas as pd
import pytest

import pyciemss
from pyciemss.integration_utils.result_processing import convert_to_output_format
from pyciemss.visuals import plots, vega



Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [2]:

# Label for this run: it selects the sine variant generated below and prefixes the output files.
name = "Phase_Change"
# Long-format accumulators that are filled while the curves are generated.
timepoint_list, sine_list, sample_list = [], [], []


def save_result(data, name, ref_ext):
    """Write ``data`` to ``output/<name>.<ref_ext>``, creating ``output/`` if needed.

    Args:
        data: Payload to write. Must be ``str`` when ``ref_ext`` is ``"svg"``
            (the file is opened in text mode) and bytes-like otherwise
            (the file is opened in binary mode).
        name: File name without extension.
        ref_ext: File extension without the leading dot; also selects the write mode.
    """
    _output_root = 'output'
    # On a fresh checkout the directory does not exist and open() would raise FileNotFoundError.
    os.makedirs(_output_root, exist_ok=True)
    mode = "w" if ref_ext == "svg" else "wb"
    with open(os.path.join(_output_root, f"{name}.{ref_ext}"), mode) as f:
        f.write(data)
# Passed to select_traces as `kmean`; the baseline curves and the exemplar
# parameter table below are only produced while this is False.
kmean = False
run_functions =  [name]  # alternative: ["Straight", "Amp_Change", "Freq_Change", "Phase_Change"]
n_cluster = 1  # passed to select_traces as `n_clusters`
# create sine functions
def return_sine_function_3(function_name):
    """Build one randomised curve and its fixed reference curve for a named variant.

    Both curves have the form ``x**0.25 + a*sin(b*x + c) + d``. The reference
    curve uses fixed parameters. The randomised curve always draws ``d`` from
    ``random.uniform(0, 2)`` and, depending on the variant, additionally scales
    ``a`` ("Amp_Change"), ``b`` ("Freq_Change") or ``c`` ("Phase_Change") by
    another ``random.uniform(0, 2)`` draw. "Straight" has no sine term at all.

    Args:
        function_name: One of "Straight", "Amp_Change", "Freq_Change",
            "Phase_Change".

    Returns:
        A 4-tuple ``(sine_function, sine_function_default,
        function_parameters_individual, function_parameters_default)``: the
        randomised callable, the reference callable, and their respective
        ``{'a', 'b', 'c', 'd'}`` parameter dicts.

    Raises:
        ValueError: If ``function_name`` is not one of the known variants.
    """
    # Within each branch the random draws happen left to right (variant
    # parameter first, then d), so seeded runs stay reproducible.
    if function_name == "Straight":
        a, b, c, d = 0, 0, 0, random.uniform(0, 2)
        a2, b2, c2, d2 = 0, 0, 0, 1
    elif function_name == "Amp_Change":
        a, b, c, d = 1*random.uniform(0, 2), .25, 2*np.pi, random.uniform(0, 2)
        a2, b2, c2, d2 = 1, .25, 2*np.pi, 1
    elif function_name == "Freq_Change":
        a, b, c, d = 1, .25*random.uniform(0, 2), 2*np.pi, random.uniform(0, 2)
        a2, b2, c2, d2 = 1, .25, 2*np.pi, 1
    elif function_name == "Phase_Change":
        a, b, c, d = 1, .25, 2*np.pi*random.uniform(0, 2), random.uniform(0, 2)
        a2, b2, c2, d2 = 1, .25, 2*np.pi, 1
    else:
        # Previously an unknown name fell through every `if` and failed later
        # with an UnboundLocalError that did not mention the bad argument.
        raise ValueError(
            "Unknown function_name {!r}; expected one of 'Straight', 'Amp_Change', "
            "'Freq_Change', 'Phase_Change'".format(function_name)
        )

    def sine_function(x):
        return np.power(x, .25) + a*np.sin(b*x + c) + d

    def sine_function_default(x):
        return np.power(x, .25) + a2*np.sin(b2*x + c2) + d2

    function_parameters_individual = {'a': a, 'b': b, 'c': c, 'd': d}
    function_parameters_default = {'a': a2, 'b': b2, 'c': c2, 'd': d2}

    return sine_function, sine_function_default, function_parameters_individual, function_parameters_default

# Generate 100 randomised curves (100 timepoints each) per requested variant and
# record the parameters that produced every curve.
# The former up-front `function_dataframe = pd.DataFrame(columns=...)` was dropped:
# it was unconditionally overwritten once the records had been collected.
function_list = []
j = 0
all_functions = {}

# create sine dataframes and save the parameters used
for funct_name in run_functions:
    for _ in range(100):
        j += 1  # running sample_id, unique across all variants
        timepoint_list.extend(range(100))
        sample_list.extend([j] * 100)
        (random_function,
         sine_function_default,
         function_parameters_individual,
         function_parameters_default) = return_sine_function_3(funct_name)

        function_parameters_individual['functions'] = funct_name
        function_parameters_individual['sample_id'] = j
        function_list.append(function_parameters_individual)
        sine_list.extend(random_function(x) for x in range(100))
    # The reference curve and parameters come from the last call of the inner loop.
    all_functions[funct_name] = sine_function_default
    function_parameters_default['functions'] = funct_name + " Default"
    function_parameters_default['sample_id'] = None

# keep track of parameters
function_dataframe = pd.DataFrame(function_list)
# Only the reference parameters of the last variant in run_functions are kept here.
function_default_dataframe = pd.DataFrame(function_parameters_default, index=[0])


sine_distribution = pd.DataFrame({'timepoint_id': timepoint_list,
                                  'sample_id': sample_list,
                                  'Example': sine_list})


traces_list = []

# get exemplary lines
# change kmean above to use clustering
examplary_line, trajectory_dict = pyciemss.visuals.trajectories.select_traces(
    sine_distribution,
    select_by_list=['mean', "variance", 'chaos', 'random'],
    kmean=kmean,
    n_clusters=n_cluster,
)
# sample_id of the exemplar used for each select_by / cluster combination
best_samples_cluster = examplary_line[['sample_id', 'select_by', 'cluster']].drop_duplicates()
best_samples_df_list = []
parameters_by_sample_id_list = []
# for each cluster create a new plot and a parameter summary
for cluster in list(np.unique(examplary_line['cluster'])):
    # for the distribution only use the sample_ids in the cluster
    cluster_sample_id = trajectory_dict[cluster.lower()]['cluster_sample_id']
    cluster_sine_distribution = sine_distribution[sine_distribution["sample_id"].isin(cluster_sample_id)]
    current_cluster_df = examplary_line[examplary_line['cluster'] == cluster]
    current_cluster_df = current_cluster_df.reset_index()
    # reshape to one column per select_by so the frame can be passed as traces
    examplary_lines_pivot = current_cluster_df[['examplary_line', 'envelope_mean', 'timepoint', 'select_by']].pivot_table(
        values="examplary_line", index=["timepoint", 'envelope_mean'], columns="select_by"
    )
    examplary_lines_pivot = examplary_lines_pivot.reset_index()
    examplary_lines_pivot = examplary_lines_pivot.rename(columns={'envelope_mean': 'Envelope_Mean'})
    if len(run_functions) == 1 and not kmean:
        # add the fixed-parameter reference curve as a baseline trace
        for function_name, function in all_functions.items():
            examplary_lines_pivot["Baseline_" + function_name.title()] = [function(x) for x in range(100)]
    # plot the traces along with the baselines
    schema = plots.trajectories(cluster_sine_distribution[['timepoint_id', 'sample_id', 'Example']], traces=examplary_lines_pivot)
    path = "{}_{}".format(name, cluster)
    image = plots.ipy_display(schema, format="PNG").data
    save_result(image, path, "png")
    # NOTE(review): the return value is discarded; inside a loop nothing is shown
    # unless ipy_display displays as a side effect — confirm.
    plots.ipy_display(schema, format="interactive")

    # average and std of the curve parameters within this cluster
    all_parameters_cluster = cluster_sine_distribution.merge(function_dataframe, how="inner", on="sample_id")[['sample_id', 'a', 'b', 'c', 'd']]
    # String aggregator names keep the current behaviour; passing np.mean / np.std
    # here makes pandas emit a FutureWarning.
    parameters_by_sample_id = all_parameters_cluster.groupby("sample_id").agg("mean")
    parameters_by_sample_id = parameters_by_sample_id.agg(["mean", "std"]).reset_index()
    parameters_by_sample_id['cluster'] = cluster
    parameters_by_sample_id['num'] = len(cluster_sample_id)
    parameters_by_sample_id_list.append(parameters_by_sample_id)
parameters_metrics = pd.concat(parameters_by_sample_id_list)
parameters_metrics.to_csv("output/{}_parameter_info.csv".format(name))
# Table with the sine parameters of the reference function and of the samples
# that were selected as exemplars (single-variant, non-clustered runs only).
if not kmean and len(run_functions) == 1:
    def _exemplar_number(select_by):
        """Return the part after the first "_" of a selection label, or 1 if there is none."""
        label_parts = str(select_by).split("_")
        return label_parts[1] if len(label_parts) >= 2 else 1

    def _selection_method(select_by):
        """Return the part of a selection label before the first "_"."""
        return str(select_by).split("_")[0]

    # parameters of the sample ids used as exemplars, followed by the default row
    exemplar_parameters = best_samples_cluster.merge(function_dataframe, on="sample_id", how="inner")
    all_clusters = pd.concat([exemplar_parameters, function_default_dataframe])
    all_clusters = all_clusters[['functions', 'select_by', 'a', 'b', 'c', 'd']]
    all_clusters = all_clusters.rename(
        columns={'functions': 'Function Name', 'select_by': "Selection Method"}
    )
    all_clusters["Function Name"] = all_clusters["Function Name"].apply(
        lambda label: label.replace("_", " ").title()
    )
    # The exemplar number must be derived before the method column is shortened.
    all_clusters["Examplar #"] = all_clusters["Selection Method"].apply(_exemplar_number)
    all_clusters["Selection Method"] = all_clusters["Selection Method"].apply(_selection_method)
    all_clusters = all_clusters.round(2)

    all_clusters.to_csv("output/{}.csv".format(name))

Value(False)



  parameters_by_sample_id = all_parameters_cluster.groupby("sample_id").agg(np.mean)
  parameters_by_sample_id  = parameters_by_sample_id.agg([np.mean, np.std]).reset_index()
  parameters_by_sample_id  = parameters_by_sample_id.agg([np.mean, np.std]).reset_index()


In [28]:

# Label for the random-walk run; used as the prefix of the saved plot files.
name = "random"
# Long-format accumulators for the generated walks.
timepoint_list, sine_list, sample_list = [], [], []
# Clustering options handed to select_traces.
kmean, n_clusters = False, 3

def save_result(data, name, ref_ext):
    """Write ``data`` to ``output/<name>.<ref_ext>``, creating ``output/`` if needed.

    Args:
        data: Payload to write. Must be ``str`` when ``ref_ext`` is ``"svg"``
            (the file is opened in text mode) and bytes-like otherwise
            (the file is opened in binary mode).
        name: File name without extension.
        ref_ext: File extension without the leading dot; also selects the write mode.
    """
    _output_root = 'output'
    # On a fresh checkout the directory does not exist and open() would raise FileNotFoundError.
    os.makedirs(_output_root, exist_ok=True)
    mode = "w" if ref_ext == "svg" else "wb"
    with open(os.path.join(_output_root, f"{name}.{ref_ext}"), mode) as f:
        f.write(data)

#https://stackoverflow.com/questions/71681417/generating-a-1d-random-walk-with-random-module
def randomwalk1D(n):
    """Return the positions of an ``n``-step one-dimensional random walk starting at 0.

    Every step moves the walker by -1 or +1, picked with ``random.choice``.
    The result lists the start position followed by the position after each
    step, i.e. ``n + 1`` entries (just ``[0]`` when ``n <= 0``).
    """
    step_choices = [-1, 1]
    path = [0]
    for _ in range(n):
        path.append(path[-1] + random.choice(step_choices))
    return path

# Start from empty accumulators so that re-running this part does not append twice.
timepoint_list, sample_list, sine_list = [], [], []

# 300 independent walks; a 99-step walk yields 100 positions, one per timepoint 0..99.
j = 0
for _ in range(300):
    j += 1
    walk = randomwalk1D(99)
    timepoint_list += range(100)
    sample_list += [j] * 100
    sine_list += walk

# Long format: one row per (sample_id, timepoint_id) with the walker position in 'Example'.
random_distribution = pd.DataFrame(
    {
        'timepoint_id': timepoint_list,
        'sample_id': sample_list,
        'Example': sine_list,
    }
)


# change kmean above to use clustering
examplary_line, trajectory_dict = pyciemss.visuals.trajectories.select_traces(
    random_distribution,
    kmean=kmean,
    n_clusters=n_clusters,
    select_by_list=['mean', 'granger', "variance", 'chaos'],
)


# One plot per cluster label found in the selected traces.
for cluster in np.unique(examplary_line['cluster']):
    # keep only the walks whose sample_id belongs to this cluster
    cluster_sample_id = trajectory_dict[cluster.lower()]['cluster_sample_id']
    in_cluster = random_distribution["sample_id"].isin(cluster_sample_id)
    cluster_random_distribution = random_distribution[in_cluster].reset_index()
    print(len(cluster_sample_id))

    # exemplary lines from that cluster
    current_cluster_df = examplary_line[examplary_line['cluster'] == cluster].reset_index()

    # one column per select_by so the frame can be used as traces
    examplary_lines_pivot = (
        current_cluster_df[['examplary_line', 'envelope_mean', 'timepoint', 'select_by']]
        .pivot_table(values="examplary_line", index=["timepoint", 'envelope_mean'], columns="select_by")
        .reset_index()
        .rename(columns={'envelope_mean': 'Baseline_Mean'})
    )

    # plot the traces along with the baselines
    schema = plots.trajectories(
        cluster_random_distribution[['timepoint_id', 'sample_id', 'Example']],
        traces=examplary_lines_pivot,
    )
    path = "{}_{}".format(name, cluster)
    image = plots.ipy_display(schema, format="PNG").data
    save_result(image, path, "png")
    plots.ipy_display(schema, format="interactive")




Granger Causality
number of lags (no zero) 10
ssr based F test:         F=1.1024  , p=0.3729  , df_denom=69, df_num=10
ssr based chi2 test:   chi2=14.3796 , p=0.1564  , df=10
likelihood ratio test: chi2=13.3402 , p=0.2053  , df=10
parameter F test:         F=1.1024  , p=0.3729  , df_denom=69, df_num=10

Granger Causality
number of lags (no zero) 10
ssr based F test:         F=1.8457  , p=0.0685  , df_denom=69, df_num=10
ssr based chi2 test:   chi2=24.0749 , p=0.0074  , df=10
likelihood ratio test: chi2=21.3341 , p=0.0189  , df=10
parameter F test:         F=1.8457  , p=0.0685  , df_denom=69, df_num=10

Granger Causality
number of lags (no zero) 10
ssr based F test:         F=1.4706  , p=0.1694  , df_denom=69, df_num=10
ssr based chi2 test:   chi2=19.1812 , p=0.0380  , df=10
likelihood ratio test: chi2=17.3879 , p=0.0662  , df=10
parameter F test:         F=1.4706  , p=0.1694  , df_denom=69, df_num=10

Granger Causality
number of lags (no zero) 10
ssr based F test:         F=1.4215  , 

In [None]:
def distributions():
    """Sample the SEIRHD NPI Type 1 petrinet model and return the converted result.

    Runs ``pyciemss.sample`` on the model fetched from the
    simulation-integration repository (30 samples, start time 0.0, end time
    100.0, logging step 1, Euler solver) and passes the ``"unprocessed_result"``
    entry through ``convert_to_output_format``.

    Returns:
        Whatever ``convert_to_output_format`` returns — presumably a pandas
        DataFrame, since the caller uses ``.head()`` and ``.columns`` on it.

    Raises:
        ValueError: If no entry of the unprocessed result has more than one
            dimension, so the number of timepoints cannot be determined.
    """
    model_1_path = (
            "https://raw.githubusercontent.com/DARPA-ASKEM/simulation-integration"
            "/main/data/models/SEIRHD_NPI_Type1_petrinet.json"
    )
    start_time = 0.0
    end_time = 100.0
    logging_step_size = 1
    num_samples = 30
    sample = pyciemss.sample(
        model_1_path,
        end_time,
        logging_step_size,
        num_samples,
        start_time=start_time,
        solver_method="euler",
    )["unprocessed_result"]

    # The second axis of the last multi-dimensional entry gives the timepoint count.
    num_timepoints = None
    for e in sample.values():
        if len(e.shape) > 1:
            num_timepoints = e.shape[1]
    if num_timepoints is None:
        # Previously this case surfaced as an UnboundLocalError in the return statement.
        raise ValueError(
            "Cannot determine the number of timepoints: no sampled entry has more than one dimension"
        )

    return convert_to_output_format(
        sample,
        timepoints=np.linspace(start_time, end_time, num_timepoints),
        time_unit="notional",
    )

# NOTE(review): this rebinds `distributions` from the function to its result, so
# the call only works right after the `def distributions()` above has been (re-)run.
distributions = distributions()
print(distributions.head())
print(distributions.columns)
# Wider selection with all four observables; it is overwritten by the next statement.
all_columns = ["timepoint_id", "sample_id", 'infected_observable_state',
       'exposed_observable_state', 'hospitalized_observable_state',
       'dead_observable_state']
# Selection actually used: the identifier columns plus the infected observable only.
all_columns = ["timepoint_id", "sample_id", 'infected_observable_state']

# Narrow the frame to the selected columns and show it as the cell output.
distributions = distributions[all_columns]
distributions

   timepoint_id  sample_id  persistent_beta_c_param  persistent_kappa_param  \
0             0          0                  0.71391                 0.09432   
1             1          0                  0.71391                 0.09432   
2             2          0                  0.71391                 0.09432   
3             3          0                  0.71391                 0.09432   
4             4          0                  0.71391                 0.09432   

   persistent_gamma_param  persistent_hosp_param  persistent_death_hosp_param  \
0                0.226554               0.081048                     0.091152   
1                0.226554               0.081048                     0.091152   
2                0.226554               0.081048                     0.091152   
3                0.226554               0.081048                     0.091152   
4                0.226554               0.081048                     0.091152   

   persistent_I0_param   D_state    E_

Unnamed: 0,timepoint_id,sample_id,infected_observable_state
0,0,0,19.609041
1,1,0,22.959484
2,2,0,24.065012
3,3,0,23.884687
4,4,0,22.994741
...,...,...,...
2965,94,29,0.000162
2966,95,29,0.000130
2967,96,29,0.000104
2968,97,29,0.000083


In [None]:
# Load a saved wide-format table and reshape it to long format with the columns
# 'timepoint_id', 'sample_id' and 'same_stat_state'.
# NOTE(review): in the saved run this cell failed with FileNotFoundError because
# the CSV below was not present.
sine_distribution = pd.read_csv("paper_sine_05/paper_sine_05_new_df.csv")
# NOTE(review): 'Unnamed: 0' is renamed to "100", presumably so that the leftover
# index column is melted as one more sample column — verify against the CSV.
sine_distribution = sine_distribution.rename(
            columns={
                'Unnamed: 0.1': "timepoint_id",
                'Unnamed: 0': "100",
            }
        )
# Every remaining column becomes a sample_id with its values in 'same_stat_state'.
sine_distribution = sine_distribution.melt(id_vars=["timepoint_id"], 
        var_name="sample_id", 
        value_name="same_stat_state")

traces_list = []   
 
#select_by "mean", "var", "granger"
# NOTE(review): select_traces is called here with `select_by=` and its second return
# value is concatenated as a frame, whereas the other cells pass `select_by_list=` and
# index the second return value like a dict — confirm which API version this cell targets.
for select_by in ['mean', 'var', 'granger']:
    examplary_line, mean_line_df = pyciemss.visuals.trajectories.select_traces(sine_distribution, select_by = select_by,  kmean=True)
    traces_list.append(examplary_line)


# Put the selected traces side by side; mean_line_df is the value from the last loop iteration.
new_df = pd.concat(traces_list, axis = 1)
combined_df = pd.concat([new_df, mean_line_df], axis =1)

schema = plots.trajectories(sine_distribution, traces = combined_df,  keep=".*_state")

# Persist the schema, then show it.
plots.save_schema(schema, "_schema.json")

plots.ipy_display(schema, format="interactive")

FileNotFoundError: [Errno 2] No such file or directory: 'paper_sine_05/paper_sine_05_new_df.csv'