In [1]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell executes,
# so edits to the local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from copy import deepcopy
from collections import defaultdict

import sympy
import pandas as pd

# BIOMD0000000955 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7175834/
# BIOMD0000000956 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7382213/
# BIOMD0000000957 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7104073/
# BIOMD0000000958 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7184012/
# BIOMD0000000960 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7394373/

# BioModels identifiers of the models evaluated in this notebook (source papers are linked above).
# Used to filter the ground truth rows and to drive the per-model comparison loops.
model_ids = [
    "BIOMD0000000955",
    "BIOMD0000000956",
    "BIOMD0000000957",
    "BIOMD0000000958",
    "BIOMD0000000960",
]


# Old TA1 extraction results, published as a workbook with one worksheet per model
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ2G-ZFSotS2qo94mnTWCDvj9Y49ai-9O61DA7940sPYynEdBXq2cT2-Wl3nNldIb3gkpbPFaTFY2PJ/pub?output=xlsx"
xls = pd.ExcelFile(url)

# Worksheet name -> DataFrame, with the "grounding" column turned from text into Python objects
sheets = {}
for sheet_name in xls.sheet_names:
    sheet = pd.read_excel(xls, sheet_name)
    # NOTE(review): eval() executes whatever expression is stored in the remotely hosted
    # spreadsheet cell. If the cells only ever hold plain list/dict literals,
    # ast.literal_eval would be the safe drop-in replacement - confirm before switching.
    sheet["grounding"] = sheet["grounding"].map(eval)
    sheets[sheet_name] = sheet

# Display the worksheet names that were loaded
set(sheets)

{'BIOMD0000000955',
 'BIOMD0000000956',
 'BIOMD0000000957',
 'BIOMD0000000958',
 'BIOMD0000000960'}

In [3]:
# "dd_ta1" stores only the compartment identifiers of each model; the TA1 evaluation compares identifiers alone.
# "dd_llm" stores the compartment identifiers together with their contexts; the LLM grounding evaluation compares both.
# Curated ground truth groundings, one row per model concept
ground_truth_df = pd.read_csv(
    "../mira/resources/mapped_biomodels_groundings.csv", sep=","
)

# dd_llm: model id -> {(curie_tuple, context_tuple): concept_dict}
#   (identifier + context comparison against the LLM-grounded template models)
# dd_ta1: model id -> set of CURIE strings
#   (identifier-only comparison against the TA1 extraction results)
dd_llm = defaultdict(dict)
dd_ta1 = defaultdict(set)

# Keep only the rows of the models under evaluation instead of skipping inside the loop
evaluated_rows = ground_truth_df[ground_truth_df["model"].isin(model_ids)]

for _, entry in evaluated_rows.iterrows():
    model_id = entry["model"]

    # Insertion order (name, context, identifier) is kept on purpose: this dict is
    # printed as-is when the missed identifiers are reported later in the notebook.
    concept_dict = {"name": entry["name"], "context": set()}

    # Identifier: only the part before the first "/" is used. Rows without a mapped
    # identifier get no "identifier" entry and a None curie tuple in their key.
    curie_tuple = None
    if not pd.isna(entry["mapped_identifiers"]):
        curie = entry["mapped_identifiers"].split("/")[0]
        concept_dict["identifier"] = curie
        dd_ta1[model_id].add(curie)
        split_curie = curie.split(":")
        curie_tuple = (split_curie[0], split_curie[1])

    # Contexts: "|"-separated "key=value" items; everything after the first "/" of the
    # value is dropped. One loop handles every item, including the first.
    if not pd.isna(entry["mapped_context"]):
        for context_str in entry["mapped_context"].split("|"):
            context_kv_pair = context_str.split("=")
            concept_dict["context"].add(
                (context_kv_pair[0], context_kv_pair[1].split("/")[0])
            )

    # Sort the context pairs so the key is deterministic: iterating a set of strings can
    # yield a different order from one interpreter run to the next.
    # NOTE(review): presumably this also matches the ordering of the keys returned by
    # TemplateModel.get_concepts_map() - confirm against mira.
    # Rows of one model that share identifier and context overwrite each other here.
    key = (curie_tuple, tuple(sorted(concept_dict["context"])))
    dd_llm[model_id][key] = concept_dict

# Freeze into plain dicts so a lookup of an unknown model raises instead of creating an entry
dd_llm = dict(dd_llm)
dd_ta1 = dict(dd_ta1)

In [4]:
# TA1 extraction result table using only identifiers.
# The ground truth table lists multiple concepts that share the same identifier but differ in their contexts.
# Since the TA1 extraction results only list identifiers, we compare identifiers alone and do not factor in contexts,
# which leads to a lower number of listed concepts.

# One summary row per TA1 worksheet: (model, #ground truth ids, #matched ids, ratio)
ta1_rows = []
for model_id, sheet in sheets.items():
    # Collect the "id" of every grounding of every row into one set of CURIE strings
    sheet_curies = set()
    for groundings in sheet["grounding"]:
        sheet_curies.update(grounding["id"] for grounding in groundings)

    ground_truth_curies = dd_ta1[model_id]
    n_ground_truth = len(ground_truth_curies)
    n_intersection = len(sheet_curies.intersection(ground_truth_curies))

    # The ratio is matched / ground truth, i.e. the share of ground truth identifiers
    # that TA1 recovered, even though the column is labelled "precision".
    ratio = round(n_intersection / n_ground_truth, 2)
    ta1_rows.append((model_id, n_ground_truth, n_intersection, ratio))

print("The way to interpret the precision is the percent")
pd.DataFrame(ta1_rows, columns=["model", "Concepts", "TA-1", "precision"])

The way to interpret the precision is the percent


Unnamed: 0,model,Concepts,TA-1,precision
0,BIOMD0000000955,4,0,0.0
1,BIOMD0000000956,3,0,0.0
2,BIOMD0000000957,3,0,0.0
3,BIOMD0000000958,5,1,0.2
4,BIOMD0000000960,5,0,0.0


In [5]:
from mira.openai import OpenAIClient
from mira.sources.sympy_ode.llm_util import (
    execute_template_model_from_sympy_odes,
    image_file_to_odes_str,
)
from IPython.display import Image

# Single client instance that is handed to the ODE extraction / template model helpers
# in the cells below through their `client=` argument.
# NOTE(review): presumably needs OpenAI credentials configured in the environment - confirm in mira.openai.
client = OpenAIClient()

In [6]:
# BIOMD0000000955
# Full pipeline option (ODE image -> ODE string through the LLM). It only sometimes works: known failure
# points are the returned code using "lambda" symbols or not defining the "theta" parameter.
# ode_str0 = image_file_to_odes_str('images/BIOMD0000000955_odes.png', client=client)

# Stored option: output returned from the llm grounding service, no modification done to the ode string.
# NOTE(review): `lambda_` / "lambda" is listed twice in the parameter declaration inside the string. Both
# sides of that assignment have 18 entries, so the unpacking still works; left exactly as returned.
ode_str0 = """
import sympy 

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, I, D, A, R, T, H, E = sympy.symbols("S I D A R T H E", cls=sympy.Function)

# Define the parameters
alpha, beta, gamma, delta, epsilon, zeta, lambda_, eta, rho, theta, mu, kappa, nu, xi, sigma, tau, psi, lambda_ = sympy.symbols("alpha beta gamma delta epsilon zeta lambda eta rho theta mu kappa nu xi sigma tau psi lambda")

odes = [
    sympy.Eq(S(t).diff(t), - S(t) * (alpha * I(t) + beta * D(t) + gamma * A(t) + delta * R(t))),
    sympy.Eq(I(t).diff(t), S(t) * (alpha * I(t) + beta * D(t) + gamma * A(t) + delta * R(t)) - (epsilon + zeta + lambda_) * I(t)),
    sympy.Eq(D(t).diff(t), epsilon * I(t) - eta * D(t) - rho * D(t)),
    sympy.Eq(A(t).diff(t), zeta * I(t) - (theta + mu + kappa) * A(t)),
    sympy.Eq(R(t).diff(t), eta * D(t) + theta * A(t) - (nu + xi) * R(t)),
    sympy.Eq(T(t).diff(t), mu * A(t) + nu * R(t) - (sigma + tau) * T(t)),
    sympy.Eq(H(t).diff(t), lambda_ * I(t) + rho * D(t) + kappa * A(t) + xi * R(t) + sigma * T(t)),
    sympy.Eq(E(t).diff(t), tau * T(t))
]
"""

# Turn the stored ODE string into a template model, with grounding attempted (attempt_grounding=True)
tm0 = execute_template_model_from_sympy_odes(
    ode_str=ode_str0, attempt_grounding=True, client=client
)

In [7]:
# BIOMD0000000956
# Full pipeline option (ODE image -> ODE string through the LLM). It only sometimes works: the known
# failure is the returned code not defining the "N" parameter.
# ode_str1 = image_file_to_odes_str('images/BIOMD0000000956_odes.png', client=client)

# Stored option: output returned from the llm grounding service; the N parameter had to be added by hand.
# NOTE(review): inside the string, N is first declared as a plain symbol and then rebound to
# S(t) + E(t) + I(t) + R(t), so the equations divide by that sum rather than by a free parameter.
ode_str1 = """
import sympy

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, R = sympy.symbols("S E I R", cls=sympy.Function)

# Define the parameters
beta, gamma, alpha, N = sympy.symbols("beta gamma alpha N")

N = S(t) + E(t) + I(t) + R(t)

odes = [
    sympy.Eq(S(t).diff(t), -beta * S(t) * I(t) / N),
    sympy.Eq(E(t).diff(t), beta * S(t) * I(t) / N - alpha * E(t)),
    sympy.Eq(I(t).diff(t), alpha * E(t) - gamma * I(t)),
    sympy.Eq(R(t).diff(t), gamma * I(t))
]"""

# Turn the stored ODE string into a template model, with grounding attempted (attempt_grounding=True)
tm1 = execute_template_model_from_sympy_odes(
    ode_str=ode_str1, attempt_grounding=True, client=client
)

In [8]:
# BIOMD0000000957: full pipeline run live - the ODE string is extracted from the image by the LLM
# and then passed straight on to build the template model (grounding attempted).
# NOTE(review): unlike the other models no stored ODE string is used here, so this cell depends on
# the extraction call succeeding; presumably its output can differ between runs - confirm.
ode_str2 = image_file_to_odes_str(
    "images/BIOMD0000000957_odes.png", client=client
)

tm2 = execute_template_model_from_sympy_odes(
    ode_str=ode_str2, attempt_grounding=True, client=client
)

In [9]:
# BIOMD0000000958
# Stored option: output returned from the llm grounding service; the N, rho_1 and gamma_r parameters
# had to be added by hand to the parameter declaration.
ode_str3 = """
import sympy 

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, P, A, H, R, F = sympy.symbols("S E I P A H R F", cls=sympy.Function)

# Define the parameters
beta, l, beta_prime, kappa_1, gamma_alpha, delta_I, kappa_2, gamma_i, delta_P, delta_H, kappa, N, rho_1, gamma_r = sympy.symbols("beta l beta_prime kappa_1 gamma_alpha delta_I kappa_2 gamma_i delta_P delta_H kappa N rho_1 gamma_r")

odes = [
    sympy.Eq(S(t).diff(t), -beta * I(t) * S(t) / N - l * beta * H(t) / N - beta_prime * P(t) * S(t)),
    sympy.Eq(E(t).diff(t), beta * I(t) * S(t) / N + l * beta * H(t) / N + beta_prime * P(t) * S(t) - kappa * E(t)),
    sympy.Eq(I(t).diff(t), kappa_1 * E(t) - (gamma_alpha + delta_I) * I(t)),
    sympy.Eq(P(t).diff(t), kappa_2 * E(t) - (gamma_alpha + gamma_i) * P(t) - delta_P * P(t)),
    sympy.Eq(A(t).diff(t), kappa * (1 - rho_1) * E(t)),
    sympy.Eq(H(t).diff(t), gamma_alpha * (I(t) + P(t)) - gamma_r * H(t) - delta_H * H(t)),
    sympy.Eq(R(t).diff(t), gamma_i * (I(t) + P(t)) * H(t)),
    sympy.Eq(F(t).diff(t), delta_I * I(t) + delta_P * P(t) + delta_H * H(t))
]
"""
# Turn the stored ODE string into a template model, with grounding attempted (attempt_grounding=True)
tm3 = execute_template_model_from_sympy_odes(
    ode_str=ode_str3, attempt_grounding=True, client=client
)

In [10]:
# BIOMD0000000960
# Full pipeline option (ODE image -> ODE string through the LLM). It only sometimes works: the known
# failure point is the returned code not defining the "gamma_I" parameter.
# ode_str4 = image_file_to_odes_str('images/BIOMD0000000960_odes.png', client=client)

# Stored option: output returned from the llm grounding service, kept exactly as returned
# (no modification done to the ode string).
ode_str4 = """
import sympy

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, A, H, R, D = sympy.symbols("S E I A H R D", cls=sympy.Function)

# Define the parameters
beta, kappa, gamma_a, gamma_I, gamma_H, mu, delta_H, delta_I, delta_A, rho, N = sympy.symbols("beta kappa gamma_a gamma_I gamma_H mu delta_H delta_I delta_A rho N")

odes = [
    sympy.Eq(S(t).diff(t), -S(t) * beta * (I(t) + delta_A * A(t) + delta_H * H(t)) / N),
    sympy.Eq(E(t).diff(t), S(t) * beta * (I(t) + delta_A * A(t) + delta_H * H(t)) / N - kappa * E(t)),
    sympy.Eq(I(t).diff(t), kappa * rho * E(t) - (gamma_a + gamma_I + delta_I) * I(t)),
    sympy.Eq(A(t).diff(t), kappa * (1 - rho) * E(t) - mu * A(t)),
    sympy.Eq(H(t).diff(t), gamma_a * I(t) - (gamma_H + delta_H) * H(t)),
    sympy.Eq(R(t).diff(t), gamma_I * I(t) + gamma_H * H(t) + mu * (1 - delta_A) * A(t)),
    sympy.Eq(D(t).diff(t), delta_H * H(t) + delta_I * I(t) + mu * delta_A * A(t))
]
"""
# Turn the stored ODE string into a template model, with grounding attempted (attempt_grounding=True)
tm4 = execute_template_model_from_sympy_odes(
    ode_str=ode_str4, attempt_grounding=True, client=client
)

In [11]:
# BioModels identifier -> template model built for it in the cells above
tms = dict(
    [
        ("BIOMD0000000955", tm0),
        ("BIOMD0000000956", tm1),
        ("BIOMD0000000957", tm2),
        ("BIOMD0000000958", tm3),
        ("BIOMD0000000960", tm4),
    ]
)

In [12]:
# Comparing identifiers and context using our LLM grounding tool

# One summary row per model: (model, #ground truth concepts, #matched concepts, ratio)
llm_identifier_rows = []
# model id -> ground truth keys that the LLM-grounded model did not cover
missed_identifiers = defaultdict(set)

for model_id in model_ids:
    if model_id not in tms:
        continue

    ground_truth_curies = set(dd_llm[model_id].keys())
    tm_curies = set(tms[model_id].get_concepts_map().keys())

    # A ground truth concept also counts as covered when some concept of the LLM-grounded
    # model has the same identifier and a context that contains the ground truth context.
    context_covered = {
        ground_truth_entry
        for ground_truth_entry in ground_truth_curies
        if any(
            tm_entry[0] == ground_truth_entry[0]
            and set(ground_truth_entry[1]).issubset(set(tm_entry[1]))
            for tm_entry in tm_curies
        )
    }
    superset_tm_curies = tm_curies | context_covered

    intersection_curies = ground_truth_curies & superset_tm_curies

    print(intersection_curies)

    # Only create an entry for models that actually have uncovered concepts
    uncovered = ground_truth_curies - superset_tm_curies
    if uncovered:
        missed_identifiers[model_id].update(uncovered)

    n_intersection = len(intersection_curies)
    n_ground_truth = len(ground_truth_curies)
    # The ratio is covered / ground truth, although the column is labelled "precision"
    ratio = round(n_intersection / n_ground_truth, 2)
    llm_identifier_rows.append(
        (model_id, n_ground_truth, n_intersection, ratio)
    )

missed_identifiers = dict(missed_identifiers)
print("The way to interpret the precision is the percent")
summary_columns = [
    "model",
    "ground_truths",
    "LLM Grounding Identifiers",
    "precision",
]
pd.DataFrame(llm_identifier_rows, columns=summary_columns)

{(('ido', '0000592'), ()), (('ido', '0000514'), ()), (('ido', '0000511'), ())}
{(('ido', '0000592'), ()), (('ido', '0000514'), ()), (('ido', '0000511'), ())}
{(('ido', '0000592'), ()), (('ido', '0000514'), ()), (('ido', '0000511'), ())}
{(('ido', '0000511'), (('hospitalization', 'ncit:C25179'),)), (('ido', '0000592'), ()), (('ido', '0000514'), ()), (('apollosv', '00000154'), ())}
{(('ido', '0000592'), ()), (('ido', '0000514'), ()), (('apollosv', '00000154'), ())}
The way to interpret the precision is the percent


Unnamed: 0,model,ground_truths,LLM Grounding Identifiers,precision
0,BIOMD0000000955,7,3,0.43
1,BIOMD0000000956,3,3,1.0
2,BIOMD0000000957,4,3,0.75
3,BIOMD0000000958,8,4,0.5
4,BIOMD0000000960,7,3,0.43


In [13]:
# Report, model by model, the ground truth concepts the LLM grounding did not cover
for model_id in missed_identifiers:
    header = f"For biomodel {model_id}, the identifiers our LLM grounding tool was not able to extract from the model are listed below \n"
    print(header)
    for missed_identifier in missed_identifiers[model_id]:
        # Look the full ground truth record (name, context, identifier) up by its key
        missed_concept = dd_llm[model_id][missed_identifier]
        print(f"{missed_concept} \n")

For biomodel BIOMD0000000955, the identifiers our LLM grounding tool was not able to extract from the model are listed below 

{'name': 'Threatened', 'context': {('disease_severity', 'ncit:C25467')}, 'identifier': 'ido:0000511'} 

{'name': 'Extinct', 'context': set(), 'identifier': 'ncit:C28554'} 

{'name': 'Recognized', 'context': {('diagnosis', 'ncit:C15220')}, 'identifier': 'ido:0000511'} 

{'name': 'Ailing', 'context': {('disease_severity', 'ncit:C25269'), ('diagnosis', 'ncit:C113725')}, 'identifier': 'ido:0000511'} 

For biomodel BIOMD0000000957, the identifiers our LLM grounding tool was not able to extract from the model are listed below 

{'name': 'Confirmed', 'context': {('diagnosis', 'ncit:C15220')}, 'identifier': 'ido:0000511'} 

For biomodel BIOMD0000000958, the identifiers our LLM grounding tool was not able to extract from the model are listed below 

{'name': 'Fatalities', 'context': set(), 'identifier': 'ncit:C28554'} 

{'name': 'Super_spreaders', 'context': {('transmis

In [14]:
# Per model: ground truth concept name -> name of the concept in the LLM-built template model.
# The names on the right are the single-letter state variables; for the stored ODE strings they
# are the function symbols declared in those strings.
# NOTE(review): BIOMD0000000956 has no entry for the "E" compartment of its ODE string, presumably
# because the ground truth lists only three concepts for that model - confirm.
# NOTE(review): the BIOMD0000000957 letters cannot be checked here since its ODE string is produced
# at run time - confirm "Confirmed" -> "E" against the extracted equations.
model_compartment_mapping = {
    "BIOMD0000000955": {
        "Susceptible": "S",
        "Infected": "I",
        "Diagnosed": "D",
        "Recognized": "R",
        "Threatened": "T",
        "Extinct": "E",
        "Ailing": "A",
        "Healed": "H",
    },
    "BIOMD0000000956": {"Susceptible": "S", "Infected": "I", "Recovered": "R"},
    "BIOMD0000000957": {
        "Susceptible": "S",
        "Infected": "I",
        "Recovered": "R",
        "Confirmed": "E",
    },
    "BIOMD0000000958": {
        "Susceptible": "S",
        "Exposed": "E",
        "Infectious": "I",
        "Super_spreaders": "P",
        "Asymptomatic": "A",
        "Hospitalised": "H",
        "Recovered": "R",
        "Fatalities": "F",
    },
    "BIOMD0000000960": {
        "Hospitalized": "H",
        "Infectious": "I",
        "Recovered": "R",
        "Susceptible": "S",
        "Asymptomatic": "A",
        "Deceased": "D",
        "Exposed": "E",
    },
}

# For every model, map each ground truth key to the concept the template model returns for
# the compartment letter that the ground truth concept name translates to.
model_concept_to_predicted_concept_map = {
    model_id: {
        key: tms[model_id].get_concept(
            model_compartment_mapping[model_id][concept_dict["name"]]
        )
        for key, concept_dict in dd_llm[model_id].items()
    }
    for model_id in model_ids
}

In [15]:
# Side-by-side table of ground truth vs. predicted identifier and context for every concept
table_rows = []
columns = [
    "Model ID",
    "Concept Name",
    "Ground truth identifier",
    "Predicted identifier",
    "Ground truth context",
    "Predicted context",
]
for model_id in model_ids:
    tm_ground_truths = dd_llm[model_id]
    predicted_concepts = model_concept_to_predicted_concept_map[model_id]
    for key, concept_dict in tm_ground_truths.items():
        predicted_concept = predicted_concepts[key]

        # The first identifier of the predicted concept is reported. A concept without any
        # identifier yields None instead of raising StopIteration.
        first_identifier = next(
            iter(predicted_concept.identifiers.items()), None
        )
        if first_identifier is None:
            predicted_identifier = None
        else:
            curie_prefix, curie_value = first_identifier
            predicted_identifier = f"{curie_prefix}:{curie_value}"

        predicted_context = set(predicted_concept.context.items())

        table_rows.append(
            (
                model_id,
                concept_dict["name"],
                # "identifier" is only set for ground truth rows that have a mapped identifier
                concept_dict.get("identifier"),
                predicted_identifier,
                concept_dict["context"],
                predicted_context,
            )
        )
# Show full cell contents so the context sets are not truncated in the rendered table
pd.set_option("display.max_colwidth", None)
pd.DataFrame(table_rows, columns=columns)

Unnamed: 0,Model ID,Concept Name,Ground truth identifier,Predicted identifier,Ground truth context,Predicted context
0,BIOMD0000000955,Ailing,ido:0000511,ido:0000511,"{(disease_severity, ncit:C25269), (diagnosis, ncit:C113725)}","{(species, ncbitaxon:9606), (disease_severity, ncit:C3833)}"
1,BIOMD0000000955,Recognized,ido:0000511,ido:0000592,"{(diagnosis, ncit:C15220)}","{(recovery_status, recovered), (species, ncbitaxon:9606)}"
2,BIOMD0000000955,Extinct,ncit:C28554,apollosv:00000154,{},"{(exposure_status, exposed), (species, ncbitaxon:9606)}"
3,BIOMD0000000955,Healed,ido:0000592,ido:0000511,{},"{(hospitalization, ncit:C25179), (species, ncbitaxon:9606)}"
4,BIOMD0000000955,Infected,ido:0000511,ido:0000511,{},"{(species, ncbitaxon:9606), (disease_severity, ncit:C25269)}"
5,BIOMD0000000955,Susceptible,ido:0000514,ido:0000514,{},"{(status, susceptible), (species, ncbitaxon:9606)}"
6,BIOMD0000000955,Threatened,ido:0000511,ido:0000592,"{(disease_severity, ncit:C25467)}","{(treatment_status, under_treatment), (species, ncbitaxon:9606)}"
7,BIOMD0000000956,Infected,ido:0000511,ido:0000511,{},"{(species, ncbitaxon:9606), (status, infected)}"
8,BIOMD0000000956,Recovered,ido:0000592,ido:0000592,{},"{(status, recovered), (species, ncbitaxon:9606)}"
9,BIOMD0000000956,Susceptible,ido:0000514,ido:0000514,{},"{(status, susceptible), (species, ncbitaxon:9606)}"
