In [1]:
%load_ext autoreload
%autoreload 2

# BIOMD0000000955 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7175834/
# BIOMD0000000956 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7382213/
# BIOMD0000000957 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7104073/
# BIOMD0000000958 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7184012/
# BIOMD0000000960 - https://pmc.ncbi.nlm.nih.gov/articles/PMC7394373/

model_ids = [
    "BIOMD0000000955",
    "BIOMD0000000956",
    "BIOMD0000000957",
    "BIOMD0000000958",
    "BIOMD0000000960",
]

In [2]:
import os

import sympy
import pandas as pd

# Old TA1 Extraction results
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vQ2G-ZFSotS2qo94mnTWCDvj9Y49ai-9O61DA7940sPYynEdBXq2cT2-Wl3nNldIb3gkpbPFaTFY2PJ/pub?output=xlsx"
xls = pd.ExcelFile(url)

In [3]:
sheets = {
    sheet_name: pd.read_excel(xls, sheet_name) for sheet_name in xls.sheet_names
}
for sheet in sheets.values():
    sheet["grounding"] = sheet["grounding"].map(eval)
set(sheets)

{'BIOMD0000000955',
 'BIOMD0000000956',
 'BIOMD0000000957',
 'BIOMD0000000958',
 'BIOMD0000000960'}

In [4]:
# load new groundings stored in version control
from collections import defaultdict

ground_truth_df = pd.read_csv(
    "../mira/resources/mapped_biomodels_groundings.csv", sep=","
)
dd = defaultdict(set)
dd_context = defaultdict(set)

for row_index, entry in ground_truth_df.iterrows():
    model_id = entry["model"]
    if model_id not in model_ids:
        continue
    curie = None

    # Store identifiers
    if not pd.isna(entry["mapped_identifiers"]):
        curie = entry["mapped_identifiers"].split("/")[0]

    # Store contexts
    if not pd.isna(entry["mapped_context"]):
        contexts = entry["mapped_context"].split("|")

        context = contexts[0]
        context_kv_pair = context.split("=")

        key = context_kv_pair[0]
        value = context_kv_pair[1].split("/")[0]

        dd_context[model_id].add((key, value))

        for context_str in contexts[1:]:
            context_str_kv_pair = context_str.split("=")
            key = context_str_kv_pair[0]
            value = context_str_kv_pair[1].split("/")[0]
            dd_context[model_id].add((key, value))

    if curie is not None:
        dd[model_id].add(curie)

dd = dict(dd)
dd_context = dict(dd_context)

In [5]:
# Ta1 Extraction result table using only identifiers

ta1_rows = []
for model_id, sheet in sheets.items():
    sheet_curies = {
        grounding["id"]
        for groundings in sheet["grounding"]
        for grounding in groundings
    }

    ground_truth_curies = dd[model_id]

    intersection_curies = sheet_curies & ground_truth_curies
    n_intersection = len(intersection_curies)
    n_ground_truth = len(ground_truth_curies)
    ta1_rows.append(
        (
            model_id,
            n_ground_truth,
            n_intersection,
            round(n_intersection / n_ground_truth, 2),
        )
    )

print("The way to interpret the precision is the percent")
pd.DataFrame(ta1_rows, columns=["model", "structured", "TA-1", "precision"])

The way to interpret the precision is the percent


Unnamed: 0,model,structured,TA-1,precision
0,BIOMD0000000955,4,0,0.0
1,BIOMD0000000956,3,0,0.0
2,BIOMD0000000957,3,0,0.0
3,BIOMD0000000958,5,1,0.2
4,BIOMD0000000960,5,0,0.0


In [6]:
from mira.openai import OpenAIClient
from mira.sources.sympy_ode.llm_util import (
    execute_template_model_from_sympy_odes,
    image_file_to_odes_str,
)
from IPython.display import Image

client = OpenAIClient()

In [7]:
# full pipeline option (sometimes works), failure points are returning "lambda" symbols or not defining "theta" parameter
# ode_str0 = image_file_to_odes_str('images/BIOMD0000000955_odes.png', client=client)

# Output returned from llm grounding service option, no modification done to ode string
ode_str0 = """
import sympy 

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, I, D, A, R, T, H, E = sympy.symbols("S I D A R T H E", cls=sympy.Function)

# Define the parameters
alpha, beta, gamma, delta, epsilon, zeta, lambda_, eta, rho, theta, mu, kappa, nu, xi, sigma, tau, psi, lambda_ = sympy.symbols("alpha beta gamma delta epsilon zeta lambda eta rho theta mu kappa nu xi sigma tau psi lambda")

odes = [
    sympy.Eq(S(t).diff(t), - S(t) * (alpha * I(t) + beta * D(t) + gamma * A(t) + delta * R(t))),
    sympy.Eq(I(t).diff(t), S(t) * (alpha * I(t) + beta * D(t) + gamma * A(t) + delta * R(t)) - (epsilon + zeta + lambda_) * I(t)),
    sympy.Eq(D(t).diff(t), epsilon * I(t) - eta * D(t) - rho * D(t)),
    sympy.Eq(A(t).diff(t), zeta * I(t) - (theta + mu + kappa) * A(t)),
    sympy.Eq(R(t).diff(t), eta * D(t) + theta * A(t) - (nu + xi) * R(t)),
    sympy.Eq(T(t).diff(t), mu * A(t) + nu * R(t) - (sigma + tau) * T(t)),
    sympy.Eq(H(t).diff(t), lambda_ * I(t) + rho * D(t) + kappa * A(t) + xi * R(t) + sigma * T(t)),
    sympy.Eq(E(t).diff(t), tau * T(t))
]
"""

tm0 = execute_template_model_from_sympy_odes(
    ode_str=ode_str0, attempt_grounding=True, client=client
)

In [8]:
# full pipeline option (sometimes works), failure on defining "N" parameter
# ode_str1 = image_file_to_odes_str('images/BIOMD0000000956_odes.png', client=client)

# Output returned from llm grounding service option, had to add N parameter
ode_str1 = """
import sympy

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, R = sympy.symbols("S E I R", cls=sympy.Function)

# Define the parameters
beta, gamma, alpha, N = sympy.symbols("beta gamma alpha N")

N = S(t) + E(t) + I(t) + R(t)

odes = [
    sympy.Eq(S(t).diff(t), -beta * S(t) * I(t) / N),
    sympy.Eq(E(t).diff(t), beta * S(t) * I(t) / N - alpha * E(t)),
    sympy.Eq(I(t).diff(t), alpha * E(t) - gamma * I(t)),
    sympy.Eq(R(t).diff(t), gamma * I(t))
]"""

tm1 = execute_template_model_from_sympy_odes(
    ode_str=ode_str1, attempt_grounding=True, client=client
)

In [9]:
ode_str2 = image_file_to_odes_str(
    "images/BIOMD0000000957_odes.png", client=client
)
tm2 = execute_template_model_from_sympy_odes(
    ode_str=ode_str2, attempt_grounding=True, client=client
)

In [10]:
# Output returned from llm grounding service option, had to add N, rho_1, and gamma_r parameter
ode_str3 = """
import sympy 

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, P, A, H, R, F = sympy.symbols("S E I P A H R F", cls=sympy.Function)

# Define the parameters
beta, l, beta_prime, kappa_1, gamma_alpha, delta_I, kappa_2, gamma_i, delta_P, delta_H, kappa, N, rho_1, gamma_r = sympy.symbols("beta l beta_prime kappa_1 gamma_alpha delta_I kappa_2 gamma_i delta_P delta_H kappa N rho_1 gamma_r")

odes = [
    sympy.Eq(S(t).diff(t), -beta * I(t) * S(t) / N - l * beta * H(t) / N - beta_prime * P(t) * S(t)),
    sympy.Eq(E(t).diff(t), beta * I(t) * S(t) / N + l * beta * H(t) / N + beta_prime * P(t) * S(t) - kappa * E(t)),
    sympy.Eq(I(t).diff(t), kappa_1 * E(t) - (gamma_alpha + delta_I) * I(t)),
    sympy.Eq(P(t).diff(t), kappa_2 * E(t) - (gamma_alpha + gamma_i) * P(t) - delta_P * P(t)),
    sympy.Eq(A(t).diff(t), kappa * (1 - rho_1) * E(t)),
    sympy.Eq(H(t).diff(t), gamma_alpha * (I(t) + P(t)) - gamma_r * H(t) - delta_H * H(t)),
    sympy.Eq(R(t).diff(t), gamma_i * (I(t) + P(t)) * H(t)),
    sympy.Eq(F(t).diff(t), delta_I * I(t) + delta_P * P(t) + delta_H * H(t))
]
"""
tm3 = execute_template_model_from_sympy_odes(
    ode_str=ode_str3, attempt_grounding=True, client=client
)

In [11]:
# full pipeline option (sometimes works), failure points are not defining "gamma_I" parameter
# ode_str4 = image_file_to_odes_str('images/BIOMD0000000960_odes.png', client=client)

# Output returned from llm grounding service option, this is what was returned, no modification done to ode string
ode_str4 = """
import sympy

# Define time variable
t = sympy.symbols("t")

# Define the time-dependent variables
S, E, I, A, H, R, D = sympy.symbols("S E I A H R D", cls=sympy.Function)

# Define the parameters
beta, kappa, gamma_a, gamma_I, gamma_H, mu, delta_H, delta_I, delta_A, rho, N = sympy.symbols("beta kappa gamma_a gamma_I gamma_H mu delta_H delta_I delta_A rho N")

odes = [
    sympy.Eq(S(t).diff(t), -S(t) * beta * (I(t) + delta_A * A(t) + delta_H * H(t)) / N),
    sympy.Eq(E(t).diff(t), S(t) * beta * (I(t) + delta_A * A(t) + delta_H * H(t)) / N - kappa * E(t)),
    sympy.Eq(I(t).diff(t), kappa * rho * E(t) - (gamma_a + gamma_I + delta_I) * I(t)),
    sympy.Eq(A(t).diff(t), kappa * (1 - rho) * E(t) - mu * A(t)),
    sympy.Eq(H(t).diff(t), gamma_a * I(t) - (gamma_H + delta_H) * H(t)),
    sympy.Eq(R(t).diff(t), gamma_I * I(t) + gamma_H * H(t) + mu * (1 - delta_A) * A(t)),
    sympy.Eq(D(t).diff(t), delta_H * H(t) + delta_I * I(t) + mu * delta_A * A(t))
]
"""
tm4 = execute_template_model_from_sympy_odes(
    ode_str=ode_str4, attempt_grounding=True, client=client
)

In [12]:
tms = {
    "BIOMD0000000955": tm0,
    "BIOMD0000000956": tm1,
    "BIOMD0000000957": tm2,
    "BIOMD0000000958": tm3,
    "BIOMD0000000960": tm4,
}

In [14]:
# Comparing just the identifiers using our LLM grounding tool

llm_identifier_rows = []
missed_identifiers = defaultdict(set)
for model_id in model_ids:
    if model_id not in tms:
        continue

    ground_truth_curies = dd[model_id]

    tm_concepts = tms[model_id].get_concepts_map()
    tm_curies = set()
    for concept in tm_concepts.values():
        for prefix, identifier in concept.identifiers.items():
            tm_curies.add(f"{prefix}:{identifier}")
    intersection_curies = tm_curies & ground_truth_curies

    for missed_identifier in ground_truth_curies - tm_curies:
        missed_identifiers[model_id].add(missed_identifier)

    n_intersection = len(intersection_curies)
    n_ground_truth = len(ground_truth_curies)
    llm_identifier_rows.append(
        (
            model_id,
            n_ground_truth,
            n_intersection,
            round(n_intersection / n_ground_truth, 2),
        )
    )
missed_identifiers = dict(missed_identifiers)
print("The way to interpret the precision is the percent")
pd.DataFrame(
    llm_identifier_rows,
    columns=["model", "ground_truths", "LLM Grounding Identifiers", "precision"],
)

The way to interpret the precision is the percent


Unnamed: 0,model,ground_truths,LLM Grounding Identifiers,precision
0,BIOMD0000000955,4,3,0.75
1,BIOMD0000000956,3,3,1.0
2,BIOMD0000000957,3,3,1.0
3,BIOMD0000000958,5,4,0.8
4,BIOMD0000000960,5,5,1.0


In [15]:
for model, missing_identifier_set in missed_identifiers.items():
    print(
        f"For biomodel {model}, the identifiers our LLM grounding tool was not able to extract from the model are {missing_identifier_set}"
    )

For biomodel BIOMD0000000955, the identifiers our LLM grounding tool was not able to extract from the model are {'ncit:C28554'}
For biomodel BIOMD0000000958, the identifiers our LLM grounding tool was not able to extract from the model are {'ncit:C28554'}


In [18]:
# Comparing just the contexts using our LLM grounding tool

llm_context_rows = []
missed_contexts = defaultdict(set)
for model_id in model_ids:
    if model_id not in tms or model_id not in dd_context:
        continue
    ground_truth_context = dd_context[model_id]
    tm_context = set()
    for concept in tm_concepts.values():
        for key, val in concept.context.items():
            tm_context.add((key, val))

    intersection_curies = ground_truth_context & tm_context

    for missed_context in ground_truth_context - tm_context:
        missed_contexts[model_id].add(missed_context)

    n_intersection = len(intersection_curies)
    n_ground_truth = len(ground_truth_context)
    llm_context_rows.append(
        (
            model_id,
            n_ground_truth,
            n_intersection,
            round(n_intersection / n_ground_truth, 2),
        )
    )

print("The way to interpret the precision is the percent")
pd.DataFrame(
    llm_context_rows,
    columns=["model", "ground_truths", "LLM Grounding Context", "precision"],
)

The way to interpret the precision is the percent


Unnamed: 0,model,ground_truths,LLM Grounding Context,precision
0,BIOMD0000000955,4,1,0.25
1,BIOMD0000000957,1,1,1.0
2,BIOMD0000000958,4,2,0.5
3,BIOMD0000000960,4,2,0.5


In [19]:
for model, missing_identifier_set in missed_contexts.items():
    print(
        f"For biomodel {model}, the identifiers our LLM grounding tool was not able to extract from the model are \n {missing_identifier_set} \n"
    )

For biomodel BIOMD0000000955, the identifiers our LLM grounding tool was not able to extract from the model are 
 {('diagnosis', 'ncit:C113725'), ('disease_severity', 'ncit:C25269'), ('disease_severity', 'ncit:C25467')} 

For biomodel BIOMD0000000958, the identifiers our LLM grounding tool was not able to extract from the model are 
 {('transmissibility', 'ncit:C25376'), ('transmissibility', 'ncit:C49508')} 

For biomodel BIOMD0000000960, the identifiers our LLM grounding tool was not able to extract from the model are 
 {('hospitalization', 'ncit:C68851'), ('disease_severity', 'ncit:C25269')} 

