In [12]:
from indra.sources import reach
from indra.literature import pubmed_client
from typing import NamedTuple
import gilda

# See https://indra.readthedocs.io/en/latest/modules/sources/reach/index.html for setting up REACH locally
print(f"Warming up dockerized REACH at {reach.local_text_url}")
reach_processor = reach.process_text("this is about a vaccine", url=reach.local_text_url)
print("finished warming up")
reach_processor

Warming up dockerized REACH at http://localhost:8080/api/text


ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - Could not connect to REACH service:
ERROR: [2023-09-29 14:09:14] indra.sources.reach.api - ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


finished warming up


In [5]:
class ReachEntity(NamedTuple):
    text: str
    start: int
    end: int

print(f"using reach at {reach.local_text_url}")

def get_reach_entities(text: str, pubmed=None) -> list[ReachEntity]:
    reach_processor = reach.process_text(
        text, 
        citation=pubmed, 
        # url=reach.local_text_url,
    )
    if reach_processor is None:
        raise ValueError("reach.process_text() returned None")
    return [
        ReachEntity(
            text=data["text"], 
            start=data["start-pos"]["offset"], 
            end=data["end-pos"]["offset"],
        )
        for data in reach_processor.get_all_entities()
    ]

using reach at http://localhost:8080/api/text


In [3]:
pubmed_id = "37192450"
abstract = pubmed_client.get_abstract(pubmed_id)

In [6]:
get_reach_entities(abstract, pubmed_id)

[ReachEntity(text='COVID-19 vaccines', start=108, end=125),
 ReachEntity(text='SARS-CoV-2 polymerase', start=646, end=667),
 ReachEntity(text='test', start=683, end=687),
 ReachEntity(text='MV', start=954, end=956),
 ReachEntity(text='duration', start=958, end=966),
 ReachEntity(text='MV', start=1218, end=1220),
 ReachEntity(text='membrane', start=1292, end=1300),
 ReachEntity(text='duration', start=1477, end=1485),
 ReachEntity(text='ICU LOS', start=1519, end=1526),
 ReachEntity(text='MV', start=1486, end=1488),
 ReachEntity(text='CI', start=1450, end=1452),
 ReachEntity(text='duration', start=1775, end=1783),
 ReachEntity(text='CI', start=1754, end=1756),
 ReachEntity(text='MV', start=1787, end=1789),
 ReachEntity(text='ICU', start=2176, end=2179)]

In [7]:
from kestrel.sources.literature.utils import get_pubmed_dataframe

pmids = pubmed_client.get_ids("vaccine")
df = get_pubmed_dataframe(pmids)

INFO: [2023-09-29 12:52:14] numexpr.utils - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO: [2023-09-29 12:52:14] numexpr.utils - NumExpr defaulting to 8 threads.
limited at 10000.


In [9]:
def find_gilda_missing(text: str, *, grounder: gilda.Grounder | None = None, pubmed=None):
    if grounder is None:
        grounder = gilda.get_grounder()
    entities = get_reach_entities(text, pubmed=pubmed)
    return [
        t
        for t in entities
        if not grounder.ground(t.text)
    ]

from collections import Counter

#dd = defaultdict(lambda: defaultdict(set))
dd = Counter()
for pubmed, row in df.head(5).iterrows():
    for t in find_gilda_missing(row["title"], pubmed=pubmed):
        dd[t.text] += 1

import pandas as pd

results_df = pd.DataFrame(dd.most_common(), columns=["phrase", "frequency"])
results_df

Unnamed: 0,phrase,frequency
0,Phase 1b trial,1
1,Sm-TSP-2 Vaccine,1
2,Enugu,1
