In [15]:
# Install or upgrade the notebook's third-party dependencies in the kernel's environment.
# NOTE(review): no versions are pinned and -U pulls the newest release on every run,
# so a later re-run may resolve to different library versions — consider pinning.
%pip install -qU tiktoken pypdf openai langchain langchain-text-splitters langchain-openai langchain-core python-dotenv pydantic langchain_community


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import itertools
import json
from pprint import pprint
from pathlib import Path
import sys

import tiktoken
from pydantic import BaseModel, Field
from openai import OpenAI
import dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document

sys.path.append('..')
from viime_extract.schema import ArticleMeta, ArticleKeyWords, Metabolite

In [2]:
# Name of the chat model; also passed to the text splitter below to select its tokenizer.
LLM_MODEL_NAME = 'gpt-4o-mini'
# Directory holding the article PDFs, expressed relative to the current working directory.
PDF_DIR = Path('..') / 'data' / 'PubMed LongCovid and Metabolomics Results'

In [3]:
# Load environment variables before the client below is constructed.
# NOTE(review): presumably this supplies the OpenAI API key from a local .env file — confirm.
dotenv.load_dotenv()
# Shared chat model; the extraction functions below use it as their default `model` argument.
LLM = ChatOpenAI(model=LLM_MODEL_NAME, temperature=0)

In [4]:
def get_pdf_pages(pdf_file: Path):
    """Load a PDF and return everything the loader yields as a list.

    Args:
        pdf_file: Location of the PDF to read.

    Returns:
        A list with each item produced by ``PyPDFLoader(pdf_file).lazy_load()``
        (callers treat the items as per-page ``Document`` objects).
    """
    loader = PyPDFLoader(pdf_file)
    return [page for page in loader.lazy_load()]

def extract_article_metadata(pages: list[Document], model=LLM):
    """Extract bibliographic metadata for an article from its first page.

    Only ``pages[0]`` is sent to the model, on the assumption that the first page
    contains all of the metadata.

    Args:
        pages: Pages of the article, in order; must contain at least one page.
        model: Chat model exposing ``with_structured_output``; defaults to the shared ``LLM``.

    Returns:
        The result of invoking ``model.with_structured_output(ArticleMeta)`` on the prompt.

    Raises:
        ValueError: If ``pages`` is empty.
    """
    # Fail with a clear message instead of an opaque IndexError from pages[0].
    if not pages:
        raise ValueError('Cannot extract article metadata: no pages were provided.')

    # assumption: first page contains all of the metadata
    first_page = pages[0].page_content

    template = ChatPromptTemplate([
        ('system', '''You are an expert in extracting structured information from medical journal articles.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.'''),
        ('user', 'Please extract the title, authors, journal title, publication year, journal volume, DOI ID, and pubmed ID from the following journal article:\n\n{article_contents}'),
    ])
    prompt = template.invoke({'article_contents': first_page})
    return model.with_structured_output(ArticleMeta).invoke(prompt)

def extract_article_keywords(pages: list[Document], model=LLM, omit_bibliography=False):
    """Extract the entities mentioned in an article, one text chunk at a time.

    The pages are split into overlapping chunks, every chunk is sent to the model with
    the same prompt, and the per-chunk results are folded together with
    ``ArticleKeyWords.merge``.

    Args:
        pages: Pages of the article to scan.
        model: Chat model exposing ``with_structured_output``; defaults to the shared ``LLM``.
        omit_bibliography: When true, the user prompt additionally asks the model to
            ignore entities found in the references section.

    Returns:
        The merged ``ArticleKeyWords``; a fresh ``ArticleKeyWords()`` when splitting
        yields no chunks.
    """
    template = ChatPromptTemplate([
        ('system', '''You are an expert in extracting structured information from medical journal articles.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.

If you are not confident in the identifier for an entity, you can specify it as "unknown". It is better
to include an entity with an "unknown" identifier than to omit it entirely.'''),
        ('user', 'Please extract the metabolites, proteins, genes, pathways, drugs, and diseases mentioned in the following journal article{prompt_suffix}:\n\n{article_contents}'),
    ])

    # The tokenizer is selected by LLM_MODEL_NAME, not by the `model` argument.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=LLM_MODEL_NAME,
        chunk_size=768,
        chunk_overlap=64,
    )

    prompt_suffix = ''
    if omit_bibliography:
        prompt_suffix = ', ignoring entities found in the references section'

    chunks = splitter.split_documents(pages)
    print('Number of chunks:', len(chunks))

    # Build the structured-output wrapper once; it does not depend on the chunk.
    structured_model = model.with_structured_output(ArticleKeyWords)

    keywords = ArticleKeyWords()
    for idx, text_chunk in enumerate(chunks):
        print(f'Processing chunk #{idx}')
        prompt = template.invoke({'article_contents': text_chunk.page_content, 'prompt_suffix': prompt_suffix})
        response = structured_model.invoke(prompt)
        keywords = keywords.merge(response)

    return keywords

In [5]:
# Load the LC_81 article from PDF_DIR as a list of pages.
LC_81_pages = get_pdf_pages(PDF_DIR / 'LC_81.pdf')

In [44]:
%%time
# Extract bibliographic metadata for LC_81; only its first page is sent to the model.
article_meta = extract_article_metadata(LC_81_pages, model=LLM)

In [112]:
%%time
# Extract entity keywords from every page of LC_81 using the default prompt
# (omit_bibliography is left at False, so the references section is not excluded).
article_keywords = extract_article_keywords(LC_81_pages, model=LLM)

Number of chunks: 28
Processing chunk #0
Processing chunk #1
Processing chunk #2
Processing chunk #3
Processing chunk #4
Processing chunk #5
Processing chunk #6
Processing chunk #7
Processing chunk #8
Processing chunk #9
Processing chunk #10
Processing chunk #11
Processing chunk #12
Processing chunk #13
Processing chunk #14
Processing chunk #15
Processing chunk #16
Processing chunk #17
Processing chunk #18
Processing chunk #19
Processing chunk #20
Processing chunk #21
Processing chunk #22
Processing chunk #23
Processing chunk #24
Processing chunk #25
Processing chunk #26
Processing chunk #27
CPU times: user 601 ms, sys: 77.8 ms, total: 679 ms
Wall time: 1min 33s


# Ignoring references to the bibliography

In [35]:
from pydantic import BaseModel, Field
from typing import Optional

class Reference(BaseModel):
    # One bibliography entry. Every field is optional and defaults to None.
    # (Kept as comments rather than a class docstring so the model's schema is unchanged.)
    refnum: Optional[int] = Field(
        title="Reference Number",
        description="The reference number",
        default=None,
    )
    authors: Optional[list[str]] = Field(
        title="Authors",
        description="List of authors in a reference",
        default=None,
    )
    title: Optional[str] = Field(
        title="Paper title",
        description="The reference paper title",
        default=None,
    )
    year: Optional[int] = Field(
        title="Paper year",
        description="The reference paper year of publication",
        default=None,
    )
    journal: Optional[str] = Field(
        title="Journal",
        description="The journal in which the reference was published",
        default=None,
    )

    @property
    def incomplete(self):
        """Whether at least one of the five fields is still ``None``."""
        field_names = ('refnum', 'authors', 'title', 'year', 'journal')
        return not all(getattr(self, name) is not None for name in field_names)

class References(BaseModel):
    # Container for the bibliography entries extracted from one page.
    # `references` defaults to None, so consumers must tolerate a missing list.
    references: Optional[list[Reference]] = Field(title="Bibliographic reference", description="A bibliographic reference in an academic paper", default=None)

    @property
    def complete_references(self):
        """Return the entries whose ``incomplete`` flag is false.

        Returns an empty list when ``references`` is ``None`` or empty, instead of
        raising ``TypeError`` while iterating over ``None``.
        """
        return [r for r in (self.references or []) if not r.incomplete]

def extract_references(page: Document, model=LLM):
    """Ask the model to list the bibliographic references found on one page.

    Args:
        page: The page to scan. Its ``page_content`` text is sent to the model; a
            plain string is also accepted and is sent as-is.
        model: Chat model exposing ``with_structured_output``; defaults to the shared ``LLM``.

    Returns:
        The result of invoking ``model.with_structured_output(References)`` on the prompt.
    """
    template = ChatPromptTemplate([
        ('system', '''You are an expert in extracting structured information from medical journal articles.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single reference. You will be evaluated on the quality and completeness of the extracted information. If you are unsure if an entry is a reference, then do not include it.'''),
        ('user', 'Please list the references in this journal article:\n\n{article_contents}'),
    ])
    # Send only the page text. Formatting the Document object itself would put its
    # string form (including metadata) into the prompt rather than the article text.
    article_contents = getattr(page, 'page_content', page)
    prompt = template.invoke({'article_contents': article_contents})
    return model.with_structured_output(References).invoke(prompt)

In [16]:
LC_69_pages = get_pdf_pages(PDF_DIR / 'LC_69.pdf')
# Keep only the pages at index 0-12. The bound 13 is hard-coded: in the page dump further
# down, the "References" heading appears on the page at index 13. That page also carries
# some non-bibliography text before the heading, which this slice drops as well.
pages_up_to_bibliography = LC_69_pages[:13]
print(len(LC_69_pages))

18


In [39]:
# Extract references from the page at index 13, where the "References" section of LC_69 starts.
refs = extract_references(LC_69_pages[13])

In [40]:
# Show only the extracted entries that have no missing field.
refs.complete_references

[Reference(refnum=1, authors=['B. Hu', 'S. Huang', 'L. Yin'], title='The cytokine storm and COVID-19', year=2021, journal='J. Med. Virol.'),
 Reference(refnum=2, authors=['C. Stasi', 'S. Fallani', 'F. Voller', 'C. Silvestri'], title='Treatment for COVID-19: an overview', year=2020, journal='Eur. J. Pharmacol.'),
 Reference(refnum=3, authors=['M.G. Netea', 'F. Balkwill', 'M. Chonchol', 'F. Cominelli', 'M.Y. Donath', 'E. J. Giamarellos-Bourboulis', 'D. Golenbock', 'M.S. Gresnigt', 'M.T. Heneka', 'H. M. Hoffman', 'R. Hotchkiss', 'L.A.B. Joosten', 'D.L. Kastner', 'M. Korte', 'E. Latz', 'P. Libby', 'T. Mandrup-Poulsen', 'A. Mantovani', 'K.H.G. Mills', 'K.L. Nowak', 'L. A. O ’ Neill', 'P. Pickkers', 'T. van der Poll', 'P.M. Ridker', 'J. Schalkwijk', 'D.A. Schwartz', 'B. Siegmund', 'C.J. Steer', 'H. Tilg', 'J.W.M. van der Meer', 'F.L. van de Veerdonk', 'C. A. Dinarello'], title='A guiding map for inflammation', year=2017, journal='Nat. Immunol.'),
 Reference(refnum=4, authors=['C.N. Serhan', 

In [18]:
# Walk the pages from last to first and print how many references the model reports on each.
# There is no stop condition and every page costs one model call.
for idx, page in enumerate(LC_69_pages[::-1]):
    print(f'====== {len(LC_69_pages) - idx}')
    refs = extract_references(page)
    # `references` may be None (its declared default); count that as zero instead of
    # letting len(None) raise TypeError.
    print(len(refs.references or []))

27
50
47
50
40
30


KeyboardInterrupt: 

In [20]:
# Display the current value of `refs`.
# NOTE(review): `refs` is reassigned by both the single-page extraction cell and the page
# loop, so what is shown depends on which cell ran last — presumably the interrupted loop.
refs

References(references=[Reference(refnum=8, authors=[], title='Acid-methanol trapping studies', year=2022), Reference(refnum=14, authors=[], title='Studies with leukocytes, isolated enzymes, and their stereoselective total organic synthesis', year=2022), Reference(refnum=15, authors=[], title='Evidence for the cellular production of unstable epoxide intermediates', year=2022), Reference(refnum=27, authors=[], title='Targeted LC-MS-MS-based profiling', year=2022), Reference(refnum=28, authors=[], title='Targeted LC-MS-MS-based metabololipidomic profiling', year=2022), Reference(refnum=38, authors=[], title='SPM biosynthesis in vivo in humans', year=2022), Reference(refnum=39, authors=[], title='SPM biosynthesis in vivo in humans', year=2022), Reference(refnum=40, authors=[], title='SPM biosynthesis in vivo in humans', year=2022), Reference(refnum=50, authors=[], title='Omega-3 supplementation and SPM production', year=2022), Reference(refnum=63, authors=[], title='Clinical trial with pro

In [8]:
# Dump the raw text of every page from index 13 onward.
for bibliography_page in LC_69_pages[13:]:
    print(bibliography_page.page_content)

Seminars in Immunology 59 (2022) 101597
13
in human public health and the potential for SPMs in resolution 
physiology-pharmacology. 
Author contributions 
C.N.S., S.L. and R.N. composed and contributed to the preparation of 
this manuscript, tables, and figures. 
Funding 
C.N.S is supported by the National Institutes of Health (grant no. 
R35GM095467) and S.L. is supported by NIH grant no. K99HL153673. 
Acknowledgments 
We thank M. H. Small for expert assistance in manuscript prepara -
tion. We thank the many investigators that have published exciting re -
sults on the potent functions and pharmacology of the SPMs and their 
total organic syntheses reviewed herein. 
References 
[1] B. Hu, S. Huang, L. Yin, The cytokine storm and COVID-19, J. Med. Virol. 93 (1) 
(2021) 250 – 256 . 
[2] C. Stasi, S. Fallani, F. Voller, C. Silvestri, Treatment for COVID-19: an overview, 
Eur. J. Pharmacol. 889 (2020), 173644 . 
[3] M.G. Netea, F. Balkwill, M. Chonchol, F. Cominelli, M.Y. Donath, E. 
J. G

In [22]:
%%time
# Three runs of the same extraction on LC_69, for comparison:
# 1. every page, default prompt
keywords_all_pages_no_omit = extract_article_keywords(LC_69_pages, model=LLM, omit_bibliography=False)
# 2. every page, with the prompt asking the model to ignore the references section
keywords_all_pages_omit = extract_article_keywords(LC_69_pages, model=LLM, omit_bibliography=True)
# 3. only the pages before the bibliography, default prompt
keywords_up_to_biblio = extract_article_keywords(pages_up_to_bibliography, model=LLM, omit_bibliography=False)

Number of chunks: 62
Processing chunk #0
Processing chunk #1
Processing chunk #2
Processing chunk #3
Processing chunk #4
Processing chunk #5
Processing chunk #6
Processing chunk #7
Processing chunk #8
Processing chunk #9
Processing chunk #10
Processing chunk #11
Processing chunk #12
Processing chunk #13
Processing chunk #14
Processing chunk #15
Processing chunk #16
Processing chunk #17
Processing chunk #18
Processing chunk #19
Processing chunk #20
Processing chunk #21
Processing chunk #22
Processing chunk #23
Processing chunk #24
Processing chunk #25
Processing chunk #26
Processing chunk #27
Processing chunk #28
Processing chunk #29
Processing chunk #30
Processing chunk #31
Processing chunk #32
Processing chunk #33
Processing chunk #34
Processing chunk #35
Processing chunk #36
Processing chunk #37
Processing chunk #38
Processing chunk #39
Processing chunk #40
Processing chunk #41
Processing chunk #42
Processing chunk #43
Processing chunk #44
Processing chunk #45
Processing chunk #46
Pr

In [20]:
import itertools
# Pair up the disease names from the two runs position by position; the shorter
# sequence is padded with None.
list(
    itertools.zip_longest(
        (disease.name for disease in keywords_all_pages_omit.mentioned_diseases),
        (disease.name for disease in keywords_up_to_biblio.mentioned_diseases),
    )
)

[('COVID-19', 'COVID-19'),
 ('COVID-19', 'COVID-19'),
 ('COVID-19', 'COVID-19'),
 ('COVID-19', 'COVID-19'),
 ('acute inflammation', 'COVID-19'),
 ('collateral organ damage', 'COVID-19'),
 ('tissue injury', 'gingival inflammation'),
 ('COVID-19', 'inflammatory response'),
 ('COVID-19', 'COVID-19'),
 ('gingival inflammation', 'COVID-19'),
 ('cardiovascular disease', 'hemorrhagic exudate'),
 ('COVID-19', 'COVID-19'),
 ('human rhinosinusitis', 'COVID-19'),
 ('kidney diseases', 'unknown'),
 ('diabetic complications', 'peritonitis'),
 ('COVID-19', 'sepsis'),
 ('COVID-19', 'ischemia-reperfusion injury'),
 ('Cancer', 'diabetes'),
 ('hemorrhagic exudate', 'colitis'),
 ('COVID-19', 'lung inflammation'),
 ('COVID-19', 'obesity'),
 ('unknown', 'atherosclerosis'),
 ('peritonitis', 'tumor burden'),
 ('sepsis', 'dermatitis'),
 ('ischemia-reperfusion injury', 'Candida albicans infection'),
 ('diabetes', 'herpes simplex virus infection'),
 ('colitis', 'metabolic syndrome'),
 ('lung inflammation', 'arth

# Compare ground truth with MolGenetMetab results

In [83]:
# Hand-curated list of expected metabolite names, one string per entry.
ground_truth_metabolites = [
    "formate",
    "sarcosine",
    "hypoxanthine",
    "homocysteine",
    "ATP",
    "NAD",
    "NADH",
    "fatty acids",
    "dihydroceramide",
    "unsaturated fatty acid",
    "phosphocholines",
    "cholesterol ester",
    "hydroxypropionylcarnitine",
    "acetate",
    "dodecanedioic acid",
    "indoxylsulfate",
    "arginine",
    "glucose",
    "carnitine",
    "glutamate",
    "lysine",
    "histidine",
    "branched chain amino acids",
    "leucine",
    "valine",
    "choline",
    "threonine",
    "ornithine",
    "lactate",
    "succinate",
    "cholesterol ester 20:0",
    "triglycerides",
    "phosphatidylcholine",
    "betaine",
    "methionine",
    "s-adenosylmethionine",
    "s-adenosylhomocysteine",
    "dimethylglycine",
    "iron-sulfur clusters",
    "phosphocreatine",
    "nicotinamide mononucleotide",
    "oxylipin",
    "reactive oxygen species",
    "taurine",
    "β-alanine",
    "pyruvate",
]

# Hand-curated list of expected pathway names, one string per entry.
ground_truth_pathways = [
    "iron-sulfur cluster biogenesis",
    "cellular energy metabolism",
    "mitochondrial electron transport chain",
    "krebs cycle",
    "electron transfer flavoprotein",
    "nuclear gene expression",
    "glycolysis",
    "carbohydrate and fatty acid metabolism",
    "energy metabolism",
    "one-carbon metabolism",
    "folate cycle",
    "methionine salvage",
    "purine nucleotide salvage and synthesis",
    "pyruvate metabolism",
]

# Expected protein names.
ground_truth_proteins = ["frataxin"]

# Expected drug names (original capitalisation kept).
ground_truth_drugs = [
    "Etravirine", "Resveratrol", "SS-31",
    "deferoxamine", "BAPTA-AM", "antioxidants",
]

# Expected disease names.
ground_truth_diseases = [
    "friedreich ataxia", "dyslipidemia",
    "pre-diabetic state", "diabetes",
]

In [102]:
import json

def confusion_matrix(observed, expected):
    """Split observed and expected items into hits, extras and misses.

    Args:
        observed: Collection of items that were produced.
        expected: Collection of items that should have been produced.

    Returns:
        A 3-tuple of lists ``(true_pos, false_pos, false_neg)``: observed items that are
        in ``expected``, observed items that are not, and expected items missing from
        ``observed``. Each list follows the iteration order of its source collection.
    """
    hits, extras = [], []
    for item in observed:
        bucket = hits if item in expected else extras
        bucket.append(item)
    misses = [item for item in expected if item not in observed]
    return hits, extras, misses

def print_confusion_matrix(cm):
    """Print the sizes of the three groups in a confusion-matrix triple.

    Args:
        cm: A ``(true_pos, false_pos, false_neg)`` triple of sized collections.
    """
    true_pos, false_pos, false_neg = cm
    sizes = (len(true_pos), len(false_pos), len(false_neg))
    print('TP=%d, FP=%d, FN=%d' % sizes)

def cmp_with_ground_truth(file: str):
    """Compare one saved extraction run against the hand-curated ground truth.

    Reads a JSON results file, lower-cases the extracted names and the ground-truth
    names for metabolites, pathways, drugs and diseases, and prints the TP/FP/FN
    counts plus the false-positive and false-negative names for each category.

    NOTE(review): ``ground_truth_proteins`` is defined in this notebook but is not
    compared here — confirm whether that is intentional.

    Args:
        file: Path to a JSON file whose ``keywords`` object holds the
            ``mentioned_metabolites``, ``mentioned_pathways``, ``mentioned_drugs`` and
            ``mentioned_diseases`` lists, each entry having a ``name``.
    """
    # Read explicitly as UTF-8 so names such as "β-alanine" do not depend on the locale.
    with open(file, encoding='utf-8') as results_file:
        results = json.load(results_file)
    extracted = results['keywords']

    categories = (
        ('Metabolites', 'mentioned_metabolites', ground_truth_metabolites),
        ('Pathways', 'mentioned_pathways', ground_truth_pathways),
        ('Drugs', 'mentioned_drugs', ground_truth_drugs),
        ('Diseases', 'mentioned_diseases', ground_truth_diseases),
    )

    # Compute every comparison before printing, so a malformed file fails before any output.
    comparisons = [
        (
            name,
            confusion_matrix(
                {entry['name'].lower() for entry in extracted[key]},
                {term.lower() for term in ground_truth},
            ),
        )
        for name, key, ground_truth in categories
    ]

    for name, cm in comparisons:
        _, false_pos, false_neg = cm
        print(f'## {name}:\n')
        print_confusion_matrix(cm)
        # Sort the names: set iteration order for strings is not stable across runs.
        print(f'''
FP: {", ".join(sorted(false_pos))}

FN: {", ".join(sorted(false_neg))}

''')

# Report each saved MolGenetMetab run against the ground truth, under its own heading.
for heading, results_path in (
    ('# RUN 1\n', '../experiments/molgenetmetab-run1.json'),
    ('\n\n# RUN 2\n\n', '../experiments/molgenetmetab-run2.json'),
):
    print(heading)
    cmp_with_ground_truth(results_path)

# RUN 1

## Metabolites:

TP=38, FP=34, FN=8

FP: tg(18:1,26:0), hydrophilic metabolites, 10-formyl-thf, ribose-5-phosphate, adenosylmethionine, s-adenosylmethionine (sam), kynurenine, serine, histamine, dodecanedioc acid, inosine monophosphate (imp), hypoxanthine (hxan), glycine, 5,10-methylene thf, prostaglandins, sam, edta, s-adenosylhomocysteine (sah), sah, oxylipins, creatine, lipid-related metabolites, 5,10-methylene-thf, betaine aldehyde, iron-sulfur (fe/s) cluster, uric acid, xanthine, tetrahydrofolate, hydroxypropionylcarnitine (c3.oh), inosine monophosphate, iron, tetrahydrofolate (thf), nicotinamide mononucleotide (nmn), uric acid (ua)

FN: dodecanedioic acid, fatty acids, iron-sulfur clusters, phosphatidylcholine, unsaturated fatty acid, branched chain amino acids, reactive oxygen species, oxylipin


## Pathways:

TP=7, FP=13, FN=7

FP: non-folate-dependent pathways, fatty acid metabolism, glucose utilizing pathways, 1c metabolism pathway, carbohydrate metabolism, electron 