In [15]:
# Install/upgrade the notebook's dependencies into the kernel's environment.
# NOTE(review): no versions are pinned and -U upgrades to the newest release, so a re-run
# may pick up different library versions than the ones that produced the saved outputs.
%pip install -qU tiktoken pypdf openai langchain langchain-text-splitters langchain-openai langchain-core python-dotenv pydantic langchain_community


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [34]:
import itertools
import json
from pprint import pprint
from pathlib import Path
import sys

import tiktoken
from pydantic import BaseModel, Field
from openai import OpenAI
import dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document

sys.path.append('..')
from viime_extract.schema import ArticleMeta, ArticleKeyWords, Metabolite

In [2]:
# Model identifier; used both to build the chat client and to pick the tiktoken
# encoding for the text splitter in extract_article_keywords.
LLM_MODEL_NAME = 'gpt-4o-2024-08-06'
# Folder holding the source PDFs, given relative to the notebook's working directory.
PDF_DIR = Path('../data/PubMed LongCovid and Metabolomics Results')

In [37]:
# Load variables from a .env file into the process environment
# (presumably this is where the OpenAI API key comes from — confirm).
dotenv.load_dotenv()
# Shared chat model. temperature=0 keeps extraction output as repeatable as the API allows.
# NOTE: the extract_* functions use `model=LLM` as a default argument, which Python binds
# when the function is defined — after re-running this cell, re-run the function cell too.
LLM = ChatOpenAI(model=LLM_MODEL_NAME, temperature=0)

In [18]:
def get_pdf_pages(pdf_file: Path):
    """Read a PDF and return every Document its loader yields, as a list.

    The loader's lazy iterator is drained eagerly so callers receive a plain
    list they can index and pass around.
    """
    loader = PyPDFLoader(pdf_file)
    return [page for page in loader.lazy_load()]

def extract_article_metadata(pages: list[Document], model=LLM):
    """Ask the model for an article's bibliographic metadata.

    Only the text of ``pages[0]`` is sent; the function assumes the first page
    carries everything that is requested (title, authors, journal, year,
    volume, DOI, PubMed ID).

    Args:
        pages: Documents of the article; must contain at least one entry.
        model: Chat model exposing ``with_structured_output``.

    Returns:
        Whatever the model's structured-output call yields for the
        ``ArticleMeta`` schema.
    """
    # assumption: first page contains all of the metadata
    opening_page_text = pages[0].page_content

    system_message = '''You are an expert in extracting structured information from medical journal articles.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.'''
    user_message = 'Please extract the title, authors, journal title, publication year, journal volume, DOI ID, and pubmed ID from the following journal article:\n\n{article_contents}'

    chat_template = ChatPromptTemplate([
        ('system', system_message),
        ('user', user_message),
    ])
    filled_prompt = chat_template.invoke({'article_contents': opening_page_text})

    metadata_extractor = model.with_structured_output(ArticleMeta)
    return metadata_extractor.invoke(filled_prompt)

def extract_article_keywords(pages: list[Document], model=LLM):
    """Extract the entities mentioned anywhere in an article.

    The pages are split into overlapping, token-bounded chunks. Each chunk's
    text is sent to the model with the ``ArticleKeyWords`` schema, and the
    per-chunk responses are folded together with ``ArticleKeyWords.merge``.

    Args:
        pages: Documents of the article.
        model: Chat model exposing ``with_structured_output``.

    Returns:
        The ``ArticleKeyWords`` produced by merging every chunk's response
        into an initially empty ``ArticleKeyWords()``; that empty instance is
        returned unchanged when the splitter yields no chunks.
    """
    template = ChatPromptTemplate([
        ('system', '''You are an expert in extracting structured information from medical journal articles.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.

If you are not confident in the identifier for an entity, you can specify it as "unknown". It is better
to include an entity with an "unknown" identified than to omit it entirely.'''),
        ('user', 'Please extract the metabolites, proteins, genes, pathways, drugs, and diseases mentioned in the following journal article:\n\n{article_contents}'),
    ])

    # NOTE: the token encoding is chosen from the module-level LLM_MODEL_NAME, not from
    # `model`; if a different model is passed in, chunk sizes are only approximate.
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        model_name=LLM_MODEL_NAME,
        chunk_size=1024,
        chunk_overlap=128,
    )

    # The schema binding does not depend on the chunk, so build it once.
    extractor = model.with_structured_output(ArticleKeyWords)

    keywords = ArticleKeyWords()

    for text_chunk in splitter.split_documents(pages):
        # Send only the chunk's text. Interpolating the Document object itself would
        # insert its string form, which wraps the text and appends the metadata
        # (e.g. the source file path) — noise the model may extract entities from.
        prompt = template.invoke({'article_contents': text_chunk.page_content})
        keywords = keywords.merge(extractor.invoke(prompt))

    return keywords

In [5]:
# Load one sample article (LC_81.pdf) from PDF_DIR as a list of Documents.
LC_81_pages = get_pdf_pages(PDF_DIR / 'LC_81.pdf')

In [44]:
# Bibliographic metadata for the sample article; only its first page is sent to the model.
article_meta = extract_article_metadata(LC_81_pages, model=LLM)

In [8]:
# Entity extraction over the whole article: one structured-output LLM request per text
# chunk, so this is the costly cell to re-run.
article_keywords = extract_article_keywords(LC_81_pages, model=LLM)

In [33]:
# Inspect the extracted metabolites. In the saved output chebi_id is None or 'unknown'
# where the model supplied no identifier, and several entries are clinical lab
# abbreviations (LDH, CRP, ALT, ...) rather than metabolite names.
article_keywords.mentioned_metabolites

[Metabolite(name='triacylglycerols', chebi_id=None),
 Metabolite(name='phosphatidylcholines', chebi_id=None),
 Metabolite(name='prostaglandin E2', chebi_id=None),
 Metabolite(name='arginine', chebi_id='CHEBI:29016'),
 Metabolite(name='betain', chebi_id='CHEBI:17750'),
 Metabolite(name='adenosine', chebi_id='CHEBI:16335'),
 Metabolite(name='total bilirubin', chebi_id='CHEBI:17587'),
 Metabolite(name='direct bilirubin', chebi_id='CHEBI:17012'),
 Metabolite(name='albumin', chebi_id='CHEBI:16199'),
 Metabolite(name='globulin', chebi_id='CHEBI:36080'),
 Metabolite(name='Cys-C', chebi_id='unknown'),
 Metabolite(name='LDH', chebi_id=None),
 Metabolite(name='CRP', chebi_id=None),
 Metabolite(name='TBIL', chebi_id=None),
 Metabolite(name='DBIL', chebi_id=None),
 Metabolite(name='ALT', chebi_id=None),
 Metabolite(name='AST', chebi_id=None),
 Metabolite(name='ALP', chebi_id=None),
 Metabolite(name='GGT', chebi_id=None),
 Metabolite(name='TP', chebi_id=None),
 Metabolite(name='Albumin', chebi_id=N

In [15]:
# NOTE(review): this call is identical to the one that produced `article_keywords`. The
# variable name suggests a run without the "unknown"-identifier instruction, but the
# function defined in this notebook always includes that instruction in its system
# prompt — confirm which definition was live when this cell ran (execution counts are
# out of order), or expose the prompt variant as an explicit parameter.
keywords_no_unknown = extract_article_keywords(LC_81_pages, model=LLM)

In [29]:
# Number of disease entries accumulated across all chunks of the sample article.
len(article_keywords.mentioned_diseases)

64