In [53]:
from pydantic import BaseModel, Field
from openai import OpenAI
import dotenv
import itertools
import json

from schema import Article, ArticleFragment

In [5]:
# Load environment variables via python-dotenv before building the client.
# NOTE(review): presumably this reads a local `.env` file that supplies the
# OpenAI credentials — confirm; no key is passed explicitly below.
dotenv.load_dotenv()
# Shared OpenAI client used by chunk_and_extract() and remove_duplicates().
client = OpenAI()

In [47]:
# System prompt for the per-page extraction calls; it is the default
# `system_message` of chunk_and_extract().
# Fixed typo in the last sentence: '"unknown" identified' -> '"unknown" identifier'.
extraction_message = """You are an expert in extracting structured information from medical journal articles.
Identify the following key details: the title, authors, and any mentioned metabolites and pathways.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.

If you are not confident in the identifier for an entity, you can specify it as "unknown". It is better
to include an entity with an "unknown" identifier than to omit it entirely."""

# System prompt for the de-duplication call; it is the default
# `system_message` of remove_duplicates().
# NOTE(review): "metabolics" looks like a typo (for "metabolomics" or
# "metabolism"?) — left unchanged here; confirm the intended wording.
consolidation_message = """You are a helpful assistant who is an expert in metabolites and metabolics. """

In [93]:
def is_page_marker(line: str) -> bool:
    """Return True if ``line`` is a page-marker record.

    A page marker is a line that parses to a dict containing both a
    ``'page'`` and a ``'source'`` key. The line may be written as a Python
    dict literal (single-quoted strings, ``None``/``True``/``False``) or as a
    JSON object.

    Parsing is attempted in this order, stopping at the first parser that
    yields such a dict:

    1. ``ast.literal_eval`` — handles Python dict literals, including string
       values that themselves contain an apostrophe or a double quote.
    2. ``json.loads`` on the text as-is.
    3. ``json.loads`` after swapping every ``'`` for ``"`` — the previous
       heuristic, kept so that every line accepted before is still accepted.

    Args:
        line: A single line of the document text.

    Returns:
        True when the line is a page marker, False otherwise (including when
        no parser can read the line at all).
    """
    import ast  # local import: only this helper needs it

    text = line.strip()
    # Cheap pre-filter: a dict literal and a JSON object are both
    # brace-delimited, so ordinary article text never reaches the parsers.
    if not (text.startswith('{') and text.endswith('}')):
        return False

    parsers = (
        ast.literal_eval,
        json.loads,
        lambda s: json.loads(s.replace("'", '"')),
    )
    for parse in parsers:
        try:
            data = parse(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # json.JSONDecodeError is a ValueError subclass; the remaining
            # types are what ast.literal_eval raises on malformed input.
            continue
        if isinstance(data, dict) and 'page' in data and 'source' in data:
            return True
    return False

def chunk_document(document):
    '''Chunk the document by page.

    The document is split into lines. Everything up to and including the
    first page marker (see is_page_marker) is discarded; after that, each
    page marker ends the current page and starts a new one. Blank lines are
    dropped and the remaining lines of a page are joined with newlines.

    Pages with no content (for example two adjacent page markers) are
    skipped, so no empty string is ever yielded. This matches the existing
    handling of an empty final page and keeps an empty leading page from
    being treated as the first page by callers.

    If the document contains no page marker at all, nothing is yielded.

    Args:
        document: Full document text containing page-marker lines.

    Yields:
        str: The text of each non-empty page, in document order.
    '''
    linegen = iter(document.strip().split('\n'))

    # skip lines until we get to the first page marker
    for line in linegen:
        if is_page_marker(line):
            break

    page = []
    for line in linegen:
        if not line.strip():
            continue
        if is_page_marker(line):
            # Only emit pages that actually collected some text.
            if page:
                yield '\n'.join(page)
            page = []
        else:
            page.append(line)

    if page:
        yield '\n'.join(page)

def chunk_and_extract(document, model="gpt-4o-2024-08-06", temperature=0, system_message=extraction_message):
    """Run structured extraction on every page chunk of ``document``.

    The document is split with chunk_document(). The first chunk is requested
    with the ``Article`` response format and every later chunk with
    ``ArticleFragment``. One chat-completion call is made per chunk.

    Args:
        document: Full document text, including page-marker lines.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        system_message: System prompt sent with every chunk.

    Returns:
        list[dict]: One decoded JSON object per chunk, in page order.

    Raises:
        ValueError: If the model refuses a chunk or returns no content. The
            message names the chunk so the failing page can be found.
    """
    responses = []
    # First chunk -> Article, all following chunks -> ArticleFragment.
    formats = itertools.chain([Article], itertools.repeat(ArticleFragment))
    for page_number, (page, response_format) in enumerate(
        zip(chunk_document(document), formats), start=1
    ):
        response = client.beta.chat.completions.parse(
            model=model,
            temperature=temperature,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": page},
            ],
            response_format=response_format,
        )
        message = response.choices[0].message
        # A structured-output reply can carry a refusal instead of JSON
        # content; surface that explicitly rather than failing in json.loads.
        refusal = getattr(message, "refusal", None)
        if refusal:
            raise ValueError(f"Model refused to process page chunk {page_number}: {refusal}")
        if message.content is None:
            raise ValueError(f"Model returned no content for page chunk {page_number}")
        responses.append(json.loads(message.content))
    return responses

# Structured-output schema for the de-duplication call: passed as
# `response_format` in remove_duplicates() and used afterwards to parse the
# reply content.
class DedupResponse(BaseModel):
    # The metabolite names returned by the model; required field (`...` means
    # no default).
    metabolites: list[str] = Field(..., title="Metabolites", description="A list of metabolites")

def remove_duplicates(content, model="gpt-4o-2024-08-06", temperature=0, system_message=consolidation_message):
    """Send ``content`` to the chat model with the DedupResponse schema.

    Args:
        content: User message text (the instruction plus the list to clean).
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        system_message: System prompt for the call.

    Returns:
        The raw completion object returned by the client; the caller is
        responsible for reading ``choices[0].message`` from it.
    """
    conversation = [
        dict(role="system", content=system_message),
        dict(role="user", content=content),
    ]
    return client.beta.chat.completions.parse(
        messages=conversation,
        response_format=DedupResponse,
        temperature=temperature,
        model=model,
    )

## Extract Data

In [16]:
# Read the article text. The encoding is given explicitly because the text
# contains non-ASCII characters (e.g. "β-alanine" shows up among the
# extracted metabolites) and the platform default encoding is not UTF-8
# everywhere.
with open('../data/MolGenetMetab_136_306_2022.txt', 'r', encoding='utf-8') as file:
    contents = file.read()

# One extraction API call is made per page chunk of the document.
responses = chunk_and_extract(contents)

In [17]:
# Number of extraction results; chunk_and_extract appends one per page chunk.
print(len(responses))

9


## Remove Duplicate Metabolites

In [24]:
# Collect the distinct metabolite names (exact string match) across all
# page-level extraction results.
all_metabolites_set = set()
for resp in responses:
    # model_validate is the Pydantic v2 replacement for the deprecated
    # parse_obj. Every result — including the first one, which was requested
    # with the Article schema — is read through ArticleFragment here.
    fragment = ArticleFragment.model_validate(resp)
    for metabolite in fragment.mentioned_metabolites:
        all_metabolites_set.add(metabolite.name)
all_metabolites = list(all_metabolites_set)

/var/folders/jg/0t5c1xb50dz652thhvddz6jh5w38nh/T/ipykernel_60902/4085826527.py:3: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  fragment = ArticleFragment.parse_obj(resp)


In [48]:
# Order the names alphabetically, ignoring case, then list them one per line.
all_metabolites = sorted(all_metabolites, key=str.lower)
print(*all_metabolites, sep='\n')

10-formyl-THF
5,10-methylene THF
5,10-methylene-THF
acetate
Apolipoprotein A-I
arginine
ATP
betaine
betaine aldehyde
carnitine
cholesterol ester
choline
creatine
dihydroceramide
dimethylglycine
dodecanedioic acid
formate
glucose
glutamate
glycine
histamine
histidine
homocysteine
hydroxypropionylcarnitine
hypoxanthine
indoxylsulfate
inosine monophosphate (IMP)
kynurenine
lactate
leucine
lysine
methionine
NAD
NADH
Nicotinamide mononucleotide
nicotinamide mononucleotide (NMN)
ornithine
oxylipins
phosphocholines
phosphocreatine
pyruvate
ribose-5-phosphate
S-adenosylhomocysteine (SAH)
S-adenosylmethionine (SAM)
SAH
SAM
sarcosine
serine
succinate
taurine
tetrahydrofolate (THF)
THF
threonine
triglycerides
uric acid
Uric acid
valine
xanthine
β-alanine


In [63]:
# Ask the model to collapse duplicate names in the sorted list.
# Two fixes relative to the previous prompt construction:
#   * the instruction was missing its verb ("Please all duplicate ...");
#   * the list is joined outside an f-string, because a backslash and reused
#     quotes inside an f-string expression only parse on Python 3.12+.
dedup_response_raw = remove_duplicates(
    'Please remove all duplicate metabolites from the following list:\n'
    + '\n'.join(all_metabolites)
    + '\n'
)

In [64]:
# Parse the JSON reply into the DedupResponse schema. model_validate_json is
# the Pydantic v2 replacement for the deprecated parse_raw.
dedup_response = DedupResponse.model_validate_json(dedup_response_raw.choices[0].message.content)
print(dedup_response.metabolites)

['10-formyl-THF', '5,10-methylene THF', 'acetate', 'Apolipoprotein A-I', 'arginine', 'ATP', 'betaine', 'betaine aldehyde', 'carnitine', 'cholesterol ester', 'choline', 'creatine', 'dihydroceramide', 'dimethylglycine', 'dodecanedioic acid', 'formate', 'glucose', 'glutamate', 'glycine', 'histamine', 'histidine', 'homocysteine', 'hydroxypropionylcarnitine', 'hypoxanthine', 'indoxylsulfate', 'inosine monophosphate (IMP)', 'kynurenine', 'lactate', 'leucine', 'lysine', 'methionine', 'NAD', 'NADH', 'Nicotinamide mononucleotide', 'ornithine', 'oxylipins', 'phosphocholines', 'phosphocreatine', 'pyruvate', 'ribose-5-phosphate', 'S-adenosylhomocysteine (SAH)', 'S-adenosylmethionine (SAM)', 'sarcosine', 'serine', 'succinate', 'taurine', 'tetrahydrofolate (THF)', 'threonine', 'triglycerides', 'uric acid', 'valine', 'xanthine', 'β-alanine']


/var/folders/jg/0t5c1xb50dz652thhvddz6jh5w38nh/T/ipykernel_60902/1473257754.py:1: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  dedup_response = DedupResponse.parse_raw(dedup_response_raw.choices[0].message.content)


## Evaluate Response for Metabolites

In [100]:
# Reference metabolite names, one per line; this is the `expected` list in
# the comparison against the de-duplicated model output.
ground_truth_metabolites = """formate
sarcosine
hypoxanthine
homocysteine
ATP
NAD
NADH
fatty acids
dihydroceramide
unsaturated fatty acid
phosphocholines
cholesterol ester
hydroxypropionylcarnitine
acetate
dodecanedioic acid
indoxylsulfate
arginine
glucose
carnitine
glutamate
lysine
histidine
branched chain amino acids
leucine
valine
choline
threonine
ornithine
lactate
succinate
cholesterol ester 20:0
triglycerides
phosphatidylcholine
betaine
methionine
s-adenosylmethionine
s-adenosylhomocysteine
dimethylglycine
iron-sulfur clusters
phosphocreatine
nicotinamide mononucleotide
oxylipin
reactive oxygen species
taurine
β-alanine
pyruvate
""".strip().splitlines()

# Reference pathway names, one per line.
# The text is stripped before splitting (as for ground_truth_metabolites) so
# the trailing newline does not produce an empty-string entry at the end.
ground_truth_pathways = """iron-sulfur cluster biogenesis
cellular energy metabolism
mitochondrial electron transport chain
krebs cycle
electron transfer flavoprotein
nuclear gene expression
glycolysis
carbohydrate and fatty acid metabolism
energy metabolism
one-carbon metabolism
folate cycle
methionine salvage
purine nucleotide salvage and synthesis
pyruvate metabolism
""".strip().split("\n")

# Reference lists for the other entity types. None of these three lists is
# used by the cells visible in this notebook section, which only score
# metabolites.

# Reference protein names.
ground_truth_proteins = [
    "frataxin",
]

# Reference drug / compound names.
ground_truth_drugs = [
    "Etravirine",
    "Resveratrol",
    "SS-31",
    "deferoxamine",
    "BAPTA-AM",
    "antioxidants",
]

# Reference disease / condition names.
ground_truth_diseases = [
    "friedreich ataxia",
    "dyslipidemia",
    "pre-diabetic state",
    "diabetes"
]

In [87]:
def compare_lists(observed: list, expected: list, cmp=lambda obs, exp: obs == exp):
    """Classify observed items against expected items.

    Args:
        observed: Items produced by the system under evaluation.
        expected: Reference items.
        cmp: Match predicate called as ``cmp(obs, exp)``; returns True when
            an observed item counts as the given expected item. Defaults to
            equality. It is always called with the observed item first, so an
            asymmetric predicate (e.g. prefix or substring matching) gives
            consistent results in both passes.

    Returns:
        dict with three lists:
            'truepos'  - observed items that match at least one expected item,
            'falsepos' - observed items that match no expected item,
            'falseneg' - expected items that no observed item matches.
        Order follows the input lists; duplicates are not collapsed.
    """
    # contains observed items
    true_positives = []
    # contains observed items
    false_positives = []

    for obs in observed:
        if any(cmp(obs, exp) for exp in expected):
            true_positives.append(obs)
        else:
            false_positives.append(obs)

    # contains expected items; note the argument order stays (obs, exp)
    false_negatives = [
        exp for exp in expected
        if not any(cmp(obs, exp) for obs in observed)
    ]

    return {
        'truepos': true_positives,
        'falsepos': false_positives,
        'falseneg': false_negatives,
    }

In [102]:
def similar(a: str, b: str) -> bool:
    """Return True when the two strings are equal after lower-casing both."""
    return a.lower() == b.lower()

# Score the de-duplicated model output against the reference metabolites
# using case-insensitive exact matching.
compare_results = compare_lists(
    dedup_response.metabolites,
    ground_truth_metabolites,
    cmp=similar,
)

# Double-quoted f-strings: reusing the enclosing quote type inside the
# replacement field is a SyntaxError before Python 3.12.
print(f"True positives: {len(compare_results['truepos'])}")
print(f"False positives: {len(compare_results['falsepos'])}")
print(f"False negatives: {len(compare_results['falseneg'])}")
print(f"Ground truth count: {len(ground_truth_metabolites)}")

True positives: 36
False positives: 17
False negatives: 10
Ground truth count: 46


### True positives

In [106]:
# Observed names that matched a reference metabolite, one per line.
print(*compare_results['truepos'], sep='\n')

acetate
arginine
ATP
betaine
carnitine
cholesterol ester
choline
dihydroceramide
dimethylglycine
dodecanedioic acid
formate
glucose
glutamate
histidine
homocysteine
hydroxypropionylcarnitine
hypoxanthine
indoxylsulfate
lactate
leucine
lysine
methionine
NAD
NADH
Nicotinamide mononucleotide
ornithine
phosphocholines
phosphocreatine
pyruvate
sarcosine
succinate
taurine
threonine
triglycerides
valine
β-alanine


### False positives

In [104]:
# Observed names with no matching reference metabolite, one per line.
print(*compare_results['falsepos'], sep='\n')

10-formyl-THF
5,10-methylene THF
Apolipoprotein A-I
betaine aldehyde
creatine
glycine
histamine
inosine monophosphate (IMP)
kynurenine
oxylipins
ribose-5-phosphate
S-adenosylhomocysteine (SAH)
S-adenosylmethionine (SAM)
serine
tetrahydrofolate (THF)
uric acid
xanthine


### False negatives

In [105]:
# Reference metabolites that no observed name matched, one per line.
print(*compare_results['falseneg'], sep='\n')

fatty acids
unsaturated fatty acid
branched chain amino acids
cholesterol ester 20:0
phosphatidylcholine
s-adenosylmethionine
s-adenosylhomocysteine
iron-sulfur clusters
oxylipin
reactive oxygen species


## Analysis

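- Some of the reported false negatives were actually extracted (e.g. s-adenosylmethionine, oxylipin), just under a different surface form (`S-adenosylmethionine (SAM)`, `oxylipins`), so the corresponding extracted names are also counted as false positives. This is an artifact of the naive comparison, which only matches names that are identical after lower-casing.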
- Chunking by page could miss multi-word metabolites that straddle a page break. Chunking by paragraph or sentence would be preferable, but the PDF text extraction output doesn't support such chunking reliably.
- Smaller chunks might help reduce the number of false negatives.