In [78]:
from pydantic import BaseModel, Field
from openai import OpenAI
import dotenv
import itertools
import json
from pprint import pprint

from schema import Article, ArticleFragment, Metabolite

In [3]:
dotenv.load_dotenv()
client = OpenAI()

In [4]:
extraction_message = """You are an expert in extracting structured information from medical journal articles.
Identify the following key details: the title, authors, and any mentioned metabolites and pathways.
Present the extracted information in a clear, structured format. Be comprehensive and extract every single
mentioned entity. You will be evaluated on the quality and completeness of the extracted information.

If you are not confident in the identifier for an entity, you can specify it as "unknown". It is better
to include an entity with an "unknown" identified than to omit it entirely."""

consolidation_message = """You are a helpful assistant who is an expert in metabolites and metabolics. """

In [89]:
def is_page_marker(line: str) -> bool:
    try:
        data = json.loads(line.replace("'", '"'))
        return isinstance(data, dict) and 'page' in data and 'source' in data
    except json.decoder.JSONDecodeError:
        return False

def chunk_document_by_page(document):
    '''Chunk the document by page'''
    linegen = iter(document.strip().split('\n'))
    
    # skip lines until we get to the first page marker
    for line in linegen:
        if is_page_marker(line):
            break

    page = []
    for line in linegen:
        if not line.strip():
            continue
        if is_page_marker(line):
            yield '\n'.join(page)
            page = []
        else:
            page.append(line)

    if page:
        yield '\n'.join(page)

def chunk_halfpages(pages):
    for page in pages:
        lines = page.strip().split('\n')
        mid = len(lines) // 2
        yield '\n'.join(lines[:mid])
        yield '\n'.join(lines[mid:])

def iter_halfpages_with_overlap(halfpages):
    '''Chunks a document with an overlap of 1 line.

    The overlap helps identify multi-word metabolite names that get cut off at a line break.
    '''
    idx = 0
    while idx < len(halfpages):
        halfpage = halfpages[idx]
        prefix = ''
        suffix = ''
        if idx > 0:
            prefix = halfpages[idx - 1].split('\n')[-1]
        if idx < len(halfpages) - 1:
            suffix = halfpages[idx + 1].split('\n')[-1]
        
        yield f'{prefix}\n{halfpage}\n{suffix}'
        idx += 1

def ask_openai(document, response_class, model="gpt-4o-2024-08-06", temperature=0, system_message=''):
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": document},
        ],
        response_format=response_class,
    )
    return json.loads(response.choices[0].message.content)

def chunk_and_extract(document, model="gpt-4o-2024-08-06", temperature=0, system_message=extraction_message):
    pages = list(chunk_document_by_page(document))
    halfpages = list(chunk_halfpages(pages))
    responses = []

    if len(pages) == 0:
        return responses

    # get the article authors, title, etc.
    responses.append(
        ask_openai(pages[0], Article, model=model, temperature=temperature, system_message=system_message)
    )

    # iter by overlapped half page
    for chunk in iter_halfpages_with_overlap(halfpages):
        responses.append(
            ask_openai(chunk, ArticleFragment, model=model, temperature=temperature, system_message=system_message)
        )
    return responses

class DedupResponse(BaseModel):
    metabolites: list[Metabolite] = Field(..., title="Metabolites", description="A list of metabolites")

    @property
    def metabolite_names(self):
        return [m.name for m in self.metabolites]

def remove_duplicates(item_list, model="gpt-4o-2024-08-06", temperature=0, system_message=consolidation_message):
    content = f'Please remove all duplicate metabolites from the following list:\n{"\n".join(item_list)}\n'
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": content},
        ],
        response_format=DedupResponse,
    )
    return response

## Extract Data

In [97]:
with open('../data/MolGenetMetab_136_306_2022.txt', 'r') as file:
    contents = file.read()

responses = chunk_and_extract(contents)

In [98]:
print(len(responses))

19


## Remove Duplicate Metabolites

In [82]:
all_metabolites_set = set()
for resp in responses:
    fragment = ArticleFragment.parse_obj(resp)
    for metabolite in fragment.mentioned_metabolites:
        all_metabolites_set.add(metabolite.name)
all_metabolites = list(all_metabolites_set)

/var/folders/jg/0t5c1xb50dz652thhvddz6jh5w38nh/T/ipykernel_9076/4085826527.py:3: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  fragment = ArticleFragment.parse_obj(resp)


In [83]:
all_metabolites = sorted(all_metabolites, key=lambda v: v.lower())
print('\n'.join(all_metabolites))

10-formyl-THF
5,10-methylene THF
5,10-methylene-THF
acetate
Apolipoprotein A-I
arginine
ATP
betaine
betaine aldehyde
carnitine
carnitine (C0)
cholesterol ester
cholesterol ester 20:0
cholesterol ester, CE(20:0)
choline
creatine
dihydroceramide
dimethylglycine
dodecanedioc acid (DiCA.12.0)
formate
frataxin
glucose
glutamate
glycine
histamine
histidine
homocysteine
hydroxypropionylcarnitine (C3.OH)
hypoxanthine
hypoxanthine (HXan)
indoxylsulfate (Ind.SO4)
inosine monophosphate
inosine monophosphate (IMP)
iron
iron-sulfur (Fe/S) cluster
kynurenine
lactate
leucine
lysine
methionine
NAD
NADH
Nicotinamide mononucleotide
nicotinamide mononucleotide (NMN)
ornithine
oxylipins
phosphatidylcholine
phosphocholines
phosphocreatine
Prostaglandins
pyruvate
ribose-5-phosphate
S-adenosylhomocysteine
S-adenosylhomocysteine (SAH)
S-adenosylmethionine
S-adenosylmethionine (SAM)
SAH
SAM
sarcosine
serine
succinate
taurine
tetrahydrofolate
tetrahydrofolate (THF)
TG(18:1,26:0)
threonine
triglycerides
unknown


In [99]:
dedup_response_raw = remove_duplicates(all_metabolites)

In [100]:
dedup_response = DedupResponse.parse_raw(dedup_response_raw.choices[0].message.content)
print(dedup_response.metabolites)

[Metabolite(name='10-formyl-THF', chebi_id='CHEBI:15636'), Metabolite(name='5,10-methylene THF', chebi_id='CHEBI:15637'), Metabolite(name='acetate', chebi_id='CHEBI:30089'), Metabolite(name='Apolipoprotein A-I', chebi_id='CHEBI:30090'), Metabolite(name='arginine', chebi_id='CHEBI:29016'), Metabolite(name='ATP', chebi_id='CHEBI:15422'), Metabolite(name='betaine', chebi_id='CHEBI:17750'), Metabolite(name='betaine aldehyde', chebi_id='CHEBI:15710'), Metabolite(name='carnitine', chebi_id='CHEBI:16347'), Metabolite(name='cholesterol ester', chebi_id='CHEBI:17002'), Metabolite(name='choline', chebi_id='CHEBI:15354'), Metabolite(name='creatine', chebi_id='CHEBI:16919'), Metabolite(name='dihydroceramide', chebi_id='CHEBI:17003'), Metabolite(name='dimethylglycine', chebi_id='CHEBI:17100'), Metabolite(name='dodecanedioc acid (DiCA.12.0)', chebi_id='CHEBI:17101'), Metabolite(name='formate', chebi_id='CHEBI:15740'), Metabolite(name='frataxin', chebi_id='CHEBI:17102'), Metabolite(name='glucose', ch

/var/folders/jg/0t5c1xb50dz652thhvddz6jh5w38nh/T/ipykernel_9076/1473257754.py:1: PydanticDeprecatedSince20: The `parse_raw` method is deprecated; if your data is JSON use `model_validate_json`, otherwise load the data then use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  dedup_response = DedupResponse.parse_raw(dedup_response_raw.choices[0].message.content)


## Evaluate Response for Metabolites

In [13]:
ground_truth_metabolites = """formate
sarcosine
hypoxanthine
homocysteine
ATP
NAD
NADH
fatty acids
dihydroceramide
unsaturated fatty acid
phosphocholines
cholesterol ester
hydroxypropionylcarnitine
acetate
dodecanedioic acid
indoxylsulfate
arginine
glucose
carnitine
glutamate
lysine
histidine
branched chain amino acids
leucine
valine
choline
threonine
ornithine
lactate
succinate
cholesterol ester 20:0
triglycerides
phosphatidylcholine
betaine
methionine
s-adenosylmethionine
s-adenosylhomocysteine
dimethylglycine
iron-sulfur clusters
phosphocreatine
nicotinamide mononucleotide
oxylipin
reactive oxygen species
taurine
β-alanine
pyruvate
""".strip().split("\n")

ground_truth_pathways = """iron-sulfur cluster biogenesis
cellular energy metabolism
mitochondrial electron transport chain
krebs cycle
electron transfer flavoprotein
nuclear gene expression
glycolysis
carbohydrate and fatty acid metabolism
energy metabolism
one-carbon metabolism
folate cycle
methionine salvage
purine nucleotide salvage and synthesis
pyruvate metabolism
""".split("\n")

ground_truth_proteins = [
    "frataxin",
]

ground_truth_drugs = [
    "Etravirine",
    "Resveratrol",
    "SS-31",
    "deferoxamine",
    "BAPTA-AM",
    "antioxidants",
]

ground_truth_diseases = [
    "friedreich ataxia",
    "dyslipidemia",
    "pre-diabetic state",
    "diabetes"
]

In [22]:
def compare_lists(observed: list, expected: list, cmp=lambda obs, exp: obs == exp):
    # contains observed items
    true_positives = []
    # contains observed items
    false_positives = []
    # contains expected items
    false_negatives = []

    def find(needle, haystack):
        for x in haystack:
            if cmp(needle, x):
                return True
        return False

    for obs in observed:
        if find(obs, expected):
            true_positives.append(obs)
        else:
            false_positives.append(obs)

    for exp in expected:
        if not find(exp, observed):
            false_negatives.append(exp)

    return {
        'truepos': true_positives,
        'falsepos': false_positives,
        'falseneg': false_negatives,
    }

In [92]:
def similar(a: str, b: str) -> bool:
    a = a.lower()
    b = b.lower()
    return a == b

compare_results = compare_lists(
    dedup_response.metabolite_names,
    ground_truth_metabolites,
    cmp=similar,
)

print(f'True positives: {len(compare_results['truepos'])}')
print(f'False positives: {len(compare_results['falsepos'])}')
print(f'False negatives: {len(compare_results['falseneg'])}')
print(f'Ground truth count: {len(ground_truth_metabolites)}')

True positives: 35
False positives: 24
False negatives: 11
Ground truth count: 46


### True positives

In [93]:
print('\n'.join(compare_results['truepos']))

acetate
arginine
ATP
betaine
carnitine
cholesterol ester
choline
dihydroceramide
dimethylglycine
formate
glucose
glutamate
histidine
homocysteine
hypoxanthine
lactate
leucine
lysine
methionine
NAD
NADH
Nicotinamide mononucleotide
ornithine
phosphatidylcholine
phosphocreatine
pyruvate
S-adenosylhomocysteine
S-adenosylmethionine
sarcosine
succinate
taurine
threonine
triglycerides
valine
β-alanine


### False positives

In [17]:
print('\n'.join(compare_results['falsepos']))

10-formyl-THF
5,10-methylene THF
Apolipoprotein A-I
betaine aldehyde
creatine
glycine
histamine
inosine monophosphate
kynurenine
oxylipins
ribose-5-phosphate
S-adenosylhomocysteine (SAH)
S-adenosylmethionine (SAM)
serine
tetrahydrofolate (THF)
uric acid
xanthine


### False negatives

In [18]:
print('\n'.join(compare_results['falseneg']))

fatty acids
unsaturated fatty acid
branched chain amino acids
cholesterol ester 20:0
phosphatidylcholine
s-adenosylmethionine
s-adenosylhomocysteine
iron-sulfur clusters
oxylipin
reactive oxygen species


## Analysis

- Some false negatives were actually found (e.g. s-adenosylmethionine, oxylipin). This is a result of a naive text similarity comparison approach.
- Chunking by page could potentially miss multi-word metabolites that straddle a page break. Chunking by paragraph or sentence is more ideal, but the PDF text extraction output doesn't support such chunking reliably.
- Smaller chunking might help pare down the false negatives list.

## GPT compare lists

In [72]:
class CompareResponse(BaseModel):
    true_positives: list[str] = Field(..., title="True Positives", description="A list of true positives")
    false_positives: list[str] = Field(..., title="False Positives", description="A list of false positives")
    false_negatives: list[str] = Field(..., title="False Negatives", description="A list of false negatives")

def gpt_compare_lists(observed: list, expected: list, model="gpt-4o-2024-08-06", temperature=0, system_message='You are an expert in metabolomics.'):
    content = f'''Identify the true positives, false positives, and false negatives between the ground truth and the observed lists, normalizing the metabolite names on plurality and abbreviations.

# Ground Truth
{'\n'.join(expected)}

# Observed
{'\n'.join(observed)}'''
    response = client.beta.chat.completions.parse(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": content},
        ],
        response_format=CompareResponse,
    )
    return response

In [94]:
compare_result = gpt_compare_lists(
    dedup_response.metabolite_names,
    ground_truth_metabolites,
)

In [95]:
result = CompareResponse.parse_obj(json.loads(compare_result.choices[0].message.content))

print(f'''True positives ({len(result.true_positives)})
===
{'\n'.join(result.true_positives)}

False positives ({len(result.false_positives)})
===
{'\n'.join(result.false_positives)}

False negatives ({len(result.false_negatives)})
===
{'\n'.join(result.false_negatives)}
''')

True positives (38)
===
formate
sarcosine
hypoxanthine
homocysteine
ATP
NAD
NADH
dihydroceramide
cholesterol ester
hydroxypropionylcarnitine
acetate
glucose
carnitine
glutamate
lysine
histidine
leucine
valine
choline
threonine
ornithine
lactate
succinate
triglycerides
phosphatidylcholine
betaine
methionine
s-adenosylmethionine
s-adenosylhomocysteine
dimethylglycine
iron-sulfur clusters
phosphocreatine
nicotinamide mononucleotide
oxylipin
taurine
β-alanine
pyruvate
arginine

False positives (19)
===
10-formyl-THF
5,10-methylene THF
Apolipoprotein A-I
betaine aldehyde
creatine
frataxin
glycine
histamine
inosine monophosphate
iron
kynurenine
oxylipins
Prostaglandins
ribose-5-phosphate
tetrahydrofolate
TG(18:1,26:0)
unknown
uric acid
xanthine

False negatives (7)
===
fatty acids
unsaturated fatty acid
phosphocholines
dodecanedioic acid
branched chain amino acids
cholesterol ester 20:0
reactive oxygen species



/var/folders/jg/0t5c1xb50dz652thhvddz6jh5w38nh/T/ipykernel_9076/2666098585.py:1: PydanticDeprecatedSince20: The `parse_obj` method is deprecated; use `model_validate` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  result = CompareResponse.parse_obj(json.loads(compare_result.choices[0].message.content))


In [96]:
for metabolite in dedup_response.metabolites:
    print(metabolite)

name='10-formyl-THF' chebi_id='CHEBI:15636'
name='5,10-methylene THF' chebi_id='CHEBI:15637'
name='acetate' chebi_id='CHEBI:30089'
name='Apolipoprotein A-I' chebi_id='CHEBI:30090'
name='arginine' chebi_id='CHEBI:29016'
name='ATP' chebi_id='CHEBI:15422'
name='betaine' chebi_id='CHEBI:17750'
name='betaine aldehyde' chebi_id='CHEBI:15710'
name='carnitine' chebi_id='CHEBI:16347'
name='cholesterol ester' chebi_id='CHEBI:17002'
name='choline' chebi_id='CHEBI:15354'
name='creatine' chebi_id='CHEBI:16919'
name='dihydroceramide' chebi_id='CHEBI:17051'
name='dimethylglycine' chebi_id='CHEBI:17100'
name='dodecanedioc acid (DiCA.12.0)' chebi_id='CHEBI:17101'
name='formate' chebi_id='CHEBI:15740'
name='frataxin' chebi_id='CHEBI:17102'
name='glucose' chebi_id='CHEBI:17234'
name='glutamate' chebi_id='CHEBI:16015'
name='glycine' chebi_id='CHEBI:15428'
name='histamine' chebi_id='CHEBI:18295'
name='histidine' chebi_id='CHEBI:27570'
name='homocysteine' chebi_id='CHEBI:17592'
name='hydroxypropionylcarniti

- the "unknown" metabolite has a CHEBI ID linked to erythritol, which does not seem to appear in the text.