In [None]:
!pip install openai
!pip install pubchempy
!pip install biopython

In [25]:
import os
from openai import OpenAI
import json
import pandas as pd
import pubchempy as pcp
from Bio import Entrez
import time

## Add PMC IDs to the BRENDA database

In [None]:
# Load the BRENDA reaction table. Rows containing any missing value are dropped
# before pubmedId is cast to int (astype(int) cannot convert NaN).
brenda_reactions = pd.read_csv(r"brenda_reactions.csv")
brenda_reactions = brenda_reactions.dropna()
brenda_reactions["pubmedId"] = brenda_reactions["pubmedId"].astype(int)

pubmedIds = [] # for a subset of papers

# Keep only the reactions reported in the selected papers. The explicit .copy()
# makes subset_brenda an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning.
subset_brenda = brenda_reactions[brenda_reactions['pubmedId'].isin(pubmedIds)].copy()

In [None]:
# PubMed ID -> PMC ID (None when no PMC record was found), filled in below.
pubmedId_pmcId_map = {}

# Credentials are read from the environment so they are never stored in the
# notebook itself; with the variables unset the values fall back to the same
# blank placeholders as before.
Entrez.email = os.environ.get("NCBI_EMAIL", "") # from account
Entrez.api_key = os.environ.get("NCBI_API_KEY", "")  # from account

def get_pmc_id_from_pubmed_id(pubmed_id, delay=3):
    """Retrieve PMC ID from PubMed ID using NCBI's E-Link utility.

    Args:
        pubmed_id: PubMed identifier; converted with str() before the request.
        delay: Seconds to sleep after the request to throttle successive
            calls. Defaults to 3, the pause that was previously hard-coded.

    Returns:
        The 'Id' of the first link in the first link set of the response,
        or None (after printing a message) when the response has no link.
    """
    pubmed_id = str(pubmed_id)
    # E-utilities documents its parameter names in lowercase, hence `linkname`.
    handle = Entrez.elink(dbfrom="pubmed", id=pubmed_id, db="pmc", linkname="pubmed_pmc")
    try:
        record = Entrez.read(handle)
    finally:
        # Release the HTTP response even when parsing fails.
        handle.close()

    if delay > 0:
        time.sleep(delay)

    try:
        return record[0]['LinkSetDb'][0]['Link'][0]['Id']
    except (IndexError, KeyError):
        print("No PMC ID found for PubMed ID:", pubmed_id)
        return None

def map_pubmedId_pmcId(pubmedId):
    """Return the PMC ID stored for pubmedId in pubmedId_pmcId_map, or None if it has no entry."""
    try:
        return pubmedId_pmcId_map[pubmedId]
    except KeyError:
        return None

# One E-Link lookup per selected paper; the result (or None) is cached in the map.
for pubmedId in pubmedIds:
    pubmedId_pmcId_map[pubmedId] = get_pmc_id_from_pubmed_id(pubmedId)

# Build a new frame instead of assigning columns into subset_brenda, which is a
# filtered selection of brenda_reactions (column assignment on such a selection
# raises SettingWithCopyWarning).
brenda_df = (
    subset_brenda
    .assign(pmcId=subset_brenda['pubmedId'].apply(map_pubmedId_pmcId))
    .dropna()  # drops the papers for which no PMC ID was found
    .astype({'pmcId': int, 'pubmedId': int})
)
# Keep the previous name bound to the same result, as before.
subset_brenda = brenda_df

## GPT Prompting

In [35]:
# setdefault keeps a key that is already exported in the environment; a plain
# assignment would overwrite it with the blank placeholder.
os.environ.setdefault('OPENAI_API_KEY', "") # from account

def get_rxns_from_gpt(passage, model="gpt-4o", temperature=0, max_tokens=2000):
  """Ask an OpenAI chat model to extract enzymatic reactions from a passage.

  Three messages are sent: a "developer" message describing the extractor role,
  a "user" message with the extraction instructions and the JSON schema, and a
  "user" message containing the passage itself.

  Args:
    passage: Text of the paper to analyze.
    model: Chat model name; defaults to "gpt-4o".
    temperature: Sampling temperature; defaults to 0.
    max_tokens: Upper bound on the reply length; defaults to 2000. A reply
      that is cut off at this limit may not be complete JSON, so raise it
      for papers that describe many reactions.

  Returns:
    The response object returned by client.chat.completions.create; callers
    read the reply text from response.choices[0].message.content.
  """
  # The client is created without arguments, so it presumably picks up the
  # OPENAI_API_KEY environment variable -- confirm against the openai docs.
  client = OpenAI()

  starting_message = """
  You are an expert at extracting structured data on enzymatic reactions from scientific literature. 
  You are also knowledgeable about enzymatic reactions, and are able to extrapolate information where either substrates or products are missing given its corresponding counterpart.
  Your task is to identify all enzymatic reactions that has an enzyme associated with each reaction from the given scientific paper and output them in the specified structured format.
  """

  prompt = """
Extract all enzymatic reactions described in the provided scientific text. Extract information from tables as well, including any substrates, enzymes, or products that are mentioned.

For reactions where only the substrate or product is mentioned but not the corresponding pair, infer the missing product or substrate to the best of your ability.

For each reaction, include the following details.
Substrates: List all substrates involved in the reaction, including any co-factors that play a role in the reaction. If only products are listed without substrates, infer likely substrates based on common enzymatic or biochemical transformations. 
Products: Identify all products formed, and only include major products. If products are not mentioned, use your knowledge of chemistry and enzymatic reactions to infer potential products. For example, if '2-butanone' and 'NADPH' are listed as substrates, you should infer that 'butan-2-ol' is a product considering typical reduction reactions. Likewise, '2-pentanone' and 'NADPH' is likely to give 'pentan-2-ol' as the product. Provide specific inferred products or substrates in chemical names, instead of generic phrases.
Enzymes: Identify any enzymes that catalyze the reaction. If there is no reaction associated with an enzyme, don't report it.
Reversibility: If a reaction is reversible and is explicitly stated, include both the forward and reverse reactions separately, clearly indicating the direction of each reaction.
Comprehensive Output: Ensure that the output is comprehensive and well-structured, capturing all relevant details of potential chemical and enzymatic transformations described in the literature. Use your knowledge to fill gaps where the reactions are not fully described in the text, inferring based on common enzymatic or biochemical transformations.
Format: Output your answer in the JSON format shown below, make sure that the output is in a valid JSON format. If you make an inference because of a missing reaction component such as missing products, fill "Inferred" with "True", otherwise fill it "False".
{
    "enzyme": "(enzyme name)",
    "reaction": {
        "substrate": ["(substrate 1 name)"],
        "product": ["(product 1 name)", "(product 2 name)"]
    },
    "inferred": "(false)"
}
Make sure both keys and string values are enclosed in double quotes ("), not single quotes ('), and there should not be trailing commas after the last item in a JSON object or array.
The text you are meant to analyze begins here:
"""

  messages = [
    {"role": "developer", "content": starting_message},
    {"role": "user", "content": prompt},
    {"role": "user", "content": passage}
  ]

  response = client.chat.completions.create(
    model=model,
    messages=messages,
    temperature=temperature,
    max_tokens=max_tokens
  )

  return response

In [None]:
def _strip_code_fence(text):
    """Return text without a surrounding Markdown code fence, if one is present."""
    lines = (text or "").strip().splitlines()
    if lines and lines[0].startswith("```"):
        lines = lines[1:]
        # Only drop the last line when it really is the closing fence, so a
        # reply that lacks the closing fence does not lose content.
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
    return "\n".join(lines)


# NOTE(review): unique_ids is not defined in any visible cell. Fall back to
# every PMC ID present in brenda_df so the notebook runs on a fresh kernel;
# a value defined elsewhere is left untouched.
if 'unique_ids' not in globals():
    unique_ids = brenda_df['pmcId'].unique()

all_reactions = []

for i, unique_id in enumerate(unique_ids):
    # Filter once per paper instead of once per column.
    # NOTE(review): the 'full_paper' column is not created in any visible cell;
    # presumably it comes from brenda_reactions.csv -- confirm.
    paper_rows = brenda_df[brenda_df['pmcId'] == unique_id]
    pubmedId = paper_rows['pubmedId'].iloc[0]
    pmcId = paper_rows['pmcId'].iloc[0]
    paper = paper_rows['full_paper'].iloc[0]

    reactions = get_rxns_from_gpt(paper)
    # Remove the Markdown code block syntax if present
    json_string = _strip_code_fence(reactions.choices[0].message.content)

    try:
        reactions_data = json.loads(json_string)

        print(reactions_data)

        df_reactions = pd.json_normalize(reactions_data, sep='_')
        df_reactions['pmcId'] = pmcId
        df_reactions['pubmedId'] = pubmedId
        df_reactions['paper'] = paper

        all_reactions.append(df_reactions)
        print(f"Completed {i}")
    except Exception as err:
        # Best effort: skip a paper whose reply cannot be parsed or normalized.
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print(f"pmcId {unique_id} json conversion error: {err}")

if all_reactions:
    full_reactions_df = pd.concat(all_reactions, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty frame
    # with the columns that a successful extraction produces.
    full_reactions_df = pd.DataFrame(
        columns=['enzyme', 'reaction_substrate', 'reaction_product', 'inferred',
                 'pmcId', 'pubmedId', 'paper']
    )
full_reactions_df = full_reactions_df.rename(
    columns={'reaction_substrate': 'substrates', 'reaction_product': 'products'}
)
full_reactions_df.rename(columns={'reaction_substrate': 'substrates', 'reaction_product': 'products'}, inplace=True)

In [None]:
# Write the extracted reactions to gpt_extracted.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
full_reactions_df.to_csv("gpt_extracted.csv", index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2570873b-d089-4ed7-b836-6c2df496af15' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>