In [None]:
!pip install rdkit-pypi
!pip install pubchempy

import json
import os
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import pubchempy as pcp
import requests
from IPython.display import Image
from rdkit import Chem
from rdkit.Chem import Draw

In [None]:
def _split_reaction_side(cell):
    """Split one side of a reaction ("A + B + C") into a list of chemical names.

    Args:
        cell: Raw cell value from the ``substrates`` or ``products`` column.

    Returns:
        list[str]: The names separated by " + ". A missing cell (NaN/None)
        yields an empty list instead of raising (NaN is a float and has no
        ``.split``) or producing the bogus chemical name "nan".
    """
    if pd.isna(cell):
        return []
    # str() keeps non-string cells (e.g. a purely numeric name) splittable.
    return str(cell).split(" + ")


df = pd.read_csv("abstracts.csv")
# Both reaction sides go through the same splitter so they handle missing
# cells identically.
df['substrates_split'] = df['substrates'].map(_split_reaction_side)
df['products_split'] = df['products'].map(_split_reaction_side)

In [None]:
# Module-level bookkeeping shared by get_smiles() and
# process_chemical_column_brenda(). Re-running this cell resets all of it.

# (name, SMILES) tuples for names PubChem resolved.
pubchem_matches = set()
# (name, SMILES) tuples for names CACTUS resolved.
cactus_matches = set()
# Names get_smiles() has already looked up; it skips these on later calls.
queried = set()
# Names for which the PubChem lookup returned nothing or raised.
pubchem_no_match = set()
# Names for which the CACTUS lookup returned nothing or raised.
cactus_no_match = set()
# Names for which neither service produced a SMILES.
no_matches = set()

def get_smiles_pubchem(name):
    """Return the isomeric SMILES of the first PubChem hit for ``name``.

    Args:
        name: Chemical name, searched in PubChem's 'name' namespace.

    Returns:
        The ``isomeric_smiles`` attribute of the first compound returned by
        ``pcp.get_compounds``, or ``None`` when the search returns no
        compounds.
    """
    compounds = pcp.get_compounds(name, 'name')
    return compounds[0].isomeric_smiles if compounds else None

def get_smiles_cactus(structure_identifier):
    """Resolve a structure identifier to SMILES via the NCI CACTUS resolver.

    Args:
        structure_identifier: Chemical name (or other identifier) to resolve.

    Returns:
        The response body with surrounding whitespace stripped when the
        service answers HTTP 200 with a non-empty body, otherwise ``None``.

    Raises:
        requests.RequestException: On connection problems or when the request
            exceeds the timeout.
    """
    # Percent-encode the identifier: an unescaped '#' or '?' would otherwise be
    # parsed as the start of a URL fragment/query and silently truncate the
    # name that is actually sent. '/' is left as-is (quote's default).
    encoded = quote(str(structure_identifier))
    url = f"https://cactus.nci.nih.gov/chemical/structure/{encoded}/smiles"

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        return None
    return response.text.strip() or None
            
def get_smiles_chemspider(name):
    """Look up the SMILES of a compound name through the RSC ChemSpider API.

    The lookup takes three requests: submit a name filter, fetch the record
    ids the filter produced, then fetch the SMILES of the first record.

    The API key is read from the ``CHEMSPIDER_API_KEY`` environment variable
    (empty string when unset) rather than being written into the notebook.

    Args:
        name: Chemical name to search for.

    Returns:
        The SMILES string of the first matching record, or ``None`` when
        there is no match or any step fails. Failures are printed, not raised.
    """
    api_key = os.environ.get('CHEMSPIDER_API_KEY', '')

    base_url = 'https://api.rsc.org/compounds/v1'
    headers = {'apikey': api_key}
    # Applied to every request so a stalled connection cannot hang the notebook.
    timeout = 30

    try:
        response = requests.post(
            f'{base_url}/filter/name/',
            headers=headers,
            json={'name': name},
            timeout=timeout,
        )
        response.raise_for_status()
        query_id = response.json()['queryId']

        # Give the server time to load the query
        time.sleep(1)

        response = requests.get(
            f'{base_url}/filter/{query_id}/results',
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        record_ids = response.json()['results']

        # An empty result list is an ordinary "no match", not an error.
        if not record_ids:
            print(f"chemspider: no match for {name}")
            return None

        response = requests.get(
            f'{base_url}/records/{record_ids[0]}/details?fields=SMILES',
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        smiles = response.json()['smiles']

        print(smiles)
        return smiles

    except Exception as e:
        # Best-effort lookup: report the failure and let the caller continue.
        print(f'Error: {e}')
        print(f"chemspider: no match for {name}")
        return None

def convert_smiles_to_structure(smiles):
    """Draw the molecule described by ``smiles`` and report whether it parsed.

    A SMILES string that RDKit parses is rendered to an image, saved as
    ``molecule.png`` in the working directory (overwriting any earlier
    drawing), shown in the notebook, and reported as valid. Otherwise only
    the "invalid" message is printed.

    Args:
        smiles: SMILES string to parse and draw.

    Returns:
        None.
    """
    molecule = Chem.MolFromSmiles(smiles)

    # Guard clause: nothing to draw when parsing failed.
    if molecule is None:
        print("Invalid SMILES notation.")
        return

    image_path = "molecule.png"
    Draw.MolToImage(molecule).save(image_path)
    display(Image(filename=image_path))
    print("SMILES notation is valid.")

# Compiled once instead of on every call. Written as a raw string so the regex
# escapes (\[, \(, \., ...) are not parsed as Python string escapes, which
# raise SyntaxWarning on Python 3.12+.
_SMILES_TOKEN_RE = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def smiles_tokenizer(smiles):
    """
    Tokenize a SMILES molecule or reaction

    Args:
        smiles: SMILES string to split (bracket atoms, Br/Cl, single-letter
            atoms, bonds, ring closures, branches, '.' and '>' separators).

    Returns:
        str: The tokens joined by single spaces; "" for an empty input.

    Raises:
        AssertionError: If the tokens do not reassemble into the input, i.e.
            the string contains characters the pattern does not recognise.
    """
    tokens = _SMILES_TOKEN_RE.findall(smiles)
    # Raised explicitly (same exception type as the former `assert`) so the
    # round-trip check still runs when Python is started with -O.
    if smiles != ''.join(tokens):
        raise AssertionError(f"SMILES contains untokenizable characters: {smiles!r}")
    return ' '.join(tokens)

def _lookup_and_record(mol, source, fetch, matches, no_match):
    """Query one SMILES source for ``mol`` and record the outcome.

    Args:
        mol: Chemical name to look up.
        source: Label used in the printed messages ("pubchem" / "cactus").
        fetch: Callable taking the name and returning a SMILES or a falsy value.
        matches: Set that receives ``(mol, smiles)`` on success.
        no_match: Set that receives ``mol`` on a miss or an error.

    Returns:
        The SMILES on success, otherwise ``None``.
    """
    try:
        smile = fetch(mol)
        if smile:
            matches.add((mol, smile))
            return smile
        no_match.add(mol)
        print(f"{source}: no match for {mol}")
    except Exception:
        # `Exception`, not a bare `except:`, so Ctrl-C (KeyboardInterrupt) and
        # SystemExit still stop a long-running lookup loop instead of being
        # recorded as a failed lookup.
        no_match.add(mol)
        print(f"{source}: an error occurred")
    return None


def get_smiles(mols, tried=False):
    """Look up SMILES for each name in ``mols`` on PubChem and CACTUS.

    Results are stored in the module-level sets (``pubchem_matches``,
    ``cactus_matches``, ``pubchem_no_match``, ``cactus_no_match``,
    ``no_matches``), not returned. Each name is queried at most once: names
    already in ``queried`` and the placeholder '?' are skipped.

    Args:
        mols: Iterable of chemical names.
        tried: Unused; kept so existing callers keep working.

    Returns:
        None.
    """
    for mol in mols:
        if mol == '?' or mol in queried:
            continue

        pubchem_smile = _lookup_and_record(
            mol, "pubchem", get_smiles_pubchem, pubchem_matches, pubchem_no_match
        )
        cactus_smile = _lookup_and_record(
            mol, "cactus", get_smiles_cactus, cactus_matches, cactus_no_match
        )

        if not pubchem_smile and not cactus_smile:
            no_matches.add(mol)

        # Marked only after both lookups ran, so an interrupted name is
        # retried on the next call.
        queried.add(mol)
            

def process_chemical_column_brenda(column_str):
    """Translate the chemical names of one reaction side into SMILES strings.

    Each name is first passed to ``get_smiles`` (which records any hit in the
    module-level match sets), then looked up in ``pubchem_matches`` and, if
    absent there, in ``cactus_matches``.

    Args:
        column_str: Iterable of chemical names (one split cell of the frame).

    Returns:
        list: The SMILES found, in input order. Names with no SMILES (or a
        falsy one) are dropped, so the result can be shorter than the input.
    """
    not_found = object()  # separates "no entry" from a stored falsy value
    collected = []

    for chem in column_str:
        get_smiles([chem])

        # PubChem takes precedence; CACTUS is the fallback.
        hit = next(
            (found for name, found in pubchem_matches if name == chem),
            not_found,
        )
        if hit is not_found:
            hit = next(
                (found for name, found in cactus_matches if name == chem),
                None,
            )

        if hit:
            collected.append(hit)

    return collected

In [None]:
def process_reaction_dataframes(brenda_df):
    """Return a copy of ``brenda_df`` with SMILES columns for both reaction sides.

    Args:
        brenda_df: DataFrame with ``substrates_split`` and ``products_split``
            columns whose cells are lists of chemical names.

    Returns:
        A new DataFrame with ``substrates_converted_smiles`` and
        ``products_converted_smiles`` added; each cell holds the list returned
        by ``process_chemical_column_brenda``. The input frame is not modified.
    """
    converted = brenda_df.copy()

    # (source column, destination column), substrates before products.
    column_pairs = (
        ('substrates_split', 'substrates_converted_smiles'),
        ('products_split', 'products_converted_smiles'),
    )
    for source_col, target_col in column_pairs:
        converted[target_col] = converted[source_col].apply(process_chemical_column_brenda)

    return converted

# Resolve every chemical name in `df` to SMILES. get_smiles() issues PubChem
# and CACTUS requests for each name not yet in `queried`, so the first run of
# this cell is network-bound.
df_processed = process_reaction_dataframes(df)

In [None]:
# Write the frame with the SMILES columns to disk, without the index column.
df_processed.to_csv('brenda_smiles.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2570873b-d089-4ed7-b836-6c2df496af15' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>