In [17]:
import pandas as pd
import numpy as np
import time
import re
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import PandasTools
from sklearn.linear_model import LogisticRegression
import pubchempy as pcp

In [18]:
# Read data downloaded from TransportDB - all transporters for E. coli 536
df = pd.read_csv('data/ecoli_ed1a.csv')
df

Unnamed: 0,Protein-Name,Substrate,Subtype,Family,Family Name,Transporter Class,TC number
0,ECED1_0811,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
1,ECED1_4376,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
2,ECED1_1546,aminobenzoyl-glutamate,,AbgT,The p-Aminobenzoyl-glutamate Transporter (AbgT...,Secondary Transporter,2.A.68
3,ECED1_2819,Unclassified,,AEC,The Auxin Efflux Carrier (AEC) Family,Secondary Transporter,2.A.69
4,ECED1_0006,sodium ion:alanine symporter,,AGCS,The Alanine or Glycine:Cation Symporter (AGCS)...,Secondary Transporter,2.A.25
...,...,...,...,...,...,...,...
639,ECED1_4980,ascorbate,EnzymeIIC,SSPTS,Sugar Specific PTS,Phosphotransferase System (PTS),4.A
640,ECED1_0564,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
641,ECED1_0760,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
642,ECED1_0264,Unclassified,,OOP,The OmpA-OmpF Porin (OOP) Family,Outer Membrane Porins,1.B.6


In [19]:
substrates = df['Substrate'].tolist()
# Remove duplicates
substrates = list(set(substrates))
# Remove empty strings
substrates = [s for s in substrates if s]
substrates

['copper ion',
 'ribose',
 'TMAO inducer',
 'glycine betaine/L-proline',
 'protein export (SecDF)',
 'polysaccharide export',
 'Acetyl-CoA:CoA antiporter',
 '? (Fe-S assembly/SufBCD system)',
 'potassium/sodium ion:proton antiporter',
 'L-lactate',
 'D-galactose/galactoside',
 'cation efflux',
 'fructose',
 'multidrug efflux',
 'virulence factor MviN',
 'lipoprotein',
 'mannose/fructose',
 'Unclassified',
 'leucine/valine',
 'C4-dicarboxylate',
 'heme export',
 'Vitamin B12',
 'daunorubicin',
 'oligopeptide',
 'glycerol-3-phosphate',
 'amino acid efflux',
 'sodium ion:proline symporter',
 'aromatic amino acid',
 'tellurite',
 'multidrug efflux (SMR)',
 'branched-chain amino acid efflux (AzlC)',
 'zinc ion',
 'branched-chain amino acid',
 'sodium ion:serine/threonine symporter',
 'glycine betaine/L-proline/carnitine/choline',
 'sodium ion:alanine symporter',
 'galactitol',
 'multidrug/quaternary ammonium compound efflux (SMR subfamily)',
 'GABA',
 'nitrate/sulfonate/taurine',
 'glucose/

In [20]:
# process names
substrates = [s.split('/') for s in substrates]
substrates = [item for sublist in substrates for item in sublist]
print(substrates)

# Define function to retrieve SMILES
def get_smiles(chemical_name):
    name = re.sub("[^A-Za-z0-9 ]+", " ", chemical_name)
    print(name)
    try:
        compound = pcp.get_compounds(name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        return str(e)


['copper ion', 'ribose', 'TMAO inducer', 'glycine betaine', 'L-proline', 'protein export (SecDF)', 'polysaccharide export', 'Acetyl-CoA:CoA antiporter', '? (Fe-S assembly', 'SufBCD system)', 'potassium', 'sodium ion:proton antiporter', 'L-lactate', 'D-galactose', 'galactoside', 'cation efflux', 'fructose', 'multidrug efflux', 'virulence factor MviN', 'lipoprotein', 'mannose', 'fructose', 'Unclassified', 'leucine', 'valine', 'C4-dicarboxylate', 'heme export', 'Vitamin B12', 'daunorubicin', 'oligopeptide', 'glycerol-3-phosphate', 'amino acid efflux', 'sodium ion:proline symporter', 'aromatic amino acid', 'tellurite', 'multidrug efflux (SMR)', 'branched-chain amino acid efflux (AzlC)', 'zinc ion', 'branched-chain amino acid', 'sodium ion:serine', 'threonine symporter', 'glycine betaine', 'L-proline', 'carnitine', 'choline', 'sodium ion:alanine symporter', 'galactitol', 'multidrug', 'quaternary ammonium compound efflux (SMR subfamily)', 'GABA', 'nitrate', 'sulfonate', 'taurine', 'glucose',

In [21]:
print(get_smiles(substrates[1]))

ribose
C1C(C(C(C(O1)O)O)O)O


In [22]:
# Query PubChem and collect SMILES
results = []
for substrate in substrates:
    smiles = get_smiles(substrate)
    results.append({
        'Substrate': substrate,
        'SMILES': smiles
    })
    time.sleep(0.2)  # Add delay to be polite to PubChem servers

smiles_df = pd.DataFrame(results)


copper ion
ribose
TMAO inducer
glycine betaine
L proline
protein export  SecDF 
polysaccharide export
Acetyl CoA CoA antiporter
   Fe S assembly
SufBCD system 
potassium
sodium ion proton antiporter
L lactate
D galactose
galactoside
cation efflux
fructose
multidrug efflux
virulence factor MviN
lipoprotein
mannose
fructose
Unclassified
leucine
valine
C4 dicarboxylate
heme export
Vitamin B12
daunorubicin
oligopeptide
glycerol 3 phosphate
amino acid efflux
sodium ion proline symporter
aromatic amino acid
tellurite
multidrug efflux  SMR 
branched chain amino acid efflux  AzlC 
zinc ion
branched chain amino acid
sodium ion serine
threonine symporter
glycine betaine
L proline
carnitine
choline
sodium ion alanine symporter
galactitol
multidrug
quaternary ammonium compound efflux  SMR subfamily 
GABA
nitrate
sulfonate
taurine
glucose
maltose
N acetylglucosamine
lipid A
amino acid  glutamine
glutamate
aspartate 
Autoinducer 2 export
mobybdenate
2 keto 3 deoxygluconate
formate
nitrite
multidrug 

In [23]:
smiles_df.to_csv('data/ecoli_ed1a_smiles.csv', index=False)
