In [1]:
import pandas as pd
import time
import re
from rdkit import Chem
from rdkit.Chem import DataStructs, rdFingerprintGenerator
import pubchempy as pcp

In [2]:
# Read data downloaded from TransportDB - all transporters for E. coli 536
df = pd.read_csv('data/ecoli_ed1a.csv')
df

Unnamed: 0,Protein-Name,Substrate,Subtype,Family,Family Name,Transporter Class,TC number
0,ECED1_0811,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
1,ECED1_4376,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
2,ECED1_1546,aminobenzoyl-glutamate,,AbgT,The p-Aminobenzoyl-glutamate Transporter (AbgT...,Secondary Transporter,2.A.68
3,ECED1_2819,Unclassified,,AEC,The Auxin Efflux Carrier (AEC) Family,Secondary Transporter,2.A.69
4,ECED1_0006,sodium ion:alanine symporter,,AGCS,The Alanine or Glycine:Cation Symporter (AGCS)...,Secondary Transporter,2.A.25
...,...,...,...,...,...,...,...
639,ECED1_4980,ascorbate,EnzymeIIC,SSPTS,Sugar Specific PTS,Phosphotransferase System (PTS),4.A
640,ECED1_0564,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
641,ECED1_0760,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
642,ECED1_0264,Unclassified,,OOP,The OmpA-OmpF Porin (OOP) Family,Outer Membrane Porins,1.B.6


In [3]:
substrates = df['Substrate'].tolist()
# Remove duplicates
substrates = list(set(substrates))
# Remove empty strings
substrates = [s for s in substrates if s]
substrates

['thiamin',
 'large-conductance mechanosensitive ion channel',
 'chloride ion channel',
 'protein export (SecDF)',
 'dipeptide/oligopeptide',
 'copper ion',
 'sulfate/thiosulfate',
 'cell division',
 'proton/sodium ion:glutamate/aspartate symporter',
 'cytosine/purines/uracil/thiamine/allantoin',
 'heme',
 'Acetyl-CoA:CoA antiporter',
 'lipoprotein',
 'potassium/sodium ion:proton antiporter',
 'toluene tolerance',
 'sugar efflux?',
 'fructose',
 'multidrug',
 'galactitol',
 'sodium ion/?',
 'ribose',
 'protein export',
 'glucose',
 'polysaccharide export',
 'nitrate/sulfonate/taurine',
 'Autoinducer-2 export',
 'glutamate:GABA antiporter',
 'Vitamin B12',
 'L-lactate',
 'potassium ion',
 'magnesium ion',
 'amino acid',
 'chloramphenicol (RarD homolog)',
 '60 KD inner membrane protein OxaA homolog',
 'proton:dipeptide/tripeptide symporter',
 'sugar (maltose?)',
 'ascorbate',
 'metabolite (benzoate?)',
 'manganese/zinc ion',
 'potassium ion uptake',
 'lipoprotein releasing',
 'serine',
 

In [4]:
# process names
substrates = [s.split('/') for s in substrates]
substrates = [item for sublist in substrates for item in sublist]
print(substrates)

# Define function to retrieve SMILES
def get_smiles(chemical_name):
    name = re.sub("[^A-Za-z0-9 ]+", " ", chemical_name)
    print(name)
    try:
        compound = pcp.get_compounds(name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        return str(e)


['thiamin', 'large-conductance mechanosensitive ion channel', 'chloride ion channel', 'protein export (SecDF)', 'dipeptide', 'oligopeptide', 'copper ion', 'sulfate', 'thiosulfate', 'cell division', 'proton', 'sodium ion:glutamate', 'aspartate symporter', 'cytosine', 'purines', 'uracil', 'thiamine', 'allantoin', 'heme', 'Acetyl-CoA:CoA antiporter', 'lipoprotein', 'potassium', 'sodium ion:proton antiporter', 'toluene tolerance', 'sugar efflux?', 'fructose', 'multidrug', 'galactitol', 'sodium ion', '?', 'ribose', 'protein export', 'glucose', 'polysaccharide export', 'nitrate', 'sulfonate', 'taurine', 'Autoinducer-2 export', 'glutamate:GABA antiporter', 'Vitamin B12', 'L-lactate', 'potassium ion', 'magnesium ion', 'amino acid', 'chloramphenicol (RarD homolog)', '60 KD inner membrane protein OxaA homolog', 'proton:dipeptide', 'tripeptide symporter', 'sugar (maltose?)', 'ascorbate', 'metabolite (benzoate?)', 'manganese', 'zinc ion', 'potassium ion uptake', 'lipoprotein releasing', 'serine', 

In [5]:
print(get_smiles(substrates[1]))

large conductance mechanosensitive ion channel
None


In [6]:
# Query PubChem and collect SMILES
results = []
for substrate in substrates:
    smiles = get_smiles(substrate)
    results.append({
        'Substrate': substrate,
        'SMILES': smiles
    })
    time.sleep(0.2)  # Add delay to be polite to PubChem servers

smiles_df = pd.DataFrame(results)


thiamin
large conductance mechanosensitive ion channel
chloride ion channel
protein export  SecDF 
dipeptide
oligopeptide
copper ion
sulfate
thiosulfate
cell division
proton
sodium ion glutamate
aspartate symporter
cytosine
purines
uracil
thiamine
allantoin
heme
Acetyl CoA CoA antiporter
lipoprotein
potassium
sodium ion proton antiporter
toluene tolerance
sugar efflux 
fructose
multidrug
galactitol
sodium ion
 
ribose
protein export
glucose
polysaccharide export
nitrate
sulfonate
taurine
Autoinducer 2 export
glutamate GABA antiporter
Vitamin B12
L lactate
potassium ion
magnesium ion
amino acid
chloramphenicol  RarD homolog 
60 KD inner membrane protein OxaA homolog
proton dipeptide
tripeptide symporter
sugar  maltose 
ascorbate
metabolite  benzoate 
manganese
zinc ion
potassium ion uptake
lipoprotein releasing
serine
potassium ion channel
thiamine
sodium ion glutamate symporter
antimicrobial peptide uptake
sodium ion panthothenate symporter
amino acid  glutamine
glutamate
aspartate 
ni

In [7]:
pfas_smiles = [
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)F", # Example long-chain PFAS
    "FC(F)(F)C(F)(F)C(F)(F)F"          # Example short-chain PFAS
]
pfas_mols = [Chem.MolFromSmiles(smile) for smile in pfas_smiles]
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
# Generate fingerprints for PFAS
pfas_fps = [mfpgen.GetFingerprint(mol) for mol in pfas_mols]

# filter none values from smiles_df
smiles_df = smiles_df[smiles_df['SMILES'].notna()]
substrate_fps = []
for substrate in smiles_df['SMILES']:
    mol = Chem.MolFromSmiles(substrate)
    if mol:
        fp = mfpgen.GetFingerprint(mol)
        substrate_fps.append(fp)
    else:
        substrate_fps.append(None)
# Remove None values
substrate_fps = [fp for fp in substrate_fps if fp is not None]
# Create a DataFrame for fingerprints
fingerprint_df = pd.DataFrame({
    'Substrate': smiles_df['Substrate'],
    'SMILES': smiles_df['SMILES'],
    'Fingerprint': substrate_fps
})



In [8]:
# calculate similarity
def calculate_similarity(fp1, fp2):
    return DataStructs.TanimotoSimilarity(fp1, fp2)
# Calculate similarity for each substrate against PFAS
similarity_results = []
for substrate_fp in substrate_fps:
    simsum = 0
    for pfas_fp in pfas_fps:
        simsum += calculate_similarity(substrate_fp, pfas_fp)
    similarity_results.append(simsum / len(pfas_fps))

# Create a DataFrame for similarity results
similarity_df = pd.DataFrame({
    'Substrate': smiles_df['Substrate'],
    'Similarity': similarity_results
})
# Merge the similarity results with the original DataFrame
merged_df = pd.merge(smiles_df, similarity_df, on='Substrate', how='left')
merged_df = merged_df.dropna()
print(merged_df[merged_df['Similarity'] > 0.1])

Empty DataFrame
Columns: [Substrate, SMILES, Similarity]
Index: []


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Prepare features (X) and labels (y)
# Fix: Set top 5 substrates as 'likely' importers to ensure at least two classes
merged_df = merged_df.sort_values('Similarity', ascending=False)
merged_df['Likely_Importer'] = 0
merged_df.loc[merged_df.head(5).index, 'Likely_Importer'] = 1

# Align y with fingerprint_df by matching Substrate (ensure same length as X)
substrate_to_label = merged_df.set_index('Substrate')['Likely_Importer'].to_dict()
y = fingerprint_df['Substrate'].map(substrate_to_label).fillna(0).astype(int).values

# Use fingerprints as features (convert ExplicitBitVect to numpy array)
X = np.array([np.array(list(fp)) for fp in fingerprint_df['Fingerprint']])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[19  0]
 [ 1  0]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        19
           1       0.00      0.00      0.00         1

    accuracy                           0.95        20
   macro avg       0.47      0.50      0.49        20
weighted avg       0.90      0.95      0.93        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
