In [1]:
import pandas as pd
import time
import re
from rdkit import Chem
from rdkit.Chem import DataStructs, rdFingerprintGenerator
import pubchempy as pcp
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [2]:
# Read data downloaded from TransportDB - all transporters for E. coli 536
df = pd.read_csv('data/ecoli_ed1a.csv')
df

Unnamed: 0,Protein-Name,Substrate,Subtype,Family,Family Name,Transporter Class,TC number
0,ECED1_0811,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
1,ECED1_4376,aspartate:alanine antiporter,,AAE,The Aspartate:Alanine Exchanger (AAE) Family,Secondary Transporter,2.A.81
2,ECED1_1546,aminobenzoyl-glutamate,,AbgT,The p-Aminobenzoyl-glutamate Transporter (AbgT...,Secondary Transporter,2.A.68
3,ECED1_2819,Unclassified,,AEC,The Auxin Efflux Carrier (AEC) Family,Secondary Transporter,2.A.69
4,ECED1_0006,sodium ion:alanine symporter,,AGCS,The Alanine or Glycine:Cation Symporter (AGCS)...,Secondary Transporter,2.A.25
...,...,...,...,...,...,...,...
639,ECED1_4980,ascorbate,EnzymeIIC,SSPTS,Sugar Specific PTS,Phosphotransferase System (PTS),4.A
640,ECED1_0564,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
641,ECED1_0760,Unclassified,,OMF,The Outer Membrane Factor (OMF) Family,Outer Membrane Porins,1.B.17
642,ECED1_0264,Unclassified,,OOP,The OmpA-OmpF Porin (OOP) Family,Outer Membrane Porins,1.B.6


In [17]:
substrates = df['Substrate'].tolist()
# Remove duplicates
substrates = list(set(substrates))
# Remove empty strings
substrates = [s for s in substrates if s]
substrates

['potassium/sodium ion:proton antiporter',
 'mannitol/fructose',
 'leucine/valine',
 '? (Fe-S assembly/SufBCD system)',
 'large-conductance mechanosensitive ion channel',
 'zinc/cadmium/cobalt ion',
 'manganese/iron ion',
 'tricarboxylate (TctA)',
 'glycerol uptake',
 'rhamnose',
 'multidrug/quaternary ammonium compound efflux (SMR subfamily)',
 'D-galactose/galactoside',
 'amino acid (lysine/arginine/ornithine/histidine/octopine)',
 'proton:dipeptide/tripeptide symporter',
 'sodium ion:glutamate symporter',
 'cobalt ion',
 'aminobenzoyl-glutamate',
 'D-galactonate',
 'sulfate/thiosulfate',
 'cell division',
 'ferrous ion',
 'heavy metal ion',
 'glutamate:GABA antiporter',
 '60 KD inner membrane protein OxaA homolog',
 'cobalt',
 'cyanate',
 'manganese/zinc ion',
 'pyoverdin (siderophore) exporter PvdE',
 'amino acid',
 'branched-chain amino acid',
 'thiamin',
 'aromatic amino acid',
 'sulfate',
 'protein export',
 'metabolite (alpha-ketoglutarate?)',
 'xanthine/uracil',
 'protein expo

In [18]:
# process names
substrates = [s.split('/') for s in substrates]
substrates = [item for sublist in substrates for item in sublist]
print(substrates)

# Define function to retrieve SMILES
def get_smiles(chemical_name):
    name = re.sub("[^A-Za-z0-9 ]+", " ", chemical_name)
    print(name)
    try:
        compound = pcp.get_compounds(name, 'name')
        if compound:
            return compound[0].canonical_smiles
    except Exception as e:
        return str(e)


['potassium', 'sodium ion:proton antiporter', 'mannitol', 'fructose', 'leucine', 'valine', '? (Fe-S assembly', 'SufBCD system)', 'large-conductance mechanosensitive ion channel', 'zinc', 'cadmium', 'cobalt ion', 'manganese', 'iron ion', 'tricarboxylate (TctA)', 'glycerol uptake', 'rhamnose', 'multidrug', 'quaternary ammonium compound efflux (SMR subfamily)', 'D-galactose', 'galactoside', 'amino acid (lysine', 'arginine', 'ornithine', 'histidine', 'octopine)', 'proton:dipeptide', 'tripeptide symporter', 'sodium ion:glutamate symporter', 'cobalt ion', 'aminobenzoyl-glutamate', 'D-galactonate', 'sulfate', 'thiosulfate', 'cell division', 'ferrous ion', 'heavy metal ion', 'glutamate:GABA antiporter', '60 KD inner membrane protein OxaA homolog', 'cobalt', 'cyanate', 'manganese', 'zinc ion', 'pyoverdin (siderophore) exporter PvdE', 'amino acid', 'branched-chain amino acid', 'thiamin', 'aromatic amino acid', 'sulfate', 'protein export', 'metabolite (alpha-ketoglutarate?)', 'xanthine', 'uracil'

In [19]:
print(get_smiles(substrates[1]))

sodium ion proton antiporter
None


In [6]:
# Query PubChem and collect SMILES
# results = []
# for substrate in substrates:
#     smiles = get_smiles(substrate)
#     results.append({
#         'Substrate': substrate,
#         'SMILES': smiles
#     })
#     time.sleep(0.2)  # Add delay to be polite to PubChem servers

smiles_df = pd.read_csv('data/ecoli_ed1a_smiles.csv')


In [7]:
from rdkit.Chem import MACCSkeys

pfas_smiles = [
    "FC(F)(F)C(F)(F)C(F)(F)C(F)(F)F", # Example long-chain PFAS
    "FC(F)(F)C(F)(F)C(F)(F)F"          # Example short-chain PFAS
]
pfas_mols = [Chem.MolFromSmiles(smile) for smile in pfas_smiles]
def get_fingerprints(mols, method):
    if method == 'Morgan3':
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=2048)
        return [generator.GetFingerprint(m) for m in mols]
    elif method == 'MACCS':
        return [MACCSkeys.GenMACCSKeys(m) for m in mols]
    elif method == 'RDKit':
        return [Chem.RDKFingerprint(m) for m in mols]
    elif method == 'Pattern':
        return [Chem.PatternFingerprint(m) for m in mols]
    else:
        raise ValueError(f"Unknown fingerprint method: {method}")

fingerprint_methods = ['Morgan3', 'MACCS', 'RDKit', 'Pattern']


In [None]:
# simple identification of likely importers using various fingerprints and classifiers

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

mol_df = smiles_df.copy()
# Remove rows where SMILES is None or NaN before converting to Mol
mol_df = mol_df[mol_df['SMILES'].notna()]
mol_df['Mol'] = mol_df['SMILES'].apply(Chem.MolFromSmiles)
mol_df = mol_df[mol_df['Mol'].notna()]

for method in fingerprint_methods:
    print(f"\nFingerprint: {method}")

    substrate_fps = get_fingerprints(mol_df['Mol'], method)
    pfas_fps = get_fingerprints(pfas_mols, method)

    # Compute average Tanimoto similarity to PFAS set
    similarity_results = []
    for fp in substrate_fps:
        simsum = sum(DataStructs.TanimotoSimilarity(fp, p) for p in pfas_fps)
        similarity_results.append(simsum / len(pfas_fps))
    
    mol_df[f'Similarity_{method}'] = similarity_results

    # Label top 5 as likely importers
    mol_df = mol_df.sort_values(f'Similarity_{method}', ascending=False)
    mol_df[f'Likely_Importer_{method}'] = 0
    mol_df.loc[mol_df.head(5).index, f'Likely_Importer_{method}'] = 1

    # Create X, y
    X = np.array([np.array(list(fp)) for fp in substrate_fps])
    y = mol_df[f'Likely_Importer_{method}'].values

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Logistic regression
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Report
    print(similarity_results)
    print(classification_report(y_test, y_pred, zero_division=0))
    print(mol_df.head(5)[['Substrate', f'Similarity_{method}']])

    # Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    print("Random Forest Results:")
    print(classification_report(y_test, rf_pred, zero_division=0))
    print(mol_df.head(5)[['Substrate', f'Similarity_{method}']])

    # SVM
    svm = SVC(kernel='linear', probability=True, random_state=42)
    svm.fit(X_train, y_train)
    svm_pred = svm.predict(X_test)
    print("SVM Results:")
    print(classification_report(y_test, svm_pred, zero_division=0))
    print(mol_df.head(5)[['Substrate', f'Similarity_{method}']])

    print('\n ' + '='*50 + '\n')




Fingerprint: Morgan3
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008032258064516129, 0.008438968807862128, 0.0, 0.0, 0.0, 0.006329113924050633, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.005633847521107091, 0.00510204081632653, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.024099883855981417, 0.019803921568627453, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.016530054644808743, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03279569892473118, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.013888888888888888, 0.0, 0.0, 0.011111111111111112, 0.0, 0.0, 0.008670520231213872, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.003105590062111801, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        25
         

In [None]:
# comparative approach; similarity to known pfas binding proteins 

 