Skip to content

domgurvic/efflux_evaders_and_substrates

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

20 Commits
 
 
 
 
 
 
 
 

Repository files navigation

E. coli efflux evaders and substrates - chemical space

# load conda environment

from master_functions import master_functions

# data process
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

#chem

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, Descriptors, Descriptors3D, Draw, rdMolDescriptors, Draw, PandasTools, rdFingerprintGenerator
from rdkit.DataManip.Metric.rdMetricMatrixCalc import GetTanimotoSimMat, GetTanimotoDistMat
# from rdkit.Chem.Draw import IPythonConsole
# import curated datasets

efflux_evaders_om_corrected = pd.read_pickle('data_curated/efflux_evaders_om_corrected.pkl')
efflux_substrates_om_corrected = pd.read_pickle('data_curated/efflux_substrates_om_corrected.pkl')
inactive = pd.read_pickle('data_curated/new_inactive.pkl') # this file is too big to upload to github, you can get your inactives from the inhibition file

Initial set up

Importing master dataset

# import master inhibition data
inhibition = pd.read_csv('data/CO-ADD_InhibitionData_r03_01-02-2020_CSV.csv', low_memory=False)
# this dataset can be downloaded from: "https://www.co-add.org/"
# check strains available for organism == E. coli
inhibition[inhibition['ORGANISM'] == 'Escherichia coli'].STRAIN.value_counts()
ATCC 25922      82517
lpxC; MB4902    81058
tolC; MB5747    74177
Name: STRAIN, dtype: int64
# one compound has outlying values of -213.7, -278.75 and -329.47 for WT, tolC and lpxC respectively; it skews the data, so I will drop it.

inhibition = inhibition[inhibition.SMILES != 'S(O)(=O)(=O)c1ccccc1\\C(\\c(cc(C)c(c2Br)O)c2)=C(\\C=C3C)/C=C(C3=O)Br']
# Define per-strain subsets.
# Each subset keeps only the E. coli rows of one strain, retains the SMILES and
# INHIB_AVE columns, and averages INHIB_AVE over rows sharing the same SMILES
# so that every SMILES string appears once per strain.

e_coli_wild = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='ATCC 25922')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()

e_coli_efflux = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='tolC; MB5747')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()

e_coli_pore = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='lpxC; MB4902')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()
# Collect overlapping data: merge on SMILES (pandas' default inner join), so
# only compounds measured in both strains are kept. The suffixes distinguish
# the two INHIB_AVE columns in each merged frame.

e_coli_wild_efflux = e_coli_wild[['SMILES', 'INHIB_AVE']].merge(e_coli_efflux[['SMILES', 'INHIB_AVE']],  on='SMILES', suffixes=('_wild', '_efflux'))
e_coli_wild_perm = e_coli_wild[['SMILES', 'INHIB_AVE']].merge(e_coli_pore[['SMILES', 'INHIB_AVE']], on='SMILES', suffixes=('_wild', '_lpxC'))

Plotting WT vs tolC

# e_coli_wild_efflux[['INHIB_AVE_wild', 'INHIB_AVE_efflux']].plot.hist(bins=200, alpha=0.5, figsize=[10,7])


# Overlaid histograms of growth inhibition in the tolC and wild-type strains
# for the compounds present in both datasets.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)

fig, ax = plt.subplots(figsize=(7,7))

# The theme is set again (larger font scale) after the figure is created;
# kept as-is so the rendered figure does not change.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.9, rc=None)

sns.histplot(e_coli_wild_efflux[['INHIB_AVE_efflux', 'INHIB_AVE_wild']], alpha=0.5, bins=150)

# Raw string: '\D' is not a valid Python escape sequence, so the non-raw
# literal triggers a warning on current interpreters. The resulting text
# (mathtext for a capital delta) is unchanged.
plt.legend(labels=['Wild Type', r'$\Delta TolC$'], fontsize=15)

plt.xlim([-120, 120])

plt.xlabel('Growth Inhibition based on $OD_{600}$ (%)', fontsize=22)
plt.ylabel('Number of Compounds', fontsize=22)

plt.yticks(fontsize=20)
plt.xticks(fontsize=20)

plt.tight_layout()
sns.despine()

png

Paired t-test

# we can now compute a paired t-test to see whether removing TolC made a significant difference:

stats.ttest_rel(e_coli_wild_efflux['INHIB_AVE_wild'], e_coli_wild_efflux['INHIB_AVE_efflux'])
Ttest_relResult(statistic=-44.099887587864416, pvalue=0.0)

Defining evaders and substrates

# calculate z-score:
e_coli_wild_efflux['wild_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_wild)
e_coli_wild_efflux['tolc_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_efflux)
# label each compound as active/inactive using a z-score threshold of 4

threshold = 4


def label_it(row):
    """Label a row from its wild-type z-score.

    Returns 'active' when ``row['wild_stds']`` is at or above the module-level
    ``threshold``, 'inactive' when it is below, and ``None`` when neither
    comparison holds (e.g. the value is NaN).
    """
    z_score = row['wild_stds']
    if z_score < threshold:
        return 'inactive'
    return 'active' if z_score >= threshold else None
    
e_coli_wild_efflux['wild_class'] = e_coli_wild_efflux.apply(label_it, axis=1)

def label_it_tolc(row):
    """Label a row from its tolC z-score.

    Returns 'active' when ``row['tolc_stds']`` is at or above the module-level
    ``threshold``, 'inactive' when it is below, and ``None`` when neither
    comparison holds (e.g. the value is NaN).
    """
    z_score = row['tolc_stds']
    if z_score < threshold:
        return 'inactive'
    return 'active' if z_score >= threshold else None
    
    
e_coli_wild_efflux['tolc_class'] = e_coli_wild_efflux.apply(label_it_tolc, axis=1)
# label compounds based on combination of activity defined above

def label_substrate(row):
    """Combine the per-strain activity labels into a single class name.

    Reads ``row['tolc_class']`` and ``row['wild_class']``:

    * tolC active,   WT inactive -> 'Efflux Substrate'
    * tolC active,   WT active   -> 'Efflux Evader'
    * tolC inactive, WT inactive -> 'Inactive'
    * tolC inactive, WT active   -> 'WT-only Active'

    Any other combination yields ``None``.
    """
    tolc_label = row['tolc_class']

    if tolc_label == 'active':
        wild_label = row['wild_class']
        if wild_label == 'inactive':
            return 'Efflux Substrate'
        if wild_label == 'active':
            return 'Efflux Evader'
    elif tolc_label == 'inactive':
        wild_label = row['wild_class']
        if wild_label == 'inactive':
            return 'Inactive'
        if wild_label == 'active':
            return 'WT-only Active'

    return None

Resulting Number of classes

# check the numbers of classified data

e_coli_wild_efflux['Class'] = e_coli_wild_efflux.apply(label_substrate, axis=1)
e_coli_wild_efflux.Class.value_counts()
Inactive            72730
Efflux Substrate      760
Efflux Evader         200
WT-only Active         53
Name: Class, dtype: int64

Scatter Plot

# Scatter plot of WT vs tolC growth inhibition, coloured by assigned class.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(7,7))

sns.scatterplot(data = e_coli_wild_efflux, x='INHIB_AVE_wild', y='INHIB_AVE_efflux', hue='Class', s=30)

sns.despine()

# Raw strings: '\i' is not a valid Python escape sequence, so the non-raw
# literals trigger a warning on current interpreters. The label text itself
# is unchanged.
plt.xlabel(r'$\it{E. coli}$ WT Growth Inhibition (%)', font='Sans serif')

plt.ylabel(r'$\it{E. coli}$ $\it{tolC}$ Growth Inhibition (%)', font='Sans serif')

# Dashed guide lines at the lowest inhibition value that still reaches the
# z-score threshold in each strain (earlier versions of this cell hard-coded
# them as x=43.02 and y=74.98). The z-score is a monotonic transform of the
# inhibition value, so the minimum inhibition among rows at/above the
# threshold is the row with the smallest qualifying z-score.
wild_cutoff = e_coli_wild_efflux.loc[e_coli_wild_efflux['wild_stds'] >= threshold, 'INHIB_AVE_wild'].min()
tolc_cutoff = e_coli_wild_efflux.loc[e_coli_wild_efflux['tolc_stds'] >= threshold, 'INHIB_AVE_efflux'].min()

plt.axvline(x=wild_cutoff, color='red', linestyle='--', alpha=0.5)
plt.axhline(y=tolc_cutoff, color='red', linestyle='--', alpha=0.5)

plt.legend(fontsize=15)
plt.tight_layout()

plt.savefig('figures/wild_tolc_class_scatter.png', dpi=600)

png

# Save each class as its own dataset.
# Explicit copies are taken because these frames get columns overwritten or
# added further down (e.g. canonicalising SMILES); assigning into a plain
# boolean-mask slice raises pandas' SettingWithCopyWarning.

efflux_substrate = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Substrate'].copy()

efflux_evader = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Evader'].copy()

wt_only = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='WT-only Active'].copy()

# NOTE: this rebinds `inactive`, replacing the frame loaded from
# 'data_curated/new_inactive.pkl' at the top of the notebook.
inactive = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Inactive'].copy()

Resulting evaders and substrates

print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evader), len(efflux_substrate)))
No. of resulting evaders: 200 
No. of resulting substrates: 760

OM Bias

# import permeating and non-permeating datapoints; they were obtained using the same process as described above

om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')
om_non_permeating = pd.read_pickle('data_curated/om_non_permeating.pkl')
# to compare the SMILES between the two, we first convert all SMILES into the same canonical format

efflux_evader['SMILES'] = efflux_evader.SMILES.apply(Chem.CanonSmiles)
efflux_substrate['SMILES'] = efflux_substrate.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evader['SMILES'] = efflux_evader.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrate['SMILES'] = efflux_substrate.SMILES.apply(Chem.CanonSmiles)
# Keep only evaders that are also in the OM-permeating class.
# .copy() makes these independent frames: 'mol' and 'fps' columns are added to
# them later, which on a plain slice raises SettingWithCopyWarning.
efflux_evaders_om_corrected = efflux_evader[efflux_evader['SMILES'].isin(om_permeating['SMILES'])].copy()
# Keep only substrates that are NOT in the OM non-permeating class.
efflux_substrates_om_corrected = efflux_substrate[~efflux_substrate['SMILES'].isin(om_non_permeating['SMILES'])].copy()

Resulting evaders and substrates (after OM correction)

print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evaders_om_corrected), len(efflux_substrates_om_corrected)))
No. of resulting evaders: 186 
No. of resulting substrates: 554

Re-defining inactive mols

e_coli_wild_efflux['mol'] = e_coli_wild_efflux.SMILES.apply(Chem.MolFromSmiles)
[20:56:01] Explicit valence for atom # 2 C, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 C, 6, is greater than permitted
e_coli_wild_efflux = e_coli_wild_efflux.dropna()
e_coli_wild_efflux['SMILES'] = e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1164120927.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_coli_wild_efflux['SMILES'] = e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles)
wt_only['mol'] = wt_only.SMILES.apply(Chem.MolFromSmiles)
wt_only = wt_only.dropna()
wt_only['SMILES'] = wt_only.SMILES.apply(Chem.CanonSmiles)
# Since the efflux evader and substrate sets have changed, inactive molecules
# must be redefined as: the original dataset without the (OM-corrected)
# evaders, the (OM-corrected) substrates and the WT-only actives.

not_inactive = pd.concat([efflux_evaders_om_corrected, efflux_substrates_om_corrected, wt_only])

# .copy() so the column assignments below act on an independent frame instead
# of a slice of e_coli_wild_efflux (which raised SettingWithCopyWarning).
inactive = e_coli_wild_efflux[~e_coli_wild_efflux['SMILES'].isin(not_inactive['SMILES'])].copy()
# NOTE(review): rows removed from the evader/substrate sets by the OM
# correction end up here but appear to keep their original 'Class' label
# ('Efflux Evader' / 'Efflux Substrate') -- confirm whether 'Class' should be
# reset to 'Inactive' for them.
inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)

# Drop molecules RDKit could not parse and renumber the rows.
inactive = inactive.dropna(subset=['mol']).reset_index(drop=True)

inactive['SMILES'] = inactive.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\1771852805.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)
inactive.to_pickle('data_curated/new_inactive.pkl')

t-SNE of evaders vs substrates

# sample of what the dataset currently looks like
efflux_substrates_om_corrected.head(5)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
SMILES INHIB_AVE_wild INHIB_AVE_efflux Mol fps abs_diff sub_class wild_stds tolc_stds wild_class tolc_class Class
145 Brc1cncc(-c2cc(NCCCn3ccnc3)nc(-c3ccccc3)n2)c1 4.60 80.47 <rdkit.Chem.rdchem.Mol object at 0x000002164E6... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 75.87 increase 0.054629 4.326538 inactive active Efflux Substrate
308 N#C/C(=N\Nc1cccc(C(F)(F)F)c1)C(N)=S 18.36 87.98 <rdkit.Chem.rdchem.Mol object at 0x000002164E6... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 69.62 increase 1.468421 4.766464 inactive active Efflux Substrate
403 CC(C)C(=O)/C(=C/c1ccc(Cl)cc1Cl)n1cncn1 5.84 97.31 <rdkit.Chem.rdchem.Mol object at 0x00000215D73... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 91.47 increase 0.182034 5.313003 inactive active Efflux Substrate
585 O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1cccc([N+](... -3.58 88.80 <rdkit.Chem.rdchem.Mol object at 0x000002164E6... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 92.38 increase -0.785838 4.814498 inactive active Efflux Substrate
589 O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1ccc(Cl)cc1 20.78 77.14 <rdkit.Chem.rdchem.Mol object at 0x000002164E6... [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 56.36 increase 1.717067 4.131471 inactive active Efflux Substrate
# we need to compute fingerprints from SMILES for t-sne:

mfpgen =rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

efflux_evaders_om_corrected['mol'] = efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
efflux_evaders_om_corrected.dropna(subset=['mol'], inplace=True)

efflux_evaders_om_corrected['fps']=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint)

# substartes

efflux_substrates_om_corrected['mol'] = efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
efflux_substrates_om_corrected.dropna(subset=['mol'], inplace=True)

efflux_substrates_om_corrected['fps']=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected['mol'] = efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected['fps']=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected['mol'] = efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected['fps']=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint)
# combine two datasets and reset index

sub_and_evade_om_corrected = pd.concat([efflux_evaders_om_corrected,efflux_substrates_om_corrected]).reset_index(drop=True)
def tsne_no_plot(df, perp, random_state=None):
    """Compute a 2-D t-SNE embedding from a column of fingerprints.

    Parameters
    ----------
    df : pandas.Series
        Fingerprints, one per molecule; its ``.values`` are passed to RDKit's
        ``GetTanimotoSimMat``.
    perp : float
        t-SNE perplexity.
    random_state : int or None, optional
        Seed forwarded to ``TSNE`` so the embedding can be reproduced.
        The default (``None``) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per input fingerprint with columns ``TC1`` and ``TC2``.
    """
    fingerprints = df.values
    n_mol = len(fingerprints)

    # RDKit returns only the strict lower triangle as a flat sequence.
    lower_triangle = GetTanimotoSimMat(fingerprints)

    # Rebuild the full symmetric matrix; the diagonal stays 1 (self-similarity).
    similarity_matrix = np.ones([n_mol, n_mol])
    i_lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    i_upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[i_lower] = lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper]

    # Tanimoto distance = 1 - similarity.
    distance_matrix = np.subtract(1, similarity_matrix)

    # NOTE(review): the distance matrix is passed as a feature matrix (TSNE is
    # not told metric='precomputed'), so each molecule is described by its row
    # of distances. Left unchanged to keep published embeddings comparable --
    # confirm this is intended.
    TSNE_sim = TSNE(
        verbose=1,
        n_components=2,
        init='pca',
        method='barnes_hut',
        perplexity=perp,
        random_state=random_state,
    ).fit_transform(distance_matrix)

    tsne_result = pd.DataFrame(data=TSNE_sim, columns=["TC1", "TC2"])
    return tsne_result
sub_and_evade_om_corrected_tsne = tsne_no_plot(sub_and_evade_om_corrected['fps'], perp=50)

fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne, s=30 ,alpha=0.9, hue=sub_and_evade_om_corrected['Class']) 
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))

sns.kdeplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne,alpha=0.7, hue=sub_and_evade_om_corrected['Class'], levels = 4)
# plt.legend(fontsize=20)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 740 samples in 0.001s...
[t-SNE] Computed neighbors for 740 samples in 0.126s...
[t-SNE] Computed conditional probabilities for sample 740 / 740
[t-SNE] Mean sigma: 0.709102
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.515961
[t-SNE] KL divergence after 1000 iterations: 0.638264





<Axes: xlabel='TC1', ylabel='TC2'>

png

png

t-SNE of evader + substrate + inactive

inactive_sample = inactive.sample(500, random_state= 42)

inactive_sample['mol'] = inactive_sample.SMILES.apply(Chem.MolFromSmiles)
inactive_sample.dropna(subset=['mol'], inplace=True)

inactive_sample['fps']=inactive_sample.mol.apply(mfpgen.GetFingerprint)
# add sample of inactive mols into the mix

sub_evade_inactive = pd.concat([sub_and_evade_om_corrected, inactive_sample])
sub_evade_inactive.reset_index(drop=True, inplace=True)
sub_evade_inactive_tsne = tsne_no_plot(sub_evade_inactive['fps'], perp=50)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1240 samples in 0.002s...
[t-SNE] Computed neighbors for 1240 samples in 0.083s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1240
[t-SNE] Computed conditional probabilities for sample 1240 / 1240
[t-SNE] Mean sigma: 0.725025
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.622444
[t-SNE] KL divergence after 1000 iterations: 0.954569
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)

fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, s=20 ,alpha=0.5, hue=sub_evade_inactive['Class'], legend=False) 
sns.kdeplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, hue=sub_evade_inactive['Class'], levels = 2, linewidths=2)
sns.despine()

plt.savefig('tsne_all.svg')

png

We find some overlapping compounds

om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')
mfpgen =rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

om_permeating['mol'] = om_permeating.SMILES.apply(Chem.MolFromSmiles)
om_permeating.dropna(subset=['mol'], inplace=True)

om_permeating['fps']=om_permeating.mol.apply(mfpgen.GetFingerprint)
sub_evade_om_permeating = pd.concat([sub_and_evade_om_corrected, om_permeating])
sub_evade_om_permeating.reset_index(drop=True, inplace=True)
sub_evade_om_permeating_tsne = tsne_no_plot(sub_evade_om_permeating['fps'], perp=50)
fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne, s=30 ,alpha=0.9, hue=sub_evade_om_permeating['Class']) 
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))

sns.kdeplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne,alpha=0.7, hue=sub_evade_om_permeating['Class'], levels = 4)
# plt.legend(fontsize=20)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 961 samples in 0.001s...
[t-SNE] Computed neighbors for 961 samples in 0.068s...
[t-SNE] Computed conditional probabilities for sample 961 / 961
[t-SNE] Mean sigma: 0.854897
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.729122
[t-SNE] KL divergence after 1000 iterations: 0.519036





<Axes: xlabel='TC1', ylabel='TC2'>

png

png

PCA of evaders, substrates and inactives

We'll use the same dataset as for t-SNE.

from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
from tqdm import trange, tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def calcualte_features_single(df, col):
    """Compute descriptastorus 'rdkit2d' descriptors for one SMILES column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the molecules.
    col : str
        Name of the column in ``df`` that is fed to the descriptor generator.

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed molecule and one column per
        descriptor (the generator's leading status column is dropped). The
        index is a fresh 0..k-1 range.

    Notes
    -----
    Molecules the generator fails on are reported on stdout and skipped, so
    the result can have fewer rows than ``df``; in that case rows no longer
    line up positionally with ``df``.
    """
    generator = MakeGenerator(("rdkit2d",))
    names = [name[0] for name in generator.GetColumns()]

    l_feat = []

    print('Computing features: ')

    for smiles in tqdm(df[col].values):
        l_data = generator.process(smiles)

        # The first element is the generator's success flag. A None result
        # is also treated as a failure instead of crashing on indexing
        # (NOTE(review): confirm the generator can return None).
        if l_data is not None and l_data[0]:
            l_feat.append(l_data[1:])
        else:
            print('left: ', None if l_data is None else l_data[0])
            print(smiles)

    # Build the frame in one step: row-by-row DataFrame.append was quadratic
    # and no longer exists in pandas >= 2.0. This also yields a correctly
    # labelled empty frame when no molecule could be processed.
    feats = pd.DataFrame(l_feat, columns=names[1:])

    return feats
sub_evade_inactive_features = calcualte_features_single(sub_evade_inactive, 'SMILES')
sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']
Computing features: 


100%|██████████████████████████████████████████████████████████████████████████████| 1240/1240 [00:13<00:00, 90.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1240/1240 [00:00<00:00, 1435.85it/s]
sub_evade_inactive_features
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
BalabanJ BertzCT Chi0 Chi0n Chi0v Chi1 Chi1n Chi1v Chi2n Chi2v ... fr_sulfone fr_term_acetylene fr_tetrazole fr_thiazole fr_thiocyan fr_thiophene fr_unbrch_alkane fr_urea qed Class
0 2.508772 249.116352 6.974691 5.449320 5.449320 4.877010 3.252155 3.252155 2.362178 2.362178 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.501865 Efflux Evader
1 1.508609 845.728650 20.597801 16.576049 18.162045 14.775990 9.905963 10.698961 6.767766 7.683442 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.504707 Efflux Evader
2 0.000001 653.569301 14.396977 11.850173 15.811520 10.203510 7.173237 9.562159 5.658176 9.088344 ... 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.599582 Efflux Evader
3 2.939539 420.685437 12.344935 7.754071 9.340068 7.303549 4.082377 4.875376 2.898481 3.814156 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.669689 Efflux Evader
4 2.603746 310.650557 9.681798 6.788319 7.544248 6.236382 3.689747 4.224269 2.376957 2.644218 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.588792 Efflux Evader
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1235 1.003357 1984.841727 34.329487 27.979443 27.979443 23.749555 16.372378 16.372378 12.307394 12.307394 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.174004 Inactive
1236 1.749666 1383.833437 21.957455 16.503270 17.259199 15.011570 9.340691 9.718655 6.803797 7.210998 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.427471 Inactive
1237 1.531621 1346.959571 25.070339 20.361266 21.947263 17.546045 12.317981 13.110979 9.033422 9.887759 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.302495 Inactive
1238 1.868993 1028.780943 15.648054 12.477331 12.477331 11.326500 7.553489 7.553489 5.475973 5.475973 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.490238 Inactive
1239 2.184490 517.236837 13.120956 10.329726 11.146223 8.592224 5.624243 6.503896 4.230048 5.177742 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.860297 Inactive

1240 rows × 201 columns

# drop feature columns that contain Nans, its only 4 columns

sub_evade_inactive_features=sub_evade_inactive_features.dropna(axis=1)
# pca on all physcicochemical features:

table = sub_evade_inactive_features

#The non-redundant molecular descriptors chosen for PCA

descriptors  = table.iloc[:,:-87] # grab only physicochemical values

descriptors_std = StandardScaler().fit_transform(descriptors) #Important to avoid scaling problems between our different descriptors
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d) # Saving PCA values to a new table
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]
descriptors_pca.head(5) #Displays the PCA table

scale1 = 1.0/(max(descriptors_pca['PC1']) - min(descriptors_pca['PC1'])) 
scale2 = 1.0/(max(descriptors_pca['PC2']) - min(descriptors_pca['PC2']))

# And we add the new values to our PCA table
descriptors_pca['PC1_normalized']=[i*scale1 for i in descriptors_pca['PC1']]
descriptors_pca['PC2_normalized']=[i*scale2 for i in descriptors_pca['PC2']]


descriptors_pca['Class'] = sub_evade_inactive_features['Class']


# plt.rcParams['axes.linewidth'] = 1.5


cmap = sns.diverging_palette(133, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10,5))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])


pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')

plt.tick_params ('both',width=2,labelsize=14)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate.png', dpi=600)

plt.show()

print('same but in contours, for ease of read')

cmap = sns.diverging_palette(133, 10, as_cmap=True)


############ kdeplot


fig, ax = plt.subplots(figsize=(10,7))

sns.set_style("ticks")

# sns.set(font_scale=2)

sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)


pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)

plt.tick_params ('both',width=2,labelsize=20)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

# plt.legend()

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)

png

same but in contours, for ease of read

png

The explained variance is too low, so I will choose only 8 main features for the next PCA:

# PCA on only 8 main physicochemical features:

table = sub_evade_inactive_features

#The non-redundant molecular descriptors chosen for PCA

descriptors = table[['MolWt', 'MolLogP', 'RingCount','FractionCSP3', 'TPSA','NumHAcceptors', 'NumHDonors', 'NumRotatableBonds' ]].values

# descriptors  = table.iloc[:,:-87]

descriptors_std = StandardScaler().fit_transform(descriptors) 
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d)
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]


scale1 = 1.0/(max(descriptors_pca['PC1']) - min(descriptors_pca['PC1'])) 
scale2 = 1.0/(max(descriptors_pca['PC2']) - min(descriptors_pca['PC2']))

# And we add the new values to our PCA table
descriptors_pca['PC1_normalized']=[i*scale1 for i in descriptors_pca['PC1']]
descriptors_pca['PC2_normalized']=[i*scale2 for i in descriptors_pca['PC2']]


descriptors_pca['Class'] = sub_evade_inactive_features['Class']


# plt.rcParams['axes.linewidth'] = 1.5


cmap = sns.diverging_palette(133, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10,5))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])


pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')

plt.tick_params ('both',width=2,labelsize=14)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate.png', dpi=600)

plt.show()

print('same but in contours, for ease of read')

cmap = sns.diverging_palette(133, 10, as_cmap=True)


############ kdeplot


fig, ax = plt.subplots(figsize=(10,7))

sns.set_style("ticks")

# sns.set(font_scale=2)

sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)


pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)

plt.tick_params ('both',width=2,labelsize=20)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

# plt.legend()

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)

png

same but in contours, for ease of read

png

sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.5, hue='Class', s=20)

pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0])
plt.ylabel(pca_lab[1])


sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=2, linewidths=2)

pca_lab= ('PC1 - '+str(np.round(pca.explained_variance_ratio_[0]*100, 1)) + '%', 'PC2 - '+str(np.round(pca.explained_variance_ratio_[1]*100, 1)) + '%')
plt.xlabel(pca_lab[0])
plt.ylabel(pca_lab[1])

sns.despine()

# plt.savefig('pca_all.svg')

png

A similar result: the explained variance is now about 70%, but the classes are still not separated at all.

MMPA

To carry out MMPA I used a modified version of mmpdb: https://github.com/rdkit/mmpdb

publication : https://doi.org/10.1021/acs.jcim.8b00173

# import results from MMPA: 

efflux_mmpa_index = pd.read_pickle('data_curated/efflux_mmpa_index.pkl')

# it contains 1.4M pairs

Evader Transforms

def split_transition(df, col):
    """Split 'LHS>>RHS' transformation strings into two new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; it is modified in place.
    col : str
        Column holding strings of the form ``'<left>>><right>'``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``'LHS'`` (text before the first '>>')
        and ``'RHS'`` (text after it) added.

    Raises
    ------
    IndexError
        If a value in ``col`` contains no '>>' separator.
    """
    # Plain str.split suffices for a literal separator and avoids depending on
    # the `re` module, which this file never imports.
    parts = [value.split('>>') for value in df[col].values]
    df['LHS'] = [pieces[0] for pieces in parts]
    df['RHS'] = [pieces[1] for pieces in parts]
    return df

def mols_to_NHA(mol):
    """Return the heavy-atom count of the pattern parsed from SMARTS string `mol`."""
    pattern = Chem.MolFromSmarts(mol)
    heavy_atom_count = pattern.GetNumHeavyAtoms()
    return heavy_atom_count

def clean_mmpa_pairs_len(mmpa_df):
    """Drop matched pairs whose exchanged fragments are both at least as large as the common core.

    For every row the heavy-atom counts (via ``mols_to_NHA``) of 'common_core',
    'LHS' and 'RHS' are compared. A row fails when BOTH the LHS count and the
    RHS count are greater than or equal to the common-core count; failed rows
    are dropped by index label and a three-line summary is printed.

    Parameters
    ----------
    mmpa_df : pandas.DataFrame
        Must contain 'common_core' and either 'LHS'/'RHS' or a 'smirks'
        column from which ``split_transition`` derives them.

    Returns
    -------
    pandas.DataFrame
        ``mmpa_df`` without the rows that failed the size check.
    """
    if 'LHS' not in mmpa_df.columns: # add LHS and RHS if not present
        mmpa_df = split_transition(mmpa_df, 'smirks')     # produce LHS and RHS

    # The size check now runs in every case. Previously it sat in the `else`
    # branch, so frames that needed LHS/RHS derived first were returned unfiltered.
    core_ha = mmpa_df['common_core'].apply(mols_to_NHA) # number of heavy atoms
    lhs_ha = mmpa_df['LHS'].apply(mols_to_NHA)
    rhs_ha = mmpa_df['RHS'].apply(mols_to_NHA) # fixed: was computed from the 'LHS' column

    # True where the pair is disqualified
    failed = np.asarray((lhs_ha >= core_ha) & (rhs_ha >= core_ha), dtype=bool)
    n_failed = int(failed.sum())

    mmpa_df = mmpa_df.drop(mmpa_df.index[failed]) # drop index labels that failed the check

    print('Initial number of transofrms: {} \nNumber fo transforms disqualified based on length discrepancy: {} \nRemaining number of transforms: {}'.format(n_failed +  len(mmpa_df) , n_failed, len(mmpa_df)))
    return mmpa_df
# find evader transforms:
# keep the MMPA pairs whose compound A appears in `inactive` and whose
# compound B appears in `efflux_evaders_om_corrected` (matched on SMILES)

evader_transforms = efflux_mmpa_index.loc[
    efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)
    & efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES)
]

# apply the fragment-vs-core size filter to the selected pairs
evader_transforms = clean_mmpa_pairs_len(evader_transforms)
Initial number of transofrms: 2468 
Number fo transforms disqualified based on length discrepancy: 1856 
Remaining number of transforms: 612
len(evader_transforms.compound_structure_B.unique())
77
len(evader_transforms.compound_structure_A.unique())
397

Substrate Transforms

# keep the MMPA pairs whose compound A appears in `inactive` and whose
# compound B appears in `efflux_substrates_om_corrected` (matched on SMILES)
substrate_transforms = efflux_mmpa_index.loc[
    efflux_mmpa_index['compound_structure_B'].isin(efflux_substrates_om_corrected.SMILES)
    & efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES)
]

# apply the fragment-vs-core size filter to the selected pairs
substrate_transforms = clean_mmpa_pairs_len(substrate_transforms)
Initial number of transofrms: 6827 
Number fo transforms disqualified based on length discrepancy: 1927 
Remaining number of transforms: 4900
len(substrate_transforms.compound_structure_A.unique())
2053
len(substrate_transforms.compound_structure_B.unique())
349

Transforming substrates into evaders

def calculate_fractions_mk7_new_smarts_spec(df, smirks, measurement_delta, measurement_A, measurement_B):
    """Flag substructure matches on both sides of each pair and return their difference.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pair. Must contain 'mol_a' and 'mol_b' columns holding
        objects with a ``HasSubstructMatch`` method, plus the four columns
        named by the remaining arguments.
    smirks : str
        Name of the column copied into the output 'smirks' column.
    measurement_delta : str
        Name of the column copied into the output 'target' column.
    measurement_A, measurement_B : str
        Names of the columns copied into 'measurement' of the left / right frame.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``(diff, left, right)``. ``left`` and ``right`` have one 0/1 column per
        substructure from ``new_smarts()`` followed by 'smirks', 'measurement'
        and 'target'. ``diff`` is right minus left for the substructure
        columns, followed by 'smirks', 'measurement_A', 'measurement_B', 'target'.
    """
    substructure_mols, column_names = new_smarts()

    column_names = column_names + ['smirks', 'measurement' ,'target']

    def flag_row(row_nr, mol_column, measurement_column):
        # one-row frame of zeros with the metadata filled in, then a 1 in
        # every substructure column whose pattern matches this row's molecule
        flags = pd.DataFrame(0, index=range(1), columns=column_names)

        flags['smirks'] = df[smirks].values[row_nr]
        flags['target'] = df[measurement_delta].values[row_nr]
        flags['measurement'] = df[measurement_column].values[row_nr]

        # zip stops after the last pattern, so the three metadata names are never visited
        for column, pattern in zip(column_names, substructure_mols):
            if df[mol_column].iloc[row_nr].HasSubstructMatch(pattern):
                flags[column] = [1]

        return flags.values

    rows_left = []
    rows_right = []

    print('Calcualting LHS+RHS matches')

    for row_nr in range(len(df)):
        rows_left.append(flag_row(row_nr, 'mol_a', measurement_A))
        rows_right.append(flag_row(row_nr, 'mol_b', measurement_B))

    # stack the one-row arrays into one frame per side
    frame_left_df = pd.DataFrame(np.concatenate(rows_left), columns=column_names)
    frame_right_df = pd.DataFrame(np.concatenate(rows_right), columns=column_names)

    # difference of the flag columns only (the last three columns are metadata)
    diff = frame_right_df.iloc[:, :-3] - frame_left_df.iloc[:, :-3]

    diff['smirks'] = frame_right_df['smirks']
    diff['measurement_A'] = frame_left_df['measurement']
    diff['measurement_B'] = frame_right_df['measurement']
    diff['target'] = frame_right_df['target']

    return diff.reset_index(drop=True), frame_left_df.reset_index(drop=True), frame_right_df.reset_index(drop=True)


def new_smarts():
    """Load the functional-group table and build an RDKit query molecule per SMARTS.

    Reads 'ml_mmpa/fg_smarts_2.csv' (relative to the working directory) and
    uses its 'SMARTS' and 'name' columns.

    Returns
    -------
    tuple
        ``(mols, names)``: the ``Chem.MolFromSmarts`` result for every SMARTS
        entry in file order, and the 'name' column as a list.
    """
    fg_table = pd.read_csv('ml_mmpa/fg_smarts_2.csv')

    # fetch all substructure definitions and build molecule objects for them
    print('Generating molecular objects from pre-defined substructures')
    patterns = [Chem.MolFromSmarts(smarts) for smarts in fg_table.SMARTS]

    return patterns, fg_table.name.to_list()
def calculate_fractions_mk7_new_smarts(df):
    """Flag substructure matches for each pair using the default column names.

    Convenience wrapper around ``calculate_fractions_mk7_new_smarts_spec`` for
    frames that use the standard names 'smirks', 'measurement_delta',
    'measurement_A' and 'measurement_B'. This function used to be a
    line-for-line copy of that one with the names hard-coded; delegating keeps
    the two from drifting apart.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pair, with 'mol_a', 'mol_b', 'smirks', 'measurement_delta',
        'measurement_A' and 'measurement_B' columns.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``(diff, left, right)`` exactly as returned by
        ``calculate_fractions_mk7_new_smarts_spec``.
    """
    return calculate_fractions_mk7_new_smarts_spec(
        df,
        smirks='smirks',
        measurement_delta='measurement_delta',
        measurement_A='measurement_A',
        measurement_B='measurement_B',
    )


def new_smarts():
    """Load the functional-group table and build an RDKit query molecule per SMARTS.

    Reads 'ml_mmpa/fg_smarts_2.csv' (relative to the working directory) and
    uses its 'SMARTS' and 'name' columns.

    Returns
    -------
    tuple
        ``(mols, names)``: the ``Chem.MolFromSmarts`` result for every SMARTS
        entry in file order, and the 'name' column as a list.
    """
    fg_table = pd.read_csv('ml_mmpa/fg_smarts_2.csv')

    # fetch all substructure definitions and build molecule objects for them
    print('Generating molecular objects from pre-defined substructures')
    patterns = [Chem.MolFromSmarts(smarts) for smarts in fg_table.SMARTS]

    return patterns, fg_table.name.to_list()
# find substrate to evader transforms:
# pairs whose compound A is an efflux substrate and whose compound B is an efflux evader

# .copy() makes this an independent frame rather than a slice of
# efflux_mmpa_index, so the 'mol_a' / 'mol_b' columns assigned to it later
# do not raise pandas' SettingWithCopyWarning.
sub_to_evader_transforms = efflux_mmpa_index[(efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)) & (efflux_mmpa_index['compound_structure_A'].isin(efflux_substrates_om_corrected.SMILES))].copy()
len(sub_to_evader_transforms), len(sub_to_evader_transforms.compound_structure_A.unique()), len(sub_to_evader_transforms.compound_structure_B.unique())
(60, 26, 24)
sub_to_evader_transforms[sub_to_evader_transforms['compound_structure_B']=='O=[N+]([O-])c1ccc2no[n+]([O-])c2c1']
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B idsmiles_A idsmiles_B smirks common_core measurement_A measurement_B measurement_delta LHS RHS mol_a mol_b
1037285 O=Cc1cc([N+](=O)[O-])cc(I)c1O O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 43577 47709 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... [*:1][N+](=O)[O-] 55.67 -1.98 -57.65 [*:1]c1cc(I)c(O)c(C=O)c1 [*:1]c1ccc2no[n+]([O-])c2c1 <rdkit.Chem.rdchem.Mol object at 0x000002AA5A4... <rdkit.Chem.rdchem.Mol object at 0x000002AA5A2...
1038977 Cn1nc([N+](=O)[O-])c[n+]1[O-] O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 47632 47709 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... [*:1][N+](=O)[O-] 42.16 -1.98 -44.14 [*:1]c1c[n+]([O-])n(C)n1 [*:1]c1ccc2no[n+]([O-])c2c1 <rdkit.Chem.rdchem.Mol object at 0x000002AA5A4... <rdkit.Chem.rdchem.Mol object at 0x000002AA5A2...
new_df = sub_to_evader_transforms.groupby(['compound_structure_A', 'compound_structure_B']).size().reset_index(name='Freq')
new_df.drop_duplicates(subset=['compound_structure_B'])
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B Freq
0 CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 CC(=O)Cn1nnc([N+](=O)[O-])n1 4
1 CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-] 1
2 CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-] 1
3 CCCCCCCn1ccc(=N)cc1.I Br.CCCCCCCCCCn1ccc(=N)cc1 1
4 CCCCCCCn1ccc(=N)cc1.I Br.CCCCCCCCn1ccc(=N)cc1 1
5 CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 Oc1cccnc1/N=C/c1cc(I)cc(I)c1O 3
6 CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O... CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1 3
8 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C... 1
9 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21 1
10 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C... CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... 1
11 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)... CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=... 1
12 COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)... COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)... 2
15 Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1 Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1 1
16 Cn1cnc([N+](=O)[O-])c1Oc1ccccc1 Cn1cnc([N+](=O)[O-])c1S(=O)CC#N 1
17 Cn1cnc([N+](=O)[O-])c1Oc1ccccc1 Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C 1
18 Cn1nc([N+](=O)[O-])c[n+]1[O-] C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-] 1
22 Cn1nc([N+](=O)[O-])c[n+]1[O-] Nc1nonc1[N+](=O)[O-] 1
23 Cn1nc([N+](=O)[O-])c[n+]1[O-] O=C(O)/C=C/c1ccc([N+](=O)[O-])o1 1
24 Cn1nc([N+](=O)[O-])c[n+]1[O-] O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 1
25 N#Cc1c(Cl)nc(NN)c(Cl)c1Cl N#Cc1nc(Cl)c2sc(=O)sc2c1Cl 1
26 N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F 2
33 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl 2
34 Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1 O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1 1
35 Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1 O=C(CCl)C(=O)Nc1ccccc1 1
len(sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']))
24
e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == 'N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
array([21.71, 90.83])
e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == 'O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
array([48.74, 93.  ])
sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']).iloc[0].compound_structure_A
'N/C(=C\\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'
# Print wild-type and tolC inhibition before (compound A) and after (compound B)
# for one transform per unique compound B.

# The de-duplicated frame does not change between iterations, so it is built
# once; its length replaces the previously hard-coded 24.
trans = sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B'])

# smiles:
# NOTE(review): comp_a / comp_b are not used anywhere in this cell and depend on
# `dex` and `a`, which are not defined here - presumably left over from another
# cell. They do not depend on the loop variable, so they are computed once;
# confirm whether they can be removed.
comp_a = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_A.values[0]
comp_b = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_B.values[0]

for i in range(len(trans)):

    pair = trans.iloc[i]

    # wt and efflux pre
    pre =  e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == pair.compound_structure_A][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
    # wt and efflux post
    post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == pair.compound_structure_B][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]

    print(i+1)

    print(pair.compound_structure_A)
    print('WT: {}%, tolC: {}%'.format(pre[0], pre[1]))

    print(pair.compound_structure_B)
    print('WT: {}%, tolC: {}%'.format(post[0], post[1]))
1
N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl
WT: 21.71%, tolC: 90.83%
O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F
WT: 48.74%, tolC: 93.0%
2
Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1
WT: 39.12%, tolC: 96.44%
O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1
WT: 93.81%, tolC: 91.72%
3
Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1
WT: 37.97%, tolC: 100.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%
4
N#Cc1c(Cl)nc(NN)c(Cl)c1Cl
WT: -0.99%, tolC: 86.71%
N#Cc1nc(Cl)c2sc(=O)sc2c1Cl
WT: 80.76%, tolC: 76.9%
5
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C45CC6CC(CC(C6)C4)C5)CC3)cc21
WT: 9.66%, tolC: 97.46%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c4c(OC)cccc4OC)CC3)cc21
WT: 92.72%, tolC: 91.71%
6
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
CC(=O)Cn1nnc([N+](=O)[O-])n1
WT: 45.0%, tolC: 77.9%
7
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Cn1cnc([N+](=O)[O-])c1S(=O)CC#N
WT: 93.87%, tolC: 90.29%
8
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-]
WT: 100.62%, tolC: 102.1%
9
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)cc4)CC3)nc21
WT: -0.57%, tolC: 80.9%
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=O)OC)cc4)CC3)nc21
WT: 96.96%, tolC: 100.34%
10
CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 33.9%, tolC: 95.53%
CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 52.44%, tolC: 96.71%
11
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCC(OC)CC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 5.08%, tolC: 100.53%
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCOCC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 100.46%, tolC: 100.31%
12
Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1
WT: 27.69%, tolC: 101.73%
O=C(CCl)C(=O)Nc1ccccc1
WT: 95.28%, tolC: 92.56%
13
O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1
WT: 40.13%, tolC: 96.13%
O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl
WT: 98.55%, tolC: 98.37%
14
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=[N+]([O-])c1ccc2no[n+]([O-])c2c1
WT: 96.24%, tolC: 94.26%
15
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-]
WT: 59.06%, tolC: 98.91%
16
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=C(O)/C=C/c1ccc([N+](=O)[O-])o1
WT: 75.57%, tolC: 98.52%
17
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%
18
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-]
WT: 96.69%, tolC: 93.83%
19
Cn1cnc([N+](=O)[O-])c1Oc1ccccc1
WT: 15.81%, tolC: 94.86%
Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C
WT: 53.09%, tolC: 100.9%
20
Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1
WT: 42.19%, tolC: 100.93%
Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1
WT: 53.27%, tolC: 100.78%
21
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.CS(=O)(=O)O
WT: 100.16%, tolC: 100.18%
22
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21
WT: 98.83%, tolC: 98.54%
23
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCn1ccc(=N)cc1
WT: 58.22%, tolC: 90.97%
24
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCCCn1ccc(=N)cc1
WT: 101.08%, tolC: 95.52%
sub_to_evader_transforms['mol_a'] = sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1271073621.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_to_evader_transforms['mol_a'] = sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles)
sub_to_evader_transforms['mol_b'] = sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles)
[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1879633430.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_to_evader_transforms['mol_b'] = sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles)
sub_to_evader_transforms.mol_b.isna().any()
False
feat_diff, feat_left, feat_right = master_functions.calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)
H:\My Drive\co_add_jupyter
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
feat_diff, feat_left, feat_right = calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
#drop zeros
feat_diff = feat_diff.loc[:, (feat_diff != 0).any(axis=0)]
feat_diff
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
1 - Alkane group 1,2-Dicarbonyl not in ring 10 - Aldehyde 13 - Ether 15 - Secondary amine group 16 - Tertiary amine 17 - Aromatic amine 18 - Pyridine 19 - CCN 2 - Olefin group ... Thionyl Vinyl michael acceptor1 Primary amine, not amide Primary or secondary amine, not amide. tertiary aliphatic amine carboxylic acid smirks measurement_A measurement_B target
0 0 0 0 0 0 0 0 0 0 -1 ... 0 0 -1 -1 0 0 [*:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[*:1]C(=O)CC(=O... 69.12 44.26 -24.86
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(Br)cc(Cl)c1O>>[*:1]c1ccc([N+](=O)[O-... 57.32 -2.09 -59.41
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 63.01 -2.09 -65.1
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O 63.01 36.45 -26.56
4 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(CC)ccc1O>>[*:1]c1ncccc1O 53.18 36.45 -16.73
5 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O 72.7 36.45 -36.25
6 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O 55.41 36.45 -18.96
7 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O 59.53 36.45 -23.08
8 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O 64.42 36.45 -27.97
9 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1I>>[*:1]c1ncccc1O 62.45 36.45 -26.0
10 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1OC>>[*:1]c1ncccc1O 64.28 36.45 -27.83
11 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccc(CC)cc1[*:2]>>[*:1]c1cccnc1[*:2] 53.18 36.45 -16.73
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 -1 0 0 [*:1]c1c(Cl)nc(NN)c([*:2])c1[*:3]>>[*:1]c1nc([... 87.7 -3.86 -91.56
13 0 0 0 0 0 0 0 0 0 -1 ... 0 0 -1 -1 0 0 [*:1]/C=C(\N)C(Cl)(Cl)Cl>>[*:1]CC(=O)C(F)(F)F 69.12 44.26 -24.86
14 -1 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C12CC3CC(CC(C3)C1)C2>>[*:1]c1c(OC)cccc1OC 87.8 -1.01 -88.81
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Cn1cc([*:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn... 86.1 32.9 -53.2
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Cn1cc([N+](=O)[O-])c([*:2])n1>>[*:1]Cn1nn... 86.1 32.9 -53.2
17 0 0 0 0 0 0 0 0 1 0 ... 1 0 0 0 0 0 [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1ncn([*:2]... 42.16 -3.58 -45.74
18 0 0 0 0 1 0 0 0 0 1 ... 0 0 0 1 0 0 [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1nn([*:2])... 42.16 1.48 -40.68
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]O[*:2]>>[*:1]OC([*:2])=O 81.47 3.38 -78.09
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C1CN([*:2])CC(C)N1>>[*:1]C1CN([*:2])CCN1 61.63 44.27 -17.36
21 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C>>[*:1][H] 61.63 44.27 -17.36
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cn([*:2])nc1[N+](=O)[O-]>>[*:1]c1nnn([*... 86.1 32.9 -53.2
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1nn([*:2])cc1[N+](=O)[O-]>>[*:1]c1nnn([*... 86.1 32.9 -53.2
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N1CCC(OC)CC1>>[*:1]N1CCOCC1 95.45 -0.15 -95.6
25 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C/c1cc(Br)cc([*:2])c1O>>[*:2]CC(=O)C(=... 74.04 -2.72 -76.76
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCN1CCC(OC)CC1>>[*:1]CCN1CCOCC1 95.45 -0.15 -95.6
27 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cc(CC)ccc1[*:2]>>[*:1]/C=N\c1ncccc... 53.18 36.45 -16.73
28 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... 56.0 -0.18 -56.18
29 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... 56.0 -0.18 -56.18
30 0 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... 55.67 -1.98 -57.65
31 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... 55.67 39.85 -15.82
32 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... 55.67 39.85 -15.82
33 1 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 55.67 32.9 -22.77
34 0 0 -1 0 0 0 0 0 0 1 ... 0 1 0 0 0 1 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... 55.67 22.95 -32.72
35 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N 55.67 -3.09 -58.76
36 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... 42.16 -1.98 -44.14
37 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cc([N+](=O)[O... 42.16 39.85 -2.31
38 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cnc(N)c([N+](... 42.16 39.85 -2.31
39 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nnn(CC(C)=O)n1 42.16 32.9 -9.26
40 -1 0 0 0 0 0 0 0 0 1 ... 0 1 0 0 0 1 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc(/C=C/C(=O... 42.16 22.95 -19.21
41 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nonc1N 42.16 -3.09 -45.25
42 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1cnc(N)c([*:... 86.1 39.85 -46.25
43 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1ncn(CCO)c1[... 86.1 -2.86 -88.96
44 0 0 0 0 0 0 0 0 1 0 ... 1 0 0 0 0 0 [*:1]Oc1ccccc1>>[*:1]S(=O)CC#N 79.05 -3.58 -82.63
45 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Oc1ccccc1>>[*:1]Sc1nnnn1C 79.05 47.81 -31.24
46 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C>>[*:1][H] 58.74 47.51 -11.23
47 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O 63.01 36.45 -26.56
48 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O 72.7 36.45 -36.25
49 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O 55.41 36.45 -18.96
50 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O 59.53 36.45 -23.08
51 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O 62.45 36.45 -26.0
52 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O 64.28 36.45 -27.83
53 0 0 0 0 0 0 -1 1 0 0 ... 0 0 -1 -1 0 0 [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O 70.9 36.45 -34.45
54 1 0 0 0 0 1 0 0 0 0 ... 0 0 0 -1 1 0 [*:1]N/N=C/c1ccccc1>>[*:1]N1CCN(C)CC1 83.39 0.02 -83.37
55 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N/N=C/c1ccccc1>>[*:1]N1CCNCC1 83.39 -0.29 -83.68
56 0 0 0 0 1 -1 0 0 0 0 ... 0 0 0 1 -1 0 [*:1]N1CCN(CCO)CC1>>[*:1]N1CCNC(C)C1 70.95 44.27 -26.68
57 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N1CC(C)NC(C)C1>>[*:1]N1CCNC(C)C1 61.63 44.27 -17.36
58 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCCCCCC>>[*:1]CCCCCCCC 56.99 32.75 -24.24
59 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCCCCCC>>[*:1]CCCCCCCCCC 56.99 -5.56 -62.55

60 rows × 119 columns

feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(25)
B7                                        22
18 - Pyridine                             17
NUC                                       16
sp2 hybridized carbon atoms (12)          10
Nitrogen atoms (5)                         9
sp3 hybridized carbon atoms (10)           7
B9                                         7
Nitrogen atoms (2)                         7
N6                                         7
N9                                         7
ACID                                       7
17 - Aromatic amine                        6
sp3 hybridized carbon atoms (5)            5
A33 - phenol                               5
E3 - e.g., carbonates                      5
15 - Secondary amine group                 5
sp2 hybridized carbon atoms (10)           4
Primary amine, not amide                   4
Primary or secondary amine, not amide.     4
Alpha halo carbonyl                        4
9 - Carbonyl                               3
Ketone                                     3
Imines_(not_ring)                          3
sp3 hybridized carbon atoms (2)            3
Aromatic NO2                               2
dtype: object
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(25)
4 - Aromatic carbon-alkane                               -3
B8EXC                                                    -3
N4EXC                                                    -3
Positively charged atoms                                 -3
ELEC                                                     -3
Negatively charged atoms                                 -3
13 - Ether                                               -3
Acyclic N-,=N and not N bound to carbonyl or sulfone     -3
25 - Aromatic chloro                                     -4
38 - Aromatic fluoro                                     -4
N oxide                                                  -5
sp2 hybridized carbon atoms (8)                          -5
10 - Aldehyde                                            -6
1 - Alkane group                                         -6
sp2 hybridized carbon atoms (7)                          -6
Aldehyde carbon atoms                                    -6
E1 - alkyl and aryl ketones and aldehydes                -6
Quaternary nitrogen (1)                                  -7
8 - Aromatic carbon-alcohol                             -10
32 - Iodo compounds                                     -11
Aryl iodide                                             -11
Iodine                                                  -11
sp3 hybridized carbon atoms (11)                        -14
sp2 hybridized carbon atoms (11)                        -18
3 - Aromatic carbon                                     -22
dtype: object
feat_diff
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
1 - Alkane group 1,2-Dicarbonyl not in ring 10 - Aldehyde 13 - Ether 15 - Secondary amine group 16 - Tertiary amine 17 - Aromatic amine 18 - Pyridine 19 - CCN 2 - Olefin group ... Thionyl Vinyl michael acceptor1 Primary amine, not amide Primary or secondary amine, not amide. tertiary aliphatic amine carboxylic acid smirks measurement_A measurement_B target
0 0 0 0 0 0 0 0 0 0 -1 ... 0 0 -1 -1 0 0 [*:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[*:1]C(=O)CC(=O... 69.12 44.26 -24.86
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(Br)cc(Cl)c1O>>[*:1]c1ccc([N+](=O)[O-... 57.32 -2.09 -59.41
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 63.01 -2.09 -65.1
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O 63.01 36.45 -26.56
4 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(CC)ccc1O>>[*:1]c1ncccc1O 53.18 36.45 -16.73
5 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O 72.7 36.45 -36.25
6 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O 55.41 36.45 -18.96
7 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O 59.53 36.45 -23.08
8 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O 64.42 36.45 -27.97
9 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1I>>[*:1]c1ncccc1O 62.45 36.45 -26.0
10 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1OC>>[*:1]c1ncccc1O 64.28 36.45 -27.83
11 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccc(CC)cc1[*:2]>>[*:1]c1cccnc1[*:2] 53.18 36.45 -16.73
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 -1 0 0 [*:1]c1c(Cl)nc(NN)c([*:2])c1[*:3]>>[*:1]c1nc([... 87.7 -3.86 -91.56
13 0 0 0 0 0 0 0 0 0 -1 ... 0 0 -1 -1 0 0 [*:1]/C=C(\N)C(Cl)(Cl)Cl>>[*:1]CC(=O)C(F)(F)F 69.12 44.26 -24.86
14 -1 0 0 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C12CC3CC(CC(C3)C1)C2>>[*:1]c1c(OC)cccc1OC 87.8 -1.01 -88.81
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Cn1cc([*:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn... 86.1 32.9 -53.2
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Cn1cc([N+](=O)[O-])c([*:2])n1>>[*:1]Cn1nn... 86.1 32.9 -53.2
17 0 0 0 0 0 0 0 0 1 0 ... 1 0 0 0 0 0 [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1ncn([*:2]... 42.16 -3.58 -45.74
18 0 0 0 0 1 0 0 0 0 1 ... 0 0 0 1 0 0 [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1nn([*:2])... 42.16 1.48 -40.68
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]O[*:2]>>[*:1]OC([*:2])=O 81.47 3.38 -78.09
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C1CN([*:2])CC(C)N1>>[*:1]C1CN([*:2])CCN1 61.63 44.27 -17.36
21 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C>>[*:1][H] 61.63 44.27 -17.36
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cn([*:2])nc1[N+](=O)[O-]>>[*:1]c1nnn([*... 86.1 32.9 -53.2
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1nn([*:2])cc1[N+](=O)[O-]>>[*:1]c1nnn([*... 86.1 32.9 -53.2
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N1CCC(OC)CC1>>[*:1]N1CCOCC1 95.45 -0.15 -95.6
25 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C/c1cc(Br)cc([*:2])c1O>>[*:2]CC(=O)C(=... 74.04 -2.72 -76.76
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCN1CCC(OC)CC1>>[*:1]CCN1CCOCC1 95.45 -0.15 -95.6
27 -1 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cc(CC)ccc1[*:2]>>[*:1]/C=N\c1ncccc... 53.18 36.45 -16.73
28 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... 56.0 -0.18 -56.18
29 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... 56.0 -0.18 -56.18
30 0 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... 55.67 -1.98 -57.65
31 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... 55.67 39.85 -15.82
32 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... 55.67 39.85 -15.82
33 1 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 55.67 32.9 -22.77
34 0 0 -1 0 0 0 0 0 0 1 ... 0 1 0 0 0 1 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... 55.67 22.95 -32.72
35 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N 55.67 -3.09 -58.76
36 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... 42.16 -1.98 -44.14
37 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cc([N+](=O)[O... 42.16 39.85 -2.31
38 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cnc(N)c([N+](... 42.16 39.85 -2.31
39 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nnn(CC(C)=O)n1 42.16 32.9 -9.26
40 -1 0 0 0 0 0 0 0 0 1 ... 0 1 0 0 0 1 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc(/C=C/C(=O... 42.16 22.95 -19.21
41 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nonc1N 42.16 -3.09 -45.25
42 -1 0 0 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1cnc(N)c([*:... 86.1 39.85 -46.25
43 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1ncn(CCO)c1[... 86.1 -2.86 -88.96
44 0 0 0 0 0 0 0 0 1 0 ... 1 0 0 0 0 0 [*:1]Oc1ccccc1>>[*:1]S(=O)CC#N 79.05 -3.58 -82.63
45 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]Oc1ccccc1>>[*:1]Sc1nnnn1C 79.05 47.81 -31.24
46 -1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]C>>[*:1][H] 58.74 47.51 -11.23
47 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O 63.01 36.45 -26.56
48 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O 72.7 36.45 -36.25
49 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O 55.41 36.45 -18.96
50 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O 59.53 36.45 -23.08
51 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O 62.45 36.45 -26.0
52 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O 64.28 36.45 -27.83
53 0 0 0 0 0 0 -1 1 0 0 ... 0 0 -1 -1 0 0 [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O 70.9 36.45 -34.45
54 1 0 0 0 0 1 0 0 0 0 ... 0 0 0 -1 1 0 [*:1]N/N=C/c1ccccc1>>[*:1]N1CCN(C)CC1 83.39 0.02 -83.37
55 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N/N=C/c1ccccc1>>[*:1]N1CCNCC1 83.39 -0.29 -83.68
56 0 0 0 0 1 -1 0 0 0 0 ... 0 0 0 1 -1 0 [*:1]N1CCN(CCO)CC1>>[*:1]N1CCNC(C)C1 70.95 44.27 -26.68
57 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]N1CC(C)NC(C)C1>>[*:1]N1CCNC(C)C1 61.63 44.27 -17.36
58 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCCCCCC>>[*:1]CCCCCCCC 56.99 32.75 -24.24
59 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]CCCCCCC>>[*:1]CCCCCCCCCC 56.99 -5.56 -62.55

60 rows × 119 columns

find correlated feats:

# Fragment-count differences only: the trailing four columns of feat_diff
# (smirks, measurement_A, measurement_B, target) are not fragment features.
fragment_diffs = feat_diff.iloc[:, :-4]

# Pairwise correlation between the fragment-count changes.
corr_feat = fragment_diffs.astype(float)
corr = corr_feat.corr()

# Among transforms that lose an aromatic carbon, sum every fragment change
# and show the 20 fragments with the largest net gain.
loses_aromatic_carbon = fragment_diffs['3 - Aromatic carbon'] < 0
fragment_diffs[loses_aromatic_carbon].sum().sort_values(ascending=False).head(20)
sp2 hybridized carbon atoms (12)    17
18 - Pyridine                       17
B7                                  17
NUC                                 15
N6                                  15
ACID                                13
A33 - phenol                        13
Nitrogen atoms (5)                   4
sp2 hybridized carbon atoms (10)     4
15 - Secondary amine group           3
sp3 hybridized carbon atoms (10)     3
Enamine                              3
Alpha halo carbonyl                  2
22 - CCl2                            2
5 - Alcohol                          2
Alkyl halide                         2
Nitrogen atoms (1)                   2
sp3 hybridized carbon atoms (5)      2
sp3 hybridized carbon atoms (2)      2
sp3 hybridized carbon atoms (12)     2
dtype: object
sub_to_evader_index_reset = sub_to_evader_transforms.reset_index(drop=True)
feat_diff[feat_diff['Iodine']<0]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
1 - Alkane group 1,2-Dicarbonyl not in ring 10 - Aldehyde 13 - Ether 15 - Secondary amine group 16 - Tertiary amine 17 - Aromatic amine 18 - Pyridine 19 - CCN 2 - Olefin group ... Thionyl Vinyl michael acceptor1 Primary amine, not amide Primary or secondary amine, not amide. tertiary aliphatic amine carboxylic acid smirks measurement_A measurement_B target
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 63.01 -2.09 -65.1
9 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1I>>[*:1]c1ncccc1O 62.45 36.45 -26.0
28 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... 56.0 -0.18 -56.18
29 1 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... 56.0 -0.18 -56.18
30 0 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... 55.67 -1.98 -57.65
31 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... 55.67 39.85 -15.82
32 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... 55.67 39.85 -15.82
33 1 0 -1 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 55.67 32.9 -22.77
34 0 0 -1 0 0 0 0 0 0 1 ... 0 1 0 0 0 1 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... 55.67 22.95 -32.72
35 0 0 -1 0 0 0 1 0 0 0 ... 0 0 1 1 0 0 [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N 55.67 -3.09 -58.76
51 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O 62.45 36.45 -26.0

11 rows × 119 columns

len(sub_to_evader_index_reset.iloc[feat_diff[feat_diff['Iodine']<0].index].compound_structure_B.unique())
8
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Negatively charged atoms                                 -3
13 - Ether                                               -3
Acyclic N-,=N and not N bound to carbonyl or sulfone     -3
25 - Aromatic chloro                                     -4
38 - Aromatic fluoro                                     -4
N oxide                                                  -5
sp2 hybridized carbon atoms (8)                          -5
10 - Aldehyde                                            -6
1 - Alkane group                                         -6
sp2 hybridized carbon atoms (7)                          -6
Aldehyde carbon atoms                                    -6
E1 - alkyl and aryl ketones and aldehydes                -6
Quaternary nitrogen (1)                                  -7
8 - Aromatic carbon-alcohol                             -10
32 - Iodo compounds                                     -11
Aryl iodide                                             -11
Iodine                                                  -11
sp3 hybridized carbon atoms (11)                        -14
sp2 hybridized carbon atoms (11)                        -18
3 - Aromatic carbon                                     -22
dtype: object
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)
B7                                        22
18 - Pyridine                             17
NUC                                       16
sp2 hybridized carbon atoms (12)          10
Nitrogen atoms (5)                         9
sp3 hybridized carbon atoms (10)           7
B9                                         7
Nitrogen atoms (2)                         7
N6                                         7
N9                                         7
ACID                                       7
17 - Aromatic amine                        6
sp3 hybridized carbon atoms (5)            5
A33 - phenol                               5
E3 - e.g., carbonates                      5
15 - Secondary amine group                 5
sp2 hybridized carbon atoms (10)           4
Primary amine, not amide                   4
Primary or secondary amine, not amide.     4
Alpha halo carbonyl                        4
dtype: object

vis

# Index labels of the transforms that gain the 'B7' fragment.
gains_b7 = feat_diff['B7'] > 0
search = feat_diff.index[gains_b7]

# Number of distinct B-side compounds reached by those transforms.
# NOTE(review): .iloc is positional, so this relies on feat_diff carrying a
# default 0..n-1 index that lines up with sub_to_evader_index_reset — confirm.
b7_gain_rows = sub_to_evader_index_reset.iloc[search]
len(b7_gain_rows.compound_structure_B.unique())
2
feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['E1 - alkyl and aryl ketones and aldehydes']<0)].sum().sort_values(ascending=False).head(20)
Primary or secondary amine, not amide.    3
Primary amine, not amide                  3
B8EXC                                     3
17 - Aromatic amine                       3
B9                                        3
Negatively charged atoms                  3
Positively charged atoms                  3
Nitrogen atoms (2)                        3
Nitrogen atoms (7)                        3
Nitrogen atoms (4)                        2
Nitrogen atoms (5)                        2
B7                                        2
Dye 16 (1)                                2
E3 - e.g., carbonates                     2
N4EXC                                     2
Nitro group                               2
Aromatic NO2                              2
27 - Aromatic nitro                       2
sp2 hybridized carbon atoms (12)          2
Oxygen-nitrogen single bond               2
dtype: object
# Fragment columns excluded from the significance analysis.
# (Set to_drop = [] to keep every column.)
to_drop = ['18 - Pyridine', 'N9']

# errors='ignore' makes this cell safe to re-run: previously 'N9' was dropped
# from feat_diff on its own and then again as part of to_drop, which raises
# KeyError on the second drop because the column is already gone.
feat_diff = feat_diff.drop(columns=to_drop, errors='ignore')
feat_left = feat_left.drop(columns=to_drop, errors='ignore')
feat_right = feat_right.drop(columns=to_drop, errors='ignore')

# NOTE(review): 0.05 is presumably the significance (p-value) cut-off —
# confirm against master_functions.find_sig_feats_mk2.
fr_sig_descriptors_evade = master_functions.find_sig_feats_mk2(feat_left, feat_right, 0.05)

# No significant fractions are excluded from the results table.
fractions_to_drop = []

results_evader = master_functions.results_arr(feat_diff, fr_sig_descriptors_evade, feat_right, feat_left, fractions_to_drop)
Found significant fractions:  21
10 - Aldehyde has negative correlation 
percentage_loss 100
15 - Secondary amine group has positive correlation 
0/1/2 loss
[('3 - Aromatic carbon', 'Nitrogen atoms (5)', 'N4EXC'), 'sp2 hybridized carbon atoms (11)', 'Iodine']
[-60.0, -40.0, -40.0]
percentage gain under -100
17 - Aromatic amine has positive correlation 
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'E1 - alkyl and aryl ketones and aldehydes', 'Iodine']
[-57.14, -42.86, -42.86]
percentage gain under -100
25 - Aromatic chloro has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'ACID', 'N4EXC']
[50.0, 25.0, 25.0]
3 - Aromatic carbon has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'NUC', 'N6']
[73.91, 65.22, 65.22]
percentage_loss 100
32 - Iodo compounds has negative correlation 
percentage_loss 100
38 - Aromatic fluoro has negative correlation 
percentage_loss 100
8 - Aromatic carbon-alcohol has negative correlation 
all gain
[('B8EXC', 'Positively charged atoms', 'Negatively charged atoms'), 'Dye 16 (1)', 'Nitrogen atoms (2)']
[50.0, 40.0, 40.0]
percentage_loss 100
Aldehyde carbon atoms has negative correlation 
percentage_loss 100
Alpha halo carbonyl has positive correlation 
1/2/3 loss
['sp2 hybridized carbon atoms (11)', ('32 - Iodo compounds', '3 - Aromatic carbon', 'Nitrogen atoms (5)'), 'Iodine']
[-100.0, -50.0, -50.0]
percentage gain under -100
Aryl iodide has negative correlation 
percentage_loss 100
B7 has positive correlation 
percentage gain under -100
B9 has positive correlation 
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'Aldehyde carbon atoms', 'ACID']
[-50.0, -37.5, -37.5]
percentage gain under -100
E1 - alkyl and aryl ketones and aldehydes has negative correlation 
percentage_loss 100
Iodine has negative correlation 
percentage_loss 100
Nitrogen atoms (5) has positive correlation 
percentage gain under -100
NUC has positive correlation 
second double loss
['3 - Aromatic carbon', ('sp3 hybridized carbon atoms (11)', 'sp2 hybridized carbon atoms (11)'), 'sp2 hybridized carbon atoms (7)']
[-72.73, -40.91, -22.73]
percentage gain under -100
Quaternary nitrogen (1) has negative correlation 
percentage_loss 100
sp2 hybridized carbon atoms (11) has negative correlation 
first_gain
[('B7', 'sp2 hybridized carbon atoms (12)'), 'NUC', 'sp3 hybridized carbon atoms (10)']
[50.0, 36.36, 31.82]
percentage_loss 100
sp2 hybridized carbon atoms (7) has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'A33 - phenol', 'NUC']
[100.0, 83.33, 83.33]
percentage_loss 100
sp3 hybridized carbon atoms (11) has negative correlation 
percentage_loss 100
results_evader.sort_values(by='dof')
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Main fraction Correlation $\overline{\Delta P}$ sem std dof Opposite fraction 1 % of opposite 1 Opposite fraction 2 % of opposite 2 Opposite fraction 3 % of opposite 3
3 25 - Aromatic chloro Negative 55.87 13.09 26.18 4 (sp2 hybridized carbon atoms (12), B7) 50.00 ACID 25.00 N4EXC 25.00
9 Alpha halo carbonyl Positive -40.52 9.04 18.08 4 sp2 hybridized carbon atoms (11) -100.00 (32 - Iodo compounds, 3 - Aromatic carbon, Nit... -50.00 Iodine -50.00
6 38 - Aromatic fluoro Negative 22.76 2.19 4.39 4 ACID 100.00 (NUC, A33 - phenol, sp2 hybridized carbon atom... 100.00 N6 100.00
1 15 - Secondary amine group Positive -52.68 9.50 21.25 5 (3 - Aromatic carbon, Nitrogen atoms (5), N4EXC) -60.00 sp2 hybridized carbon atoms (11) -40.00 Iodine -40.00
19 sp2 hybridized carbon atoms (7) Negative 24.81 2.85 6.98 6 (sp2 hybridized carbon atoms (12), B7) 100.00 A33 - phenol 83.33 NUC 83.33
0 10 - Aldehyde Negative 33.92 8.09 19.80 6 Primary or secondary amine, not amide. 50.00 (Primary amine, not amide, B8EXC, 17 - Aromati... 50.00 B9 50.00
13 E1 - alkyl and aryl ketones and aldehydes Negative 33.92 8.09 19.80 6 Primary or secondary amine, not amide. 50.00 (Primary amine, not amide, B8EXC, 17 - Aromati... 50.00 B9 50.00
8 Aldehyde carbon atoms Negative 33.92 8.09 19.80 6 Primary or secondary amine, not amide. 50.00 (Primary amine, not amide, B8EXC, 17 - Aromati... 50.00 B9 50.00
2 17 - Aromatic amine Positive -26.65 8.70 23.02 7 (1 - Alkane group, ELEC, sp3 hybridized carbon... -57.14 E1 - alkyl and aryl ketones and aldehydes -42.86 Iodine -42.86
17 Quaternary nitrogen (1) Negative 24.03 7.73 20.46 7 NUC 42.86 (Primary or secondary amine, not amide., Prima... 42.86 17 - Aromatic amine 42.86
12 B9 Positive -28.40 7.74 21.89 8 (1 - Alkane group, ELEC, sp3 hybridized carbon... -50.00 Aldehyde carbon atoms -37.50 ACID -37.50
7 8 - Aromatic carbon-alcohol Negative 42.15 7.48 23.65 10 (B8EXC, Positively charged atoms, Negatively c... 50.00 Dye 16 (1) 40.00 Nitrogen atoms (2) 40.00
5 32 - Iodo compounds Negative 39.36 5.82 19.30 11 Nitrogen atoms (2) 45.45 (B8EXC, B7, Negatively charged atoms) 36.36 Positively charged atoms 36.36
10 Aryl iodide Negative 39.36 5.82 19.30 11 Nitrogen atoms (2) 45.45 (B8EXC, B7, Negatively charged atoms) 36.36 Positively charged atoms 36.36
14 Iodine Negative 39.36 5.82 19.30 11 Nitrogen atoms (2) 45.45 (B8EXC, B7, Negatively charged atoms) 36.36 Positively charged atoms 36.36
15 Nitrogen atoms (5) Positive -21.62 3.16 11.81 14 sp2 hybridized carbon atoms (11) -78.57 3 - Aromatic carbon -64.29 sp3 hybridized carbon atoms (11) -50.00
20 sp3 hybridized carbon atoms (11) Negative 33.56 6.73 29.34 19 B7 52.63 NUC 47.37 Nitrogen atoms (5) 36.84
16 NUC Positive -34.87 4.61 21.64 22 3 - Aromatic carbon -72.73 (sp3 hybridized carbon atoms (11), sp2 hybridi... -40.91 sp2 hybridized carbon atoms (7) -22.73
11 B7 Positive -23.29 2.20 10.34 22 3 - Aromatic carbon -77.27 sp2 hybridized carbon atoms (11) -50.00 sp3 hybridized carbon atoms (11) -45.45
18 sp2 hybridized carbon atoms (11) Negative 38.77 5.16 24.20 22 (B7, sp2 hybridized carbon atoms (12)) 50.00 NUC 36.36 sp3 hybridized carbon atoms (10) 31.82
4 3 - Aromatic carbon Negative 37.77 4.81 23.06 23 (sp2 hybridized carbon atoms (12), B7) 73.91 NUC 65.22 N6 65.22
master_functions.plot_feats(results_evader)

png

find examples visually

feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['Quaternary nitrogen (1)']<0)].sum().sort_values(ascending=False).head(20)
NUC                                                             3
Primary or secondary amine, not amide.                          3
Primary amine, not amide                                        3
B9                                                              3
17 - Aromatic amine                                             3
Nitrogen atoms (2)                                              3
Nitro group                                                     2
Nitrogen atoms (4)                                              2
Dye 16 (1)                                                      2
Nitrogen atoms (5)                                              2
27 - Aromatic nitro                                             2
sp2 hybridized carbon atoms (11)                                2
E3 - e.g., carbonates                                           2
B7                                                              2
sp3 hybridized carbon atoms (10)                                2
N4EXC                                                           2
Oxygen-nitrogen single bond                                     2
Aromatic NO2                                                    2
sp3 hybridized carbon atoms (5)                                 2
Alpha beta-unsaturated ketones; center of Michael reactivity    1
dtype: object
# Example transforms that gain `to_fg` while losing `from_fg`.
to_fg = '17 - Aromatic amine'
from_fg = 'Quaternary nitrogen (1)'

gains_target = feat_diff[to_fg] > 0
loses_source = feat_diff[from_fg] < 0
dex = feat_diff[gains_target & loses_source]  # one row per matching transform; smirks may differ

print(len(dex))
print('number of unique smirks:', len(dex.smirks.unique()) )

low = 4  # not referenced in this cell

# For each matching SMIRKS take the first transform row that carries it and
# queue its LHS and RHS fragments, so every grid row shows one before/after pair.
display_arr = []
for smirks_pattern in dex.smirks:
    matching_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == smirks_pattern]
    display_lhs_sub = matching_rows.LHS.iloc[0]
    display_rhs_sub = matching_rows.RHS.iloc[0]
    display_arr.extend([Chem.MolFromSmiles(display_lhs_sub), Chem.MolFromSmiles(display_rhs_sub)])

# One label per transform; built but not passed to the grid below.
leg = list(map(str, range(len(dex))))
Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)
3
number of unique smirks: 3

svg

# Pick one transform from dex and compare the inhibition of its two compounds.
# The grid above holds two molecules per dex row, so a // 2 turns `a` into a
# dex row position (presumably `a` is the grid position of the pair's first
# molecule — confirm).
a = 4

chosen_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[a // 2].smirks]

# SMILES of the compound before (A) and after (B) the transform.
comp_a = chosen_rows.compound_structure_A.values[0]
comp_b = chosen_rows.compound_structure_B.values[0]

# Wild-type and efflux-strain inhibition for each compound (first matching row).
inhib_cols = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
pre = e_coli_wild_efflux.loc[e_coli_wild_efflux['SMILES'] == comp_a, inhib_cols].values[0]
post = e_coli_wild_efflux.loc[e_coli_wild_efflux['SMILES'] == comp_b, inhib_cols].values[0]

for compound_smiles, (wt_pct, tolc_pct) in ((comp_a, pre), (comp_b, post)):
    print(compound_smiles)
    print('WT: {}%, tolC: {}%'.format(wt_pct, tolc_pct))
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%
# Example transforms that gain `to_fg` while losing `from_fg`
# (negative-transform examples).
to_fg = 'B7'
from_fg = 'Iodine'

gains_target = feat_diff[to_fg] > 0
loses_source = feat_diff[from_fg] < 0
dex = feat_diff[gains_target & loses_source]  # one row per matching transform; smirks may differ

print(len(dex))
print('number of unique smirks:', len(dex.smirks.unique()) )

low = 4  # not referenced in this cell

# For each matching SMIRKS take the first transform row that carries it and
# queue its LHS and RHS fragments, so every grid row shows one before/after pair.
display_arr = []
for smirks_pattern in dex.smirks:
    matching_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == smirks_pattern]
    display_lhs_sub = matching_rows.LHS.iloc[0]
    display_rhs_sub = matching_rows.RHS.iloc[0]
    display_arr.extend([Chem.MolFromSmiles(display_lhs_sub), Chem.MolFromSmiles(display_rhs_sub)])

Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)
4
number of unique smirks: 4

svg

dex
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
1 - Alkane group 1,2-Dicarbonyl not in ring 10 - Aldehyde 13 - Ether 15 - Secondary amine group 16 - Tertiary amine 17 - Aromatic amine 18 - Pyridine 19 - CCN 2 - Olefin group ... Thionyl Vinyl michael acceptor1 Primary amine, not amide Primary or secondary amine, not amide. tertiary aliphatic amine carboxylic acid smirks measurement_A measurement_B target
3 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O 63.01 36.45 -26.56
5 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O 72.7 36.45 -36.25
6 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O 55.41 36.45 -18.96
7 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O 59.53 36.45 -23.08
8 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O 64.42 36.45 -27.97
9 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1I>>[*:1]c1ncccc1O 62.45 36.45 -26.0
10 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]c1ccccc1OC>>[*:1]c1ncccc1O 64.28 36.45 -27.83
47 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O 63.01 36.45 -26.56
48 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O 72.7 36.45 -36.25
49 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O 55.41 36.45 -18.96
50 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O 59.53 36.45 -23.08
51 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O 62.45 36.45 -26.0
52 0 0 0 -1 0 0 0 1 0 0 ... 0 0 0 0 0 0 [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O 64.28 36.45 -27.83
53 0 0 0 0 0 0 -1 1 0 0 ... 0 0 -1 -1 0 0 [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O 70.9 36.45 -34.45

14 rows × 119 columns

# Pick one transform from dex and compare the inhibition of its two compounds.
# a // 2 turns `a` into a dex row position (presumably `a` is the position of
# the pair's first molecule in a two-molecules-per-row grid — confirm).
a = 18

chosen_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[a // 2].smirks]

# SMILES of the compound before (A) and after (B) the transform.
comp_a = chosen_rows.compound_structure_A.values[0]
comp_b = chosen_rows.compound_structure_B.values[0]

# Wild-type and efflux-strain inhibition for each compound (first matching row).
inhib_cols = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
pre = e_coli_wild_efflux.loc[e_coli_wild_efflux['SMILES'] == comp_a, inhib_cols].values[0]
post = e_coli_wild_efflux.loc[e_coli_wild_efflux['SMILES'] == comp_b, inhib_cols].values[0]

for compound_smiles, (wt_pct, tolc_pct) in ((comp_a, pre), (comp_b, post)):
    print(compound_smiles)
    print('WT: {}%, tolC: {}%'.format(wt_pct, tolc_pct))
CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1
WT: 38.8%, tolC: 91.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%

Physicochemical properties of substrate-to-evader transforms:

sub_and_evade_logd['Class'] = sub_and_evade_om_corrected['Class']
sub_and_evade_logd.columns
Index(['Index', 'SMILES', 'logS', 'logS @ pH7.4', 'logD', '2C9 pKi', 'logP',
       'MW', 'HBD', 'HBA', 'TPSA', 'Flexibility', 'Rotatable Bonds', 'mol',
       'Class'],
      dtype='object')
feat='Rotatable Bonds'
sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Substrate'][feat].mean(), sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Evader'][feat].mean()
(5.730560578661844, 4.859459459459459)
sub_and_evade_logd
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
Index SMILES logS logS @ pH7.4 logD 2C9 pKi logP MW HBD HBA TPSA Flexibility Rotatable Bonds mol Class
0 0 OB1OCc2ccccc21 5.188 2.2370 0.07439 4.217 0.07439 133.9 1 2 29.46 0.00000 0 <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... Efflux Evader
1 1 BrC(/C=N/Nc1nc(N2CCOCC2)nc(N2CCOCC2)n1)=C/c1cc... 2.053 0.4994 2.27200 5.529 2.78000 474.4 1 9 88.00 0.18180 6 <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... Efflux Evader
2 2 Clc1ccc(C(=C2CN3CCC2CC3)c2ccc(Cl)s2)s1 1.303 0.8745 3.51100 5.096 4.87400 356.3 0 1 3.24 0.08333 2 <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... Efflux Evader
3 3 O=C(/C=C(\O)c1ccc(Br)cc1)C(F)(F)F 2.361 2.2380 1.63100 4.581 3.76600 295.1 1 2 37.30 0.18750 3 <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... Efflux Evader
4 4 O=C(CCl)C(=O)Nc1ccccc1 4.326 2.9250 1.00300 3.932 1.00300 197.6 1 3 46.17 0.30770 4 <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... Efflux Evader
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
735 735 c1ccc2c(c1)ccc1c2nc2ccccn21 1.606 1.6420 4.15400 4.902 4.15400 218.3 0 2 17.30 0.00000 0 <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... Efflux Substrate
736 736 O=C(CSc1ccc2ccccc2n1)N/N=C/c1ccc(O)cc1O 1.119 2.5010 2.21900 4.954 2.21900 353.4 3 6 94.81 0.22220 6 <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... Efflux Substrate
737 737 Cc1c2ccncc2c(C)c2c1[nH]c1ccccc12 1.294 0.9868 4.80000 5.346 4.80000 246.3 1 2 28.68 0.00000 0 <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... Efflux Substrate
738 738 Cc1cc(C)c(CSc2nnc(C)s2)c(C)c1 1.607 2.4660 3.86300 4.569 3.86300 264.4 0 2 25.78 0.16670 3 <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... Efflux Substrate
739 739 COc1cc([C@@H]2c3cc4c(cc3[C@@H](OC3OC5CO[C@@H](... 1.052 2.1080 1.28600 5.984 1.28600 656.7 3 13 160.80 0.11320 6 <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... Efflux Substrate

738 rows × 15 columns

sub_and_evade_logd = pd.read_csv('data_curated/sub_and_evade_PE.csv')
sub_and_evade_logd['mol'] = sub_and_evade_logd['SMILES'].apply(Chem.MolFromSmiles)
[09:35:17] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:35:17] Explicit valence for atom # 17 N, 5, is greater than permitted
# Discard rows with a missing 'mol' entry (SMILES that RDKit failed to parse).
# The explicit .copy() detaches the filtered frame from the original, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
# subset is given as a list, the form accepted by every pandas version.
sub_and_evade_logd = sub_and_evade_logd.dropna(subset=['mol']).copy()

# Canonicalise the remaining SMILES so identical structures share one string.
sub_and_evade_logd['SMILES'] = sub_and_evade_logd['SMILES'].apply(Chem.CanonSmiles)
# Compute one feature table per side of each transform: the A-side compounds
# (compound_structure_A) and the B-side compounds (compound_structure_B).
a_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_A')
b_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_B')

# Keep every column except the last 87.
# NOTE(review): presumably the trailing 87 columns are fragment-count features
# rather than physicochemical descriptors — confirm against
# calcualte_features_single before changing this offset.
a_features= a_features.iloc[:,:-87]
b_features= b_features.iloc[:,:-87]


# sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']
Computing features: 


100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 133.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1890.13it/s]


Computing features: 


100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 139.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1890.13it/s]
def get_change(current, previous):
    """Return the unsigned percentage change from ``previous`` to ``current``.

    The result is the size of the change relative to the magnitude of
    ``previous``, so it is never negative. Dividing by the signed baseline
    instead would flip the sign whenever ``previous`` is negative.

    Args:
        current: New value.
        previous: Baseline value the change is measured against.

    Returns:
        ``0`` when both values are equal, ``float('inf')`` when the baseline
        is zero and the values differ, otherwise
        ``abs(current - previous) / abs(previous) * 100``.
    """
    if current == previous:
        return 0
    # Explicit zero check: numpy floats (e.g. the result of Series.mean())
    # do not raise ZeroDivisionError, they emit a RuntimeWarning instead.
    if previous == 0:
        return float('inf')
    return (abs(current - previous) / abs(previous)) * 100.0
# Percentage change of every feature's mean, going from the A-side compounds
# (baseline) to the B-side compounds, in a_features column order.
rets = [
    get_change(b_features[column].mean(), a_features[column].mean())
    for column in a_features.columns
]

# Features with the smallest change first.
pd.DataFrame(rets, index=a_features.columns).sort_values(by=0).head(20)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
0
VSA_EState5 -163.547238
MinEStateIndex -45.150857
HallKierAlpha -14.961832
MinPartialCharge -12.158647
NumRadicalElectrons 0.000000
EState_VSA11 0.000000
SlogP_VSA9 0.000000
SMR_VSA8 0.000000
VSA_EState8 0.066408
SlogP_VSA1 0.426011
MaxEStateIndex 1.157330
MaxAbsEStateIndex 1.157330
FpDensityMorgan1 1.441856
NumValenceElectrons 1.888042
Chi1 1.999946
Chi0 2.082437
HeavyAtomCount 2.088773
BertzCT 2.155824
FpDensityMorgan2 2.326498
EState_VSA8 2.640954
pd.DataFrame(rets, index=a_features.columns).sort_values(by=0).tail(30)
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
0
PEOE_VSA7 20.914404
EState_VSA4 23.165149
SlogP_VSA2 24.599014
NumHDonors 24.615385
NumAliphaticHeterocycles 25.000000
NumSaturatedHeterocycles 25.000000
VSA_EState6 25.295822
NHOHCount 27.142857
MolLogP 29.078748
SlogP_VSA8 29.091388
SlogP_VSA12 30.354076
EState_VSA6 30.865566
SMR_VSA6 36.032338
VSA_EState10 38.687729
SMR_VSA4 38.929079
EState_VSA7 39.171792
NumAromaticCarbocycles 42.028986
PEOE_VSA6 42.891886
EState_VSA5 46.730088
EState_VSA1 55.387805
Ipc 58.987509
PEOE_VSA13 59.265545
SMR_VSA3 60.530420
SMR_VSA2 65.444545
NumAromaticHeterocycles 74.285714
SlogP_VSA7 77.553925
NumAliphaticCarbocycles 100.000000
NumSaturatedCarbocycles 100.000000
PEOE_VSA3 159.404918
PEOE_VSA11 171.605736
a_features.MolLogP.mean()
2.7694200000000015
b_features.MolLogP.mean()
1.9641073333333339
# Mean molecular weight of the A-side vs. B-side compounds.
# (The same two statements were pasted three times in a row; once is enough.)
feat = 'MolWt'
a_features[feat].mean(), b_features[feat].mean()
(365.2758000000001, 330.7681666666667)
feat='TPSA'
a_features[feat].mean(), b_features[feat].mean()
(73.45366666666665, 88.39083333333335)
feat='NumRotatableBonds'
a_features[feat].mean(), b_features[feat].mean()
(3.316666666666667, 3.4166666666666665)
feat='NumHAcceptors'
a_features[feat].mean(), b_features[feat].mean()
(4.383333333333334, 5.283333333333333)
feat='NumHDonors'
a_features[feat].mean(), b_features[feat].mean()
(1.0833333333333333, 1.35)
sns.histplot(a_features.MolLogP, color='r')
sns.histplot(b_features.MolLogP, color='b')
<Axes: xlabel='MolLogP', ylabel='Count'>

png

cluster 8

cluster_8 = pd.read_csv('data_curated/cluster_8.csv')
cluster_8
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
SMILES INHIB_AVE_wild INHIB_AVE_efflux Mol fps abs_diff sub_class wild_stds tolc_stds wild_class tolc_class Class mol
0 O=C(NC(=S)N1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C... 90.32 88.08 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -2.24 decrease 8.862059 4.772322 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
1 CCn1cc(C(=O)O)c(=O)c2cc([N+](=O)[O-])ccc21 92.33 83.35 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -8.98 decrease 9.068579 4.495245 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
2 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... 92.72 91.71 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -1.01 decrease 9.108650 4.984962 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
3 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)Nc4ccc(... 94.83 93.26 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -1.57 decrease 9.325446 5.075759 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
4 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... 59.56 88.04 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 28.48 increase 5.701576 4.769979 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
5 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=... 96.96 100.34 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 3.38 increase 9.544296 5.490497 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
6 CCOC(=O)c1cn(CC)c2cc(N3CCN(C)CC3)c(F)cc2c1=O 94.15 89.71 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -4.44 decrease 9.255578 4.867805 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
7 Cc1c(NC(=O)c2cn3c4c(c(N5CCN(C)CC5)c(F)cc4c2=O)... 97.04 94.43 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -2.61 decrease 9.552515 5.144296 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
8 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 99.54 98.79 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -0.75 decrease 9.809382 5.399700 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
9 CCN1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC1 101.15 101.88 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 0.73 increase 9.974803 5.580708 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
10 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C... 100.16 100.18 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 0.02 increase 9.873084 5.481124 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
11 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21 98.83 98.54 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -0.29 decrease 9.736432 5.385055 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
12 CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 100.81 101.30 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 0.49 increase 9.939870 5.546732 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
13 CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)Nc4ccc... 74.97 93.00 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 18.03 increase 7.284900 5.060529 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
14 CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn4c3c2SCC4)CC1.Cl 101.07 101.69 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 0.62 increase 9.966584 5.569578 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
15 COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC... 99.27 98.34 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -0.93 decrease 9.781640 5.373339 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
16 COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(... 99.99 99.89 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -0.10 decrease 9.855617 5.464136 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
17 C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)... 99.45 98.37 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -1.08 decrease 9.800134 5.375097 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
18 C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)... 100.58 100.90 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 0.32 increase 9.916238 5.523301 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
19 C[C@H]1COc2c(C3(N)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 98.12 97.94 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -0.18 decrease 9.663482 5.349908 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
20 Cl.O=C(Nc1ccc(-c2n[nH]c(=S)o2)cc1)c1cn(C2CC2)c... 90.63 81.87 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -8.76 decrease 8.893910 4.408548 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
21 COc1c(N2CC3CCCNC3C2)c(F)cc2c(=O)c(C(=O)Nc3ccc(... 85.55 90.95 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 5.40 increase 8.371958 4.940443 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
22 Cc1ccc(S(=O)(=O)O)cc1.NC1CCN(c2nc3c(cc2F)c(=O)... 94.82 90.03 <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... -4.79 decrease 9.324418 4.886550 active active Efflux Evader <rdkit.Chem.rdchem.Mol object at 0x000002726D8...
23 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C... 9.66 97.46 <rdkit.Chem.rdchem.Mol object at 0x00000272495... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 87.80 increase 0.574526 5.321790 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
24 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... 2.34 93.17 <rdkit.Chem.rdchem.Mol object at 0x00000272283... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 90.83 increase -0.177578 5.070487 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
25 CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 -2.02 81.37 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 83.39 increase -0.625553 4.379259 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
26 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... -3.27 97.79 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 101.06 increase -0.753986 5.341121 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
27 CCOc1cccc(C(=O)NC(=S)N2CCN(c3ncc4c(=O)c(C(=O)O... -5.55 88.93 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 94.48 increase -0.988248 4.822114 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
28 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... 6.81 97.95 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 91.14 increase 0.281699 5.350493 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
29 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)... -0.57 80.90 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 81.47 increase -0.476571 4.351727 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
30 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C... 1.49 103.44 <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 101.95 increase -0.264913 5.672090 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
31 CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C... 0.73 97.25 <rdkit.Chem.rdchem.Mol object at 0x00000271B69... <rdkit.DataStructs.cDataStructs.ExplicitBitVec... 96.52 increase -0.343000 5.309488 inactive active Efflux Substrate <rdkit.Chem.rdchem.Mol object at 0x000002724F6...
# Restrict the transform table to rows whose compound B appears in cluster_8's SMILES column,
# then run the substructure-fraction helper on that subset (it returns three tables).
_compound_b_in_cluster_8 = main_transforms['compound_structure_B'].isin(cluster_8.SMILES)
small_set = main_transforms.loc[_compound_b_in_cluster_8]
(
    small_set_diff,
    small_set_left,
    small_set_right,
) = calculate_fractions_mk7_new_smarts(small_set)
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
# Column-wise totals over every column except the last four (presumably non-feature
# metadata columns — confirm), sorted high-to-low; tail(20) shows the 20 lowest totals.
small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Dinitrobenzene_3                                         0
Dipeptide                                                0
Disulfide                                                0
Disulfides                                               0
Disulphide                                               0
Dithiocarbamate                                          0
Dithiole-2-thione                                        0
Dithiole-3-thione                                        0
Dithiomethylene_acetal                                   0
Dye 1 (1)                                                0
Dye 11                                                   0
Dye 16 (1)                                               0
E3 - e.g., carbonates                                    0
Nitrogen atoms (2)                                      -1
Adamantyl                                               -1
Primary or secondary amine, not amide.                  -1
Acyclic N-,=N and not N bound to carbonyl or sulfone    -2
N5EXC                                                   -2
N4EXC                                                   -2
Oxygen-nitrogen single bond                             -2
dtype: object
# Same column-wise totals (all columns except the last four), sorted high-to-low;
# head(20) shows the 20 highest totals.
small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)
B9                                 2
N9                                 2
sp2 hybridized carbon atoms (4)    2
phenylpiperazine                   2
sp3 hybridized carbon atoms (2)    2
16 - Tertiary amine                2
NUC                                2
Nitrogen atoms (4)                 2
Sulphates                          1
B2 - secondary amine               1
S/PO3 groups                       1
5 - Alcohol                        1
41 - Acrylate                      1
B3 - tertiary amine                1
sp3 hybridized carbon atoms (9)    1
N2 - secondary amines              1
Ester                              1
sp2 hybridized carbon atoms (8)    1
ELEC                               1
Nitrogen atoms (1)                 1
dtype: object

Triple Transforms

# Identifier and measurement columns are removed from both transform tables before joining,
# so the joined table only carries structures, SMIRKS and fragments.
_id_and_measurement_columns = [
    'idsmiles_A',
    'idsmiles_B',
    'measurement_A',
    'measurement_B',
    'measurement_delta',
]
evader_transforms = evader_transforms.drop(columns=_id_and_measurement_columns)
substrate_transforms = substrate_transforms.drop(columns=_id_and_measurement_columns)

# Join evader and substrate transforms that share the same starting compound, the same
# left-hand-side fragment and the same common core; the remaining clashing columns get
# an '_evader' / '_substrate' suffix.
comp_a_lhs_overlap = pd.merge(
    evader_transforms,
    substrate_transforms,
    on=['compound_structure_A', 'LHS', 'common_core'],
    suffixes=['_evader', '_substrate'],
)
len(comp_a_lhs_overlap)
125
len(comp_a_lhs_overlap.compound_structure_A.unique())
52
len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())
23
len(comp_a_lhs_overlap.compound_structure_B_evader.unique())
15
comp_a_lhs_overlap
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B_evader smirks_evader common_core LHS RHS_evader compound_structure_B_substrate smirks_substrate RHS_substrate
0 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1 [*:1]c1c(C)cccc1O>>[*:1]c1ccc(F)cc1 [*:1]c1ccc(F)cc1
1 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cc(CC)ccc1O [*:1]c1cc(CC)ccc1O
2 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl [*:1]c1c(C)cccc1O>>[*:1]c1cccc(Cl)c1Cl [*:1]c1cccc(Cl)c1Cl
3 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cccc(F)c1 [*:1]c1cccc(F)c1
4 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cccc(OC)c1 [*:1]c1cccc(OC)c1
... ... ... ... ... ... ... ... ... ...
120 CCCn1ccc(=N)cc1.I Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCC>>[*:1]CCCCCCC [*:1]CCCCCCC
121 CCCCn1ccc(=N)cc1.I Br.CCCCCCCCn1ccc(=N)cc1 [*:1]CCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCC [*:1]CCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
122 CCCCn1ccc(=N)cc1.I Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
123 Br.CCCCCCn1ccc(=N)cc1 Br.CCCCCCCCn1ccc(=N)cc1 [*:1]CCCCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCCCC [*:1]CCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
124 Br.CCCCCCn1ccc(=N)cc1 Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCCCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCCCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]CCCCCCC

125 rows × 9 columns

# NOTE(review): the mask tests compound_structure_B_evader against its own unique values,
# so it keeps every row (the output is the same 125 rows as the unfiltered table).
# Presumably a different set of compounds was meant on the right-hand side — confirm.
comp_a_lhs_overlap[comp_a_lhs_overlap.compound_structure_B_evader.isin(comp_a_lhs_overlap.compound_structure_B_evader.unique())]
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B_evader smirks_evader common_core LHS RHS_evader compound_structure_B_substrate smirks_substrate RHS_substrate
0 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1 [*:1]c1c(C)cccc1O>>[*:1]c1ccc(F)cc1 [*:1]c1ccc(F)cc1
1 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cc(CC)ccc1O [*:1]c1cc(CC)ccc1O
2 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl [*:1]c1c(C)cccc1O>>[*:1]c1cccc(Cl)c1Cl [*:1]c1cccc(Cl)c1Cl
3 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cccc(F)c1 [*:1]c1cccc(F)c1
4 Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O Oc1cccnc1/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O [*:1]/N=C/c1cc(I)cc(I)c1O [*:1]c1c(C)cccc1O [*:1]c1ncccc1O COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1 [*:1]c1c(C)cccc1O>>[*:1]c1cccc(OC)c1 [*:1]c1cccc(OC)c1
... ... ... ... ... ... ... ... ... ...
120 CCCn1ccc(=N)cc1.I Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCC>>[*:1]CCCCCCC [*:1]CCCCCCC
121 CCCCn1ccc(=N)cc1.I Br.CCCCCCCCn1ccc(=N)cc1 [*:1]CCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCC [*:1]CCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
122 CCCCn1ccc(=N)cc1.I Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
123 Br.CCCCCCn1ccc(=N)cc1 Br.CCCCCCCCn1ccc(=N)cc1 [*:1]CCCCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCCCC [*:1]CCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]CCCCCCC
124 Br.CCCCCCn1ccc(=N)cc1 Br.CCCCCCCCCCn1ccc(=N)cc1 [*:1]CCCCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCCCC [*:1]CCCCCCCCCC CCCCCCCn1ccc(=N)cc1.I [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]CCCCCCC

125 rows × 9 columns

# Collect four molecules and four legends per row of comp_a_lhs_overlap, in the order
# common core, LHS fragment, substrate RHS fragment, evader RHS fragment, and draw
# them as a grid with four panels per row.
mols = []
labels = []

_inhibition_columns = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']


def _wt_and_tolc_inhibition(smiles):
    """Return (INHIB_AVE_wild, INHIB_AVE_efflux) from the first e_coli_wild_efflux row matching *smiles*."""
    matches = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == smiles][_inhibition_columns].values
    return matches[0][0], matches[0][1]


for i, row in enumerate(comp_a_lhs_overlap.itertuples(index=False)):
    # Parse the fragments in the same order as before: core, LHS, evader RHS, substrate RHS.
    core_mol, lhs_mol, evader_rhs_mol, substrate_rhs_mol = (
        Chem.MolFromSmiles(smiles)
        for smiles in (row.common_core, row.LHS, row.RHS_evader, row.RHS_substrate)
    )

    # Inhibition values for the starting compound (labelled "Inactive") and for the two
    # compound-B structures reached from it.
    inactive_wt, inactive_tolc = _wt_and_tolc_inhibition(row.compound_structure_A)
    evader_wt, evader_tolc = _wt_and_tolc_inhibition(row.compound_structure_B_evader)
    substrate_wt, substrate_tolc = _wt_and_tolc_inhibition(row.compound_structure_B_substrate)

    row_legends = [
        'Common core no_{}'.format(i),
        'Inactive\n WT: {:.1f}%; tolC: {:.1f}%'.format(inactive_wt, inactive_tolc),
        'Substrate\n WT: {:.1f}%; tolC: {:.1f}%'.format(substrate_wt, substrate_tolc),
        'Evader\n WT: {:.1f}%; tolC: {:.1f}%'.format(evader_wt, evader_tolc),
    ]

    # Panel order must stay aligned with the legend order above.
    mols.extend([core_mol, lhs_mol, substrate_rhs_mol, evader_rhs_mol])
    labels.extend(row_legends)


img = Chem.Draw.MolsToGridImage(
    mols,
    molsPerRow=4,
    subImgSize=(250,250),
    legends=labels,
    useSVG=False,
    maxMols= 600,
    returnPNG=False,
)


# with open('master_transform_2' + '.svg', 'w') as f:
#     f.write(img.data)
[10:54:46] WARNING: not removing hydrogen atom with dummy atom neighbors
img

png

substrate_transforms
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B smirks common_core LHS RHS
2258 C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... [*:1]c1ccc(Br)cc1>>[*:1]c1ccccc1 [*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1 [*:1]c1ccc(Br)cc1 [*:1]c1ccccc1
2259 C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... [*:1]c1ccc(F)cc1>>[*:1]c1ccccc1 [*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1 [*:1]c1ccc(F)cc1 [*:1]c1ccccc1
3224 N#C/C(=C\c1c(F)cccc1Cl)c1nc2ccccc2[nH]1 Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 [*:1]c1c(F)cccc1Cl>>[*:1]c1ccccc1[N+](=O)[O-] [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 [*:1]c1c(F)cccc1Cl [*:1]c1ccccc1[N+](=O)[O-]
3245 N#C/C(=C\c1cc(Br)c(O)c(Br)c1O)c1nc2ccccc2[nH]1 Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 [*:1]c1cc(Br)c(O)c(Br)c1O>>[*:1]c1ccccc1[N+](=... [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 [*:1]c1cc(Br)c(O)c(Br)c1O [*:1]c1ccccc1[N+](=O)[O-]
3265 COc1c(Cl)cc(Cl)cc1/C=C(\C#N)c1nc2ccccc2[nH]1 Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 [*:1]c1cc(Cl)cc(Cl)c1OC>>[*:1]c1ccccc1[N+](=O)... [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 [*:1]c1cc(Cl)cc(Cl)c1OC [*:1]c1ccccc1[N+](=O)[O-]
... ... ... ... ... ... ...
1404497 CCOC(=O)Cn1ccc(=N)cc1.Cl CCCCCCCn1ccc(=N)cc1.I [*:1]CC(=O)OCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CC(=O)OCC [*:1]CCCCCCC
1404504 Br.CCn1ccc(=N)cc1 CCCCCCCn1ccc(=N)cc1.I [*:1]CC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CC [*:1]CCCCCCC
1404510 CCCn1ccc(=N)cc1.I CCCCCCCn1ccc(=N)cc1.I [*:1]CCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCC [*:1]CCCCCCC
1404515 CCCCn1ccc(=N)cc1.I CCCCCCCn1ccc(=N)cc1.I [*:1]CCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCC [*:1]CCCCCCC
1404519 Br.CCCCCCn1ccc(=N)cc1 CCCCCCCn1ccc(=N)cc1.I [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 [*:1]CCCCCC [*:1]CCCCCCC

4900 rows × 6 columns

# Pair each substrate transform with each evader transform that starts from the same
# compound A. Only 'compound_structure_A' is used as the join key, so the LHS fragment
# and common core may differ between the two sides; every other column shared by both
# tables is kept twice with a '_substrate' / '_evader' suffix.
# (The stricter merge on ['compound_structure_A', 'LHS', 'common_core'] that used to
# precede this line was overwritten immediately, so it has been removed as dead code.)
comp_a_lhs_overlap = substrate_transforms.merge(evader_transforms, on=['compound_structure_A'], suffixes=['_substrate', '_evader'])
len(comp_a_lhs_overlap.compound_structure_A.unique())
67
len(comp_a_lhs_overlap.compound_structure_B_evader.unique())
23
len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())
42
# Column-wise totals over every column except the last four, sorted high-to-low;
# head(50) shows the 50 highest totals.
substarte_to_evader_feats.iloc[:,:-4].sum().sort_values(ascending=False).head(50)
B7                                                      135
18 - Pyridine                                           135
sp2 hybridized carbon atoms (12)                        120
N5EXC                                                    59
sp3 hybridized carbon atoms (10)                         56
Alpha halo carbonyl                                      47
sp3 hybridized carbon atoms (7)                          46
Alkyl halide                                             46
15 - Secondary amine group                               41
5 - Alcohol                                              40
22 - CCl2                                                39
Enamine                                                  39
sp3 hybridized carbon atoms (12)                         37
4 - Aromatic carbon-alkane                               35
Nitrogen atoms (2)                                       22
1 - Alkane group                                         21
Nitrogen atoms (6)                                       20
33 - Bromo compounds                                     19
2 - Olefin group                                         17
I1 - Aliphatic methylene chains 7 or more long           12
Thiazolidinone                                           12
Dithiocarbamate                                          12
Thiocarbonyl group                                       12
ELEC                                                     12
Aromatic NO2                                             11
Nitrogen atoms (4)                                       11
Dye 16 (1)                                               11
27 - Aromatic nitro                                      11
Imines_(not_ring)                                        10
sp3 hybridized carbon atoms (5)                          10
Nitro group                                              10
Ketone                                                   10
E3 - e.g., carbonates                                     9
48 - CH2S                                                 9
Sulphur atom (3)                                          9
sp3 hybridized carbon atoms (4)                           9
9 - Carbonyl                                              9
Filter39_imine                                            8
Acyclic N-,=N and not N bound to carbonyl or sulfone      8
Vinyl_halide                                              8
Filter64_halo_ketone_sulfone                              8
Dye 25                                                    7
Filter41_12_dicarbonyl                                    7
Sulphur atom (5)                                          7
Alpha_halo_carbonyl                                       7
Oxalyl                                                    7
Stilbene                                                  7
Diketo group                                              7
Filter26_alkyl_halide                                     7
Beta halo carbonyl                                        7
dtype: int64
# Keep only the rows whose 'B7' value is positive, total every column except the last
# four, sort high-to-low and show the 20 lowest totals.
substarte_to_evader_feats[substarte_to_evader_feats['B7']>0].iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Nitrogen atoms (2)                   -1
B8EXC                                -1
sp2 hybridized carbon atoms (4)      -1
Oxygen-nitrogen single bond          -1
Dye 16 (1)                           -1
Nitrogen atoms (4)                   -1
Negatively charged atoms             -1
4 - Aromatic carbon-alkane           -7
sp3 hybridized carbon atoms (7)      -7
1 - Alkane group                     -7
sp3 hybridized carbon atoms (10)     -7
4-chlorobenzene                      -8
38 - Aromatic fluoro                -19
High halogen content (>3)           -22
25 - Aromatic chloro                -30
sp2 hybridized carbon atoms (8)     -54
13 - Ether                          -54
sp3 hybridized carbon atoms (6)     -54
sp3 hybridized carbon atoms (11)    -61
sp2 hybridized carbon atoms (7)    -117
dtype: int64
comp_a_lhs_overlap
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
compound_structure_A compound_structure_B_substrate idsmiles_A_substrate idsmiles_B_substrate smirks_substrate common_core_substrate measurement_A_substrate measurement_B_substrate measurement_delta_substrate LHS_substrate ... smirks_evader common_core_evader measurement_A_evader measurement_B_evader measurement_delta_evader LHS_evader RHS_evader mol_inactive mol_substrate mol_evader
0 O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 45889 45890 [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 2.60 56.00 53.40 [*:1]c1cc(Cl)cc(Cl)c1O ... [*:1]/N=C\c1cc([*:2])cc([*:3])c1O>>[*:2]C([*:3... [*:2]Cl.[*:3]Cl.[*:1]c1ccc([N+](=O)[O-])cc1 2.60 -0.18 -2.78 [*:1]/N=C\c1cc([*:2])cc([*:3])c1O [*:2]C([*:3])C(=O)NC(CO)C([*:1])O <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F42...
1 O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 45889 45890 [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 2.60 56.00 53.40 [*:1]c1cc(Cl)cc(Cl)c1O ... [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 2.60 -0.18 -2.78 [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3] [*:1]C([*:3])C(CO)NC(=O)[C@@H]([*:2])Cl <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
2 O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 45889 45890 [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 2.60 56.00 53.40 [*:1]c1cc(Cl)cc(Cl)c1O ... [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 2.60 -0.18 -2.78 [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3] [*:1]C([*:3])C(CO)NC(=O)[C@@H]([*:2])Cl <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
3 O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 45889 45890 [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 2.60 56.00 53.40 [*:1]c1cc(Cl)cc(Cl)c1O ... [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 2.60 -0.18 -2.78 [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3] [*:1]C([*:3])C(CO)NC(=O)[C@H]([*:2])Cl <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
4 O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 45889 45890 [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 2.60 56.00 53.40 [*:1]c1cc(Cl)cc(Cl)c1O ... [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 2.60 -0.18 -2.78 [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3] [*:1]C([*:3])C(CO)NC(=O)[C@H]([*:2])Cl <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
304 CCCn1ccc(=N)cc1.I CCCCCCCn1ccc(=N)cc1.I 28118 28233 [*:1]CCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 1.42 56.99 55.57 [*:1]CCC ... [*:1]CCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 1.42 -5.56 -6.98 [*:1]CCC [*:1]CCCCCCCCCC <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
305 CCCCn1ccc(=N)cc1.I CCCCCCCn1ccc(=N)cc1.I 28145 28233 [*:1]CCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 -14.52 56.99 71.51 [*:1]CCCC ... [*:1]CCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 -14.52 32.75 47.27 [*:1]CCCC [*:1]CCCCCCCC <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
306 CCCCn1ccc(=N)cc1.I CCCCCCCn1ccc(=N)cc1.I 28145 28233 [*:1]CCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 -14.52 56.99 71.51 [*:1]CCCC ... [*:1]CCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 -14.52 -5.56 8.96 [*:1]CCCC [*:1]CCCCCCCCCC <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
307 Br.CCCCCCn1ccc(=N)cc1 CCCCCCCn1ccc(=N)cc1.I 28228 28233 [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 13.72 56.99 43.27 [*:1]CCCCCC ... [*:1]CCCCCC>>[*:1]CCCCCCCC [*:1]n1ccc(=N)cc1 13.72 32.75 19.03 [*:1]CCCCCC [*:1]CCCCCCCC <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
308 Br.CCCCCCn1ccc(=N)cc1 CCCCCCCn1ccc(=N)cc1.I 28228 28233 [*:1]CCCCCC>>[*:1]CCCCCCC [*:1]n1ccc(=N)cc1 13.72 56.99 43.27 [*:1]CCCCCC ... [*:1]CCCCCC>>[*:1]CCCCCCCCCC [*:1]n1ccc(=N)cc1 13.72 -5.56 -19.28 [*:1]CCCCCC [*:1]CCCCCCCCCC <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... <rdkit.Chem.rdchem.Mol object at 0x000002A9F75...

309 rows × 24 columns

substarte_to_evader_feats
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
0 26 28 42 43 > 2 ester groups 1 - Alkane group 1,2-Dicarbonyl not in ring 10 - Aldehyde 11 - Acetate group ... Vinyl michael acceptor2 Vinyl_halide Vinyl_sulphone Primary amine, not amide Primary or secondary amine, not amide. tertiary aliphatic amine carboxylic acid Smiles smirks_evader smirks_substrate
0 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]/N=C\c1cc([*:2])cc([*:3])c1O>>[*:2]C([*:3... [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O
1 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O
2 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O
3 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O
4 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
304 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]CCC>>[*:1]CCCCCCCCCC [*:1]CCC>>[*:1]CCCCCCC
305 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]CCCC>>[*:1]CCCCCCCC [*:1]CCCC>>[*:1]CCCCCCC
306 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]CCCC>>[*:1]CCCCCCCCCC [*:1]CCCC>>[*:1]CCCCCCC
307 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]CCCCCC>>[*:1]CCCCCCCC [*:1]CCCCCC>>[*:1]CCCCCCC
308 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 [*:1]CCCCCC>>[*:1]CCCCCCCCCC [*:1]CCCCCC>>[*:1]CCCCCCC

309 rows × 761 columns

About

Support repository for the paper: Drug efflux in Gram-negative bacteria: How to turn an efflux pump substrate into an efflux-evader

Resources

Stars

Watchers

Forks

Packages

No packages published

Languages