Initial set up

E. coli efflux evaders and substrates - chemical space

# load conda environment

from master_functions import master_functions

# data process
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

#chem

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, Descriptors, Descriptors3D, Draw, rdMolDescriptors, Draw, PandasTools, rdFingerprintGenerator
from rdkit.DataManip.Metric.rdMetricMatrixCalc import GetTanimotoSimMat, GetTanimotoDistMat
# from rdkit.Chem.Draw import IPythonConsole

# import curated datasets

efflux_evaders_om_corrected = pd.read_pickle('data_curated/efflux_evaders_om_corrected.pkl')
efflux_substrates_om_corrected = pd.read_pickle('data_curated/efflux_substrates_om_corrected.pkl')
inactive = pd.read_pickle('data_curated/new_inactive.pkl') # this file is too big to upload to github, you can get your inactives from the inhibition file

Initial set up

Importing master dataset

# import master inhibition data
inhibition = pd.read_csv('data/CO-ADD_InhibitionData_r03_01-02-2020_CSV.csv', low_memory=False)
# this dataset can be downlaoded from: "https://www.co-add.org/"

# check strains avilable in organism == e. coli
inhibition[inhibition['ORGANISM'] == 'Escherichia coli'].STRAIN.value_counts()

ATCC 25922      82517
lpxC; MB4902    81058
tolC; MB5747    74177
Name: STRAIN, dtype: int64

# one compound has outlying values of -213.7 and -278.75 and -329.47 for WT,  tolC and lpxC respectivley, it skews data, I will drop it.

inhibition = inhibition[inhibition.SMILES != 'S(O)(=O)(=O)c1ccccc1\\C(\\c(cc(C)c(c2Br)O)c2)=C(\\C=C3C)/C=C(C3=O)Br']

# define subsets: 

e_coli_wild = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='ATCC 25922')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()

e_coli_efflux = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='tolC; MB5747')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()

e_coli_pore = inhibition[(inhibition['ORGANISM']=='Escherichia coli') & (inhibition['STRAIN']=='lpxC; MB4902')][['SMILES', 'INHIB_AVE']].groupby('SMILES').mean().reset_index()

# collect overlping data:

e_coli_wild_efflux = e_coli_wild[['SMILES', 'INHIB_AVE']].merge(e_coli_efflux[['SMILES', 'INHIB_AVE']],  on='SMILES', suffixes=('_wild', '_efflux'))
e_coli_wild_perm = e_coli_wild[['SMILES', 'INHIB_AVE']].merge(e_coli_pore[['SMILES', 'INHIB_AVE']], on='SMILES', suffixes=('_wild', '_lpxC'))

Plotting WT vs tolC

# e_coli_wild_efflux[['INHIB_AVE_wild', 'INHIB_AVE_efflux']].plot.hist(bins=200, alpha=0.5, figsize=[10,7])


sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)

fig, ax = plt.subplots(figsize=(7,7))


sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.9, rc=None)

sns.histplot(e_coli_wild_efflux[['INHIB_AVE_efflux', 'INHIB_AVE_wild']], alpha=0.5, bins=150)

plt.legend(labels = ['Wild Type', '$\Delta TolC$'],  fontsize=15)

plt.xlim([-120, 120])

plt.xlabel('Growth Inhibition based on $OD_{600}$ (%)', fontsize=22);
plt.ylabel('Number of Compounds',  fontsize=22);

plt.yticks(fontsize=20)
plt.xticks(fontsize=20)

plt.tight_layout()
sns.despine()

Paired t-test

# we can now compoute paired t-test to see if removing TolC made a significant difference or not:

stats.ttest_rel(e_coli_wild_efflux['INHIB_AVE_wild'], e_coli_wild_efflux['INHIB_AVE_efflux'])

Ttest_relResult(statistic=-44.099887587864416, pvalue=0.0)

Defining evaders and substartes

# calculate z-score:
e_coli_wild_efflux['wild_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_wild)
e_coli_wild_efflux['tolc_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_efflux)

# label each compounds according to threshold of 4

threshold = 4

def label_it(row):
    if row['wild_stds'] >=threshold:
        return 'active'
    if row['wild_stds'] <threshold:
        return 'inactive'
    
e_coli_wild_efflux['wild_class'] = e_coli_wild_efflux.apply(label_it, axis=1)

def label_it_tolc(row):
    if row['tolc_stds'] >=threshold:
        return 'active'
    if row['tolc_stds'] <threshold:
        return 'inactive'
    
    
e_coli_wild_efflux['tolc_class'] = e_coli_wild_efflux.apply(label_it_tolc, axis=1)

# label compounds based on combination of activity defined above

def label_substrate(row):
    if row['tolc_class'] == 'active' and row['wild_class'] == 'inactive':
        return 'Efflux Substrate'
    if row['tolc_class'] == 'active' and row['wild_class'] == 'active':
        return 'Efflux Evader'
    if row['tolc_class'] == 'inactive' and row['wild_class'] == 'inactive':
        return 'Inactive'
    if row['tolc_class'] == 'inactive' and row['wild_class'] == 'active':
        return 'WT-only Active'

Resulting Number of classes

# check the numbers of classified data

e_coli_wild_efflux['Class'] = e_coli_wild_efflux.apply(label_substrate, axis=1)
e_coli_wild_efflux.Class.value_counts()

Inactive            72730
Efflux Substrate      760
Efflux Evader         200
WT-only Active         53
Name: Class, dtype: int64

Scatter Plot

sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(7,7))

sns.scatterplot(data = e_coli_wild_efflux, x='INHIB_AVE_wild', y='INHIB_AVE_efflux', hue='Class', s=30)

sns.despine()

# plt.legend(fontsize=20)

# plt.xlim([-120, 120])

plt.xlabel('$\it{E. coli}$ WT Growth Inhibition (%)', font='Sans serif');

plt.ylabel('$\it{E. coli}$ $\it{tolC}$ Growth Inhibition (%)', font='Sans serif');

# plt.yticks(fontsize=20)
# plt.xticks(fontsize=20)

# plt.axvline(x=43.02,  color='red', linestyle='--', alpha=0.5)
# plt.axhline(y=74.98,  color='red', linestyle='--', alpha=0.5)

plt.axvline(x=e_coli_wild_efflux[e_coli_wild_efflux['wild_stds']>=4].sort_values(by='wild_stds').INHIB_AVE_wild.iloc[0],  color='red', linestyle='--', alpha=0.5)
plt.axhline(y=e_coli_wild_efflux[e_coli_wild_efflux['tolc_stds']>=4].sort_values(by='tolc_stds').INHIB_AVE_efflux.iloc[0],  color='red', linestyle='--', alpha=0.5)

plt.legend(fontsize=15)
plt.tight_layout()

plt.savefig('figures/wild_tolc_class_scatter.png', dpi=600)

# we can save those datasets seperately

efflux_substrate = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Substrate']

efflux_evader = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Evader']

wt_only = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='WT-only Active']

inactive = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Inactive']

Resulting evaders and substartes

print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evader), len(efflux_substrate)))

No. of resulting evaders: 200 
No. of resulting substrates: 760

OM Bias

# import permeating and non-permeating datapoints, they were achived using same process as described above

om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')
om_non_permeating = pd.read_pickle('data_curated/om_non_permeating.pkl')

# to compare the smiles between the two we fisr turn all smiles into same canonical smiles format

efflux_evader['SMILES'] = efflux_evader.SMILES.apply(Chem.CanonSmiles)
efflux_substrate['SMILES'] = efflux_substrate.SMILES.apply(Chem.CanonSmiles)

C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evader['SMILES'] = efflux_evader.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrate['SMILES'] = efflux_substrate.SMILES.apply(Chem.CanonSmiles)

# grab only evaders that are also in OM permeating class
efflux_evaders_om_corrected = efflux_evader[efflux_evader['SMILES'].isin(om_permeating['SMILES'])]

# grab only substrates that are not in non-permeating class
efflux_substrates_om_corrected = efflux_substrate[~efflux_substrate['SMILES'].isin(om_non_permeating['SMILES'])]

Resulting evaders and substartes

print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evaders_om_corrected), len(efflux_substrates_om_corrected)))

No. of resulting evaders: 186 
No. of resulting substrates: 554

Re-defining inactive mols

e_coli_wild_efflux['mol'] = e_coli_wild_efflux.SMILES.apply(Chem.MolFromSmiles)

[20:56:01] Explicit valence for atom # 2 C, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 C, 6, is greater than permitted

e_coli_wild_efflux = e_coli_wild_efflux.dropna()

e_coli_wild_efflux['SMILES'] = e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles)

C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1164120927.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  e_coli_wild_efflux['SMILES'] = e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles)

wt_only['mol'] = wt_only.SMILES.apply(Chem.MolFromSmiles)

wt_only = wt_only.dropna()

wt_only['SMILES'] = wt_only.SMILES.apply(Chem.CanonSmiles)

# Since efflux evaders and substartes have changed we must redifine inactive molecules, as:
#     Original dataset without evaders and substartes and wt-active only

not_inactive = pd.concat([efflux_evaders_om_corrected, efflux_substrates_om_corrected, wt_only])

inactive = e_coli_wild_efflux[~e_coli_wild_efflux['SMILES'].isin(not_inactive['SMILES'])]

inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)

inactive = inactive.dropna(subset=['mol'])
inactive.reset_index(drop=True, inplace=True)

inactive['SMILES'] = inactive.SMILES.apply(Chem.CanonSmiles)

C:\Users\domin\AppData\Local\Temp\ipykernel_23268\1771852805.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)

inactive.to_pickle('data_curated/new_inactive.pkl')

t-SNE of evaders vs substartes

# sample of what the dataset currently looks like
efflux_substrates_om_corrected.head(5)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	SMILES	INHIB_AVE_wild	INHIB_AVE_efflux	Mol	fps	abs_diff	sub_class	wild_stds	tolc_stds	wild_class	tolc_class	Class
145	Brc1cncc(-c2cc(NCCCn3ccnc3)nc(-c3ccccc3)n2)c1	4.60	80.47	<rdkit.Chem.rdchem.Mol object at 0x000002164E6...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	75.87	increase	0.054629	4.326538	inactive	active	Efflux Substrate
308	N#C/C(=N\Nc1cccc(C(F)(F)F)c1)C(N)=S	18.36	87.98	<rdkit.Chem.rdchem.Mol object at 0x000002164E6...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	69.62	increase	1.468421	4.766464	inactive	active	Efflux Substrate
403	CC(C)C(=O)/C(=C/c1ccc(Cl)cc1Cl)n1cncn1	5.84	97.31	<rdkit.Chem.rdchem.Mol object at 0x00000215D73...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	91.47	increase	0.182034	5.313003	inactive	active	Efflux Substrate
585	O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1cccc([N+](...	-3.58	88.80	<rdkit.Chem.rdchem.Mol object at 0x000002164E6...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	92.38	increase	-0.785838	4.814498	inactive	active	Efflux Substrate
589	O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1ccc(Cl)cc1	20.78	77.14	<rdkit.Chem.rdchem.Mol object at 0x000002164E6...	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...	56.36	increase	1.717067	4.131471	inactive	active	Efflux Substrate

# we need to compute fingerprints from SMILES for t-sne:

mfpgen =rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

efflux_evaders_om_corrected['mol'] = efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
efflux_evaders_om_corrected.dropna(subset=['mol'], inplace=True)

efflux_evaders_om_corrected['fps']=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint)

# substartes

efflux_substrates_om_corrected['mol'] = efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
efflux_substrates_om_corrected.dropna(subset=['mol'], inplace=True)

efflux_substrates_om_corrected['fps']=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint)

C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected['mol'] = efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_evaders_om_corrected['fps']=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected['mol'] = efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  efflux_substrates_om_corrected['fps']=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint)

# combine two datasets and reset index

sub_and_evade_om_corrected = pd.concat([efflux_evaders_om_corrected,efflux_substrates_om_corrected]).reset_index(drop=True)

def tsne_no_plot(df, perp):
    
    sample=df.values
    tanimoto_sim_mat_lower_triangle=GetTanimotoSimMat(sample) # similartity matrix 
    n_mol = len(sample)
    similarity_matrix = np.ones([n_mol,n_mol])
    i_lower= np.tril_indices(n=n_mol,m=n_mol,k=-1)
    i_upper= np.triu_indices(n=n_mol,m=n_mol,k=1)
    similarity_matrix[i_lower] = tanimoto_sim_mat_lower_triangle
    similarity_matrix[i_upper] = similarity_matrix.T[i_upper] 
    distance_matrix = np.subtract(1,similarity_matrix) 

    TSNE_sim = TSNE(verbose=1, n_components=2, init='pca', method='barnes_hut', perplexity=perp).fit_transform(distance_matrix) 

    tsne_result = pd.DataFrame(data = TSNE_sim , columns=["TC1","TC2"]) 
    return tsne_result

sub_and_evade_om_corrected_tsne = tsne_no_plot(sub_and_evade_om_corrected['fps'], perp=50)

fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne, s=30 ,alpha=0.9, hue=sub_and_evade_om_corrected['Class']) 
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))

sns.kdeplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne,alpha=0.7, hue=sub_and_evade_om_corrected['Class'], levels = 4)
# plt.legend(fontsize=20)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 740 samples in 0.001s...
[t-SNE] Computed neighbors for 740 samples in 0.126s...
[t-SNE] Computed conditional probabilities for sample 740 / 740
[t-SNE] Mean sigma: 0.709102
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.515961
[t-SNE] KL divergence after 1000 iterations: 0.638264





<Axes: xlabel='TC1', ylabel='TC2'>

t-SNE of evader + substrate + inactive

inactive_sample = inactive.sample(500, random_state= 42)

inactive_sample['mol'] = inactive_sample.SMILES.apply(Chem.MolFromSmiles)
inactive_sample.dropna(subset=['mol'], inplace=True)

inactive_sample['fps']=inactive_sample.mol.apply(mfpgen.GetFingerprint)

# add sample of inactive mols into the mix

sub_evade_inactive = pd.concat([sub_and_evade_om_corrected, inactive_sample])

sub_evade_inactive.reset_index(drop=True, inplace=True)

sub_evade_inactive_tsne = tsne_no_plot(sub_evade_inactive['fps'], perp=50)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1240 samples in 0.002s...
[t-SNE] Computed neighbors for 1240 samples in 0.083s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1240
[t-SNE] Computed conditional probabilities for sample 1240 / 1240
[t-SNE] Mean sigma: 0.725025
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.622444
[t-SNE] KL divergence after 1000 iterations: 0.954569

sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)

fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, s=20 ,alpha=0.5, hue=sub_evade_inactive['Class'], legend=False) 
sns.kdeplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, hue=sub_evade_inactive['Class'], levels = 2, linewidths=2)
sns.despine()

plt.savefig('tsne_all.svg')

We find some overlapping compounds

om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')

mfpgen =rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

om_permeating['mol'] = om_permeating.SMILES.apply(Chem.MolFromSmiles)
om_permeating.dropna(subset=['mol'], inplace=True)

om_permeating['fps']=om_permeating.mol.apply(mfpgen.GetFingerprint)

sub_evade_om_permeating = pd.concat([sub_and_evade_om_corrected, om_permeating])

sub_evade_om_permeating.reset_index(drop=True, inplace=True)

sub_evade_om_permeating_tsne = tsne_no_plot(sub_evade_om_permeating['fps'], perp=50)
fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne, s=30 ,alpha=0.9, hue=sub_evade_om_permeating['Class']) 
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))

sns.kdeplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne,alpha=0.7, hue=sub_evade_om_permeating['Class'], levels = 4)
# plt.legend(fontsize=20)

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 961 samples in 0.001s...
[t-SNE] Computed neighbors for 961 samples in 0.068s...
[t-SNE] Computed conditional probabilities for sample 961 / 961
[t-SNE] Mean sigma: 0.854897
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.729122
[t-SNE] KL divergence after 1000 iterations: 0.519036





<Axes: xlabel='TC1', ylabel='TC2'>

PCA of evaders, substrates and inactives

we'll sue same dataset as for t-sne

from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
from tqdm import trange, tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def calcualte_features_single(df, col):
    generator = MakeGenerator(("rdkit2d",))
    names=[name[0] for name in  generator.GetColumns()]
    
    l_feat=[]
    
    print('Computing features: ')
    
    for i in trange(len(df[col].values)):
        l_data = generator.process(df[col].values[i])
        
        if l_data[0]  == True:
            l_feat.append(l_data[1:])
        else:
            print('left: ', l_data[0])
            print(df[col].values[i])
    

    # add descriptors to existing dataframe 
    feats = pd.DataFrame()
    
    for i in trange(len(l_feat)):
        feats = feats.append(pd.Series(l_feat[i]), ignore_index=True)
    feats.columns = names[1:]

    return feats

sub_evade_inactive_features = calcualte_features_single(sub_evade_inactive, 'SMILES')
sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']

Computing features: 


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1240/1240 [00:13<00:00, 90.43it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1240/1240 [00:00<00:00, 1435.85it/s]

sub_evade_inactive_features

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	BalabanJ	BertzCT	Chi0	Chi0n	Chi0v	Chi1	Chi1n	Chi1v	Chi2n	Chi2v	...	fr_sulfone	fr_term_acetylene	fr_tetrazole	fr_thiazole	fr_thiocyan	fr_thiophene	fr_unbrch_alkane	fr_urea	qed	Class
0	2.508772	249.116352	6.974691	5.449320	5.449320	4.877010	3.252155	3.252155	2.362178	2.362178	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.501865	Efflux Evader
1	1.508609	845.728650	20.597801	16.576049	18.162045	14.775990	9.905963	10.698961	6.767766	7.683442	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.504707	Efflux Evader
2	0.000001	653.569301	14.396977	11.850173	15.811520	10.203510	7.173237	9.562159	5.658176	9.088344	...	0.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.599582	Efflux Evader
3	2.939539	420.685437	12.344935	7.754071	9.340068	7.303549	4.082377	4.875376	2.898481	3.814156	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.669689	Efflux Evader
4	2.603746	310.650557	9.681798	6.788319	7.544248	6.236382	3.689747	4.224269	2.376957	2.644218	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.588792	Efflux Evader
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1235	1.003357	1984.841727	34.329487	27.979443	27.979443	23.749555	16.372378	16.372378	12.307394	12.307394	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.174004	Inactive
1236	1.749666	1383.833437	21.957455	16.503270	17.259199	15.011570	9.340691	9.718655	6.803797	7.210998	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.427471	Inactive
1237	1.531621	1346.959571	25.070339	20.361266	21.947263	17.546045	12.317981	13.110979	9.033422	9.887759	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.302495	Inactive
1238	1.868993	1028.780943	15.648054	12.477331	12.477331	11.326500	7.553489	7.553489	5.475973	5.475973	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.490238	Inactive
1239	2.184490	517.236837	13.120956	10.329726	11.146223	8.592224	5.624243	6.503896	4.230048	5.177742	...	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.860297	Inactive

1240 rows Ã— 201 columns

# drop feature columns that contain Nans, its only 4 columns

sub_evade_inactive_features=sub_evade_inactive_features.dropna(axis=1)

# pca on all physcicochemical features:

table = sub_evade_inactive_features

#The non-redundant molecular descriptors chosen for PCA

descriptors  = table.iloc[:,:-87] # grab only physicochemical values

descriptors_std = StandardScaler().fit_transform(descriptors) #Important to avoid scaling problems between our different descriptors
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d) # Saving PCA values to a new table
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]
descriptors_pca.head(5) #Displays the PCA table

scale1 = 1.0/(max(descriptors_pca['PC1']) - min(descriptors_pca['PC1'])) 
scale2 = 1.0/(max(descriptors_pca['PC2']) - min(descriptors_pca['PC2']))

# And we add the new values to our PCA table
descriptors_pca['PC1_normalized']=[i*scale1 for i in descriptors_pca['PC1']]
descriptors_pca['PC2_normalized']=[i*scale2 for i in descriptors_pca['PC2']]


descriptors_pca['Class'] = sub_evade_inactive_features['Class']


# plt.rcParams['axes.linewidth'] = 1.5


cmap = sns.diverging_palette(133, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10,5))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])


pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')

plt.tick_params ('both',width=2,labelsize=14)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate.png', dpi=600)

plt.show()

print('same but in contours, for ease of read')

cmap = sns.diverging_palette(133, 10, as_cmap=True)


############ kdeplot


fig, ax = plt.subplots(figsize=(10,7))

sns.set_style("ticks")

# sns.set(font_scale=2)

sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)


pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)

plt.tick_params ('both',width=2,labelsize=20)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

# plt.legend()

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)

same but in contours, for ease of read

explained variance is too low, I will chose only 7 mian features for PCA next:

# pca on only 8 main physcicochemical features:

table = sub_evade_inactive_features

#The non-redundant molecular descriptors chosen for PCA

descriptors = table[['MolWt', 'MolLogP', 'RingCount','FractionCSP3', 'TPSA','NumHAcceptors', 'NumHDonors', 'NumRotatableBonds' ]].values

# descriptors  = table.iloc[:,:-87]

descriptors_std = StandardScaler().fit_transform(descriptors) 
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d)
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]


scale1 = 1.0/(max(descriptors_pca['PC1']) - min(descriptors_pca['PC1'])) 
scale2 = 1.0/(max(descriptors_pca['PC2']) - min(descriptors_pca['PC2']))

# And we add the new values to our PCA table
descriptors_pca['PC1_normalized']=[i*scale1 for i in descriptors_pca['PC1']]
descriptors_pca['PC2_normalized']=[i*scale2 for i in descriptors_pca['PC2']]


descriptors_pca['Class'] = sub_evade_inactive_features['Class']


# plt.rcParams['axes.linewidth'] = 1.5


cmap = sns.diverging_palette(133, 10, as_cmap=True)

fig, ax = plt.subplots(figsize=(10,5))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])


pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')

plt.tick_params ('both',width=2,labelsize=14)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate.png', dpi=600)

plt.show()

print('same but in contours, for ease of read')

cmap = sns.diverging_palette(133, 10, as_cmap=True)


############ kdeplot


fig, ax = plt.subplots(figsize=(10,7))

sns.set_style("ticks")

# sns.set(font_scale=2)

sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)


pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)

plt.tick_params ('both',width=2,labelsize=20)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

handles, labels = ax.get_legend_handles_labels()

#ax.legend(handles=handles[1:], labels=labels[1:])

#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)

# plt.legend()

plt.tight_layout()

# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)

same but in contours, for ease of read

sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(8,8))

sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.5, hue='Class', s=20)

pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))


plt.xlabel(pca_lab[0])
plt.ylabel(pca_lab[1])


sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=2, linewidths=2)

pca_lab= ('PC1 - '+str(np.round(pca.explained_variance_ratio_[0]*100, 1)) + '%', 'PC2 - '+str(np.round(pca.explained_variance_ratio_[1]*100, 1)) + '%')
plt.xlabel(pca_lab[0])
plt.ylabel(pca_lab[1])

sns.despine()

# plt.savefig('pca_all.svg')

similar result where exlpained variance is about 70% but classes are still not seperated at all

MMPA

To carry out mmpa I used modified mmpdb : https://github.com/rdkit/mmpdb

publication : https://doi.org/10.1021/acs.jcim.8b00173

# import results from MMPA: 

efflux_mmpa_index = pd.read_pickle('data_curated/efflux_mmpa_index.pkl')

# it contains 1.4M pairs

Evader Transforms

def split_transition(df, col):
    df['LHS'] = [re.split('>>',df[col].values[i])[0] for i in range(len(df)) ]
    df['RHS'] = [re.split('>>',df[col].values[i])[1] for i in range(len(df)) ]
    return df

def mols_to_NHA(mol):
    return Chem.MolFromSmarts(mol).GetNumHeavyAtoms()

def clean_mmpa_pairs_len(mmpa_df):
    temp=pd.DataFrame() # temp dataframe
    if 'LHS' not in mmpa_df.columns: # add LHS and RHS if not present
        mmpa_df = split_transition(mmpa_df, 'smirks')     # produce LHS and RHS
    else:
        temp['common_core_HA'] = mmpa_df['common_core'].apply(mols_to_NHA) # produce number of heavy atoms
        temp['LHS_HA'] = mmpa_df['LHS'].apply(mols_to_NHA)
        temp['RHS_HA'] = mmpa_df['LHS'].apply(mols_to_NHA)
        
        temp['len_check'] = np.where((temp['LHS_HA'] >= temp['common_core_HA']) & (temp['RHS_HA'] >= temp['common_core_HA'])
                     , 'fail', 'pass') # compare lengths of heavy atoms
        
        mmpa_df = mmpa_df.drop(temp[temp['len_check']=='fail'].index) # drop index that failed length check
        
        print('Initial number of transofrms: {} \nNumber fo transforms disqualified based on length discrepancy: {} \nRemaining number of transforms: {}'.format(len(temp[temp['len_check']=='fail']) +  len(mmpa_df) , len(temp[temp['len_check']=='fail']), len(mmpa_df)))
        # return temp to debug
    return mmpa_df

# find evader transforms:

evader_transforms = efflux_mmpa_index[(efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)) & (efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES))]

evader_transforms = clean_mmpa_pairs_len(evader_transforms)

Initial number of transofrms: 2468 
Number fo transforms disqualified based on length discrepancy: 1856 
Remaining number of transforms: 612

len(evader_transforms.compound_structure_B.unique())

len(evader_transforms.compound_structure_A.unique())

Substrate Transforms

substrate_transforms = efflux_mmpa_index[(efflux_mmpa_index['compound_structure_B'].isin(efflux_substrates_om_corrected.SMILES)) & (efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES)) ]

substrate_transforms = clean_mmpa_pairs_len(substrate_transforms)

Initial number of transofrms: 6827 
Number fo transforms disqualified based on length discrepancy: 1927 
Remaining number of transforms: 4900

len(substrate_transforms.compound_structure_A.unique())

len(substrate_transforms.compound_structure_B.unique())

Transforming substarte into evaders

def calculate_fractions_mk7_new_smarts_spec(df, smirks, measurement_delta, measurement_A, measurement_B):
    
    mol_substructures, name_substructure = new_smarts()

    name_substructure = name_substructure + ['smirks', 'measurement' ,'target']
    
    smirks=smirks
    measurement_delta=measurement_delta
    measurement_A = measurement_A
    measurement_B = measurement_B
    
    # Comapre left hand side

    #     frame_left=pd.DataFrame(columns=name_substructure)

    frame_left=[]
    frame_right=[]
    

    print('Calcualting LHS+RHS matches')

    #for index in enumerate(df.LHS.values)):

    for index in range(len(df)):  

        #grab structure
        frame_temp_left=pd.DataFrame(0, index=range(1), columns=name_substructure)
        frame_temp_right=pd.DataFrame(0, index=range(1), columns=name_substructure)

        frame_temp_left['smirks'] = df[smirks].values[index]
        frame_temp_left['target'] = df[measurement_delta].values[index]    
        frame_temp_left['measurement'] = df[measurement_A].values[index]
        
        
        for sub_nr, sub in enumerate(mol_substructures):
            if df['mol_a'].iloc[index].HasSubstructMatch(sub):
                frame_temp_left[name_substructure[sub_nr]] = [1]
            
#             if mol_target_left.HasSubstructMatch(sub):
#                 frame_temp_left[name_substructure[sub_nr]] = [1]

        frame_temp_right['smirks'] = df[smirks].values[index]
        frame_temp_right['target'] = df[measurement_delta].values[index]    
        frame_temp_right['measurement'] = df[measurement_B].values[index]
        
        for sub_nr, sub in enumerate(mol_substructures):
            if df['mol_b'].iloc[index].HasSubstructMatch(sub):
                frame_temp_right[name_substructure[sub_nr]] = [1]

        frame_left.append(frame_temp_left.values)
        frame_right.append(frame_temp_right.values)

    frame_left_df = pd.DataFrame(np.concatenate(frame_left), columns = name_substructure)
    # compare right hand side
    frame_right_df = pd.DataFrame(np.concatenate(frame_right), columns = name_substructure)

    diff = frame_right_df.iloc[:,:-3] - frame_left_df.iloc[:,:-3] 

    diff['smirks'] = frame_right_df['smirks']
    diff['measurement_A'] = frame_left_df['measurement']
    diff['measurement_B'] = frame_right_df['measurement']
    diff['target'] = frame_right_df['target']

    return diff.reset_index(drop=True), frame_left_df.reset_index(drop=True), frame_right_df.reset_index(drop=True)


def new_smarts():
#     print(os.getcwd())
    func_groups=pd.read_csv('ml_mmpa/fg_smarts_2.csv')
    
        #fetch all substructure definitions and calculate mosl for them
    print('Generating molecular objects from pre-defined substructures')
    mol_substructures=[]
    for substructure in func_groups.SMARTS:
        mol_substructures.append(Chem.MolFromSmarts(substructure))

    return mol_substructures,  func_groups.name.to_list()

def calculate_fractions_mk7_new_smarts(df):
    
    mol_substructures, name_substructure = new_smarts()

    name_substructure = name_substructure + ['smirks', 'measurement' ,'target']

    # Comapre left hand side

    #     frame_left=pd.DataFrame(columns=name_substructure)

    frame_left=[]
    frame_right=[]

    print('Calcualting LHS+RHS matches')

    #for index in enumerate(df.LHS.values)):

    for index in range(len(df)):  

        #grab structure
        frame_temp_left=pd.DataFrame(0, index=range(1), columns=name_substructure)
        frame_temp_right=pd.DataFrame(0, index=range(1), columns=name_substructure)

        frame_temp_left['smirks'] = df.smirks.values[index]
        frame_temp_left['target'] = df.measurement_delta.values[index]    
        frame_temp_left['measurement'] = df.measurement_A.values[index]
        
        
        for sub_nr, sub in enumerate(mol_substructures):
            if df['mol_a'].iloc[index].HasSubstructMatch(sub):
                frame_temp_left[name_substructure[sub_nr]] = [1]
            
#             if mol_target_left.HasSubstructMatch(sub):
#                 frame_temp_left[name_substructure[sub_nr]] = [1]

        frame_temp_right['smirks'] = df.smirks.values[index]
        frame_temp_right['target'] = df.measurement_delta.values[index]    
        frame_temp_right['measurement'] = df.measurement_B.values[index]
        
        for sub_nr, sub in enumerate(mol_substructures):
            if df['mol_b'].iloc[index].HasSubstructMatch(sub):
                frame_temp_right[name_substructure[sub_nr]] = [1]

        frame_left.append(frame_temp_left.values)
        frame_right.append(frame_temp_right.values)

    frame_left_df = pd.DataFrame(np.concatenate(frame_left), columns = name_substructure)
    # compare right hand side
    frame_right_df = pd.DataFrame(np.concatenate(frame_right), columns = name_substructure)

    diff = frame_right_df.iloc[:,:-3] - frame_left_df.iloc[:,:-3] 

    diff['smirks'] = frame_right_df['smirks']
    diff['measurement_A'] = frame_left_df['measurement']
    diff['measurement_B'] = frame_right_df['measurement']
    diff['target'] = frame_right_df['target']

    return diff.reset_index(drop=True), frame_left_df.reset_index(drop=True), frame_right_df.reset_index(drop=True)


def new_smarts():
#     print(os.getcwd())
    func_groups=pd.read_csv('ml_mmpa/fg_smarts_2.csv')
    
        #fetch all substructure definitions and calculate mosl for them
    print('Generating molecular objects from pre-defined substructures')
    mol_substructures=[]
    for substructure in func_groups.SMARTS:
        mol_substructures.append(Chem.MolFromSmarts(substructure))

    return mol_substructures,  func_groups.name.to_list()

# find substrate to evader transforms:

sub_to_evader_transforms = efflux_mmpa_index[(efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)) & (efflux_mmpa_index['compound_structure_A'].isin(efflux_substrates_om_corrected.SMILES))]

len(sub_to_evader_transforms), len(sub_to_evader_transforms.compound_structure_A.unique()), len(sub_to_evader_transforms.compound_structure_B.unique())

(60, 26, 24)

sub_to_evader_transforms[sub_to_evader_transforms['compound_structure_B']=='O=[N+]([O-])c1ccc2no[n+]([O-])c2c1']

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B	idsmiles_A	idsmiles_B	smirks	common_core	measurement_A	measurement_B	measurement_delta	LHS	RHS	mol_a	mol_b
1037285	O=Cc1cc([N+](=O)[O-])cc(I)c1O	O=[N+]([O-])c1ccc2no[n+]([O-])c2c1	43577	47709	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc2no[n+]([O...	[*:1][N+](=O)[O-]	55.67	-1.98	-57.65	[*:1]c1cc(I)c(O)c(C=O)c1	[*:1]c1ccc2no[n+]([O-])c2c1	<rdkit.Chem.rdchem.Mol object at 0x000002AA5A4...	<rdkit.Chem.rdchem.Mol object at 0x000002AA5A2...
1038977	Cn1nc([N+](=O)[O-])c[n+]1[O-]	O=[N+]([O-])c1ccc2no[n+]([O-])c2c1	47632	47709	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1ccc2no[n+]([O...	[*:1][N+](=O)[O-]	42.16	-1.98	-44.14	[*:1]c1c[n+]([O-])n(C)n1	[*:1]c1ccc2no[n+]([O-])c2c1	<rdkit.Chem.rdchem.Mol object at 0x000002AA5A4...	<rdkit.Chem.rdchem.Mol object at 0x000002AA5A2...

new_df = sub_to_evader_transforms.groupby(['compound_structure_A', 'compound_structure_B']).size().reset_index(name='Freq')

new_df.drop_duplicates(subset=['compound_structure_B'])

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B	Freq
0	CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1	CC(=O)Cn1nnc([N+](=O)[O-])n1	4
1	CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1	Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-]	1
2	CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1	O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-]	1
3	CCCCCCCn1ccc(=N)cc1.I	Br.CCCCCCCCCCn1ccc(=N)cc1	1
4	CCCCCCCn1ccc(=N)cc1.I	Br.CCCCCCCCn1ccc(=N)cc1	1
5	CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	3
6	CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O...	CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1	3
8	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C...	1
9	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21	1
10	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C...	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c...	1
11	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)...	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=...	1
12	COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)...	COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)...	2
15	Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1	Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1	1
16	Cn1cnc([N+](=O)[O-])c1Oc1ccccc1	Cn1cnc([N+](=O)[O-])c1S(=O)CC#N	1
17	Cn1cnc([N+](=O)[O-])c1Oc1ccccc1	Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C	1
18	Cn1nc([N+](=O)[O-])c[n+]1[O-]	C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-]	1
22	Cn1nc([N+](=O)[O-])c[n+]1[O-]	Nc1nonc1[N+](=O)[O-]	1
23	Cn1nc([N+](=O)[O-])c[n+]1[O-]	O=C(O)/C=C/c1ccc([N+](=O)[O-])o1	1
24	Cn1nc([N+](=O)[O-])c[n+]1[O-]	O=[N+]([O-])c1ccc2no[n+]([O-])c2c1	1
25	N#Cc1c(Cl)nc(NN)c(Cl)c1Cl	N#Cc1nc(Cl)c2sc(=O)sc2c1Cl	1
26	N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl	O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F	2
33	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl	2
34	Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1	O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1	1
35	Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1	O=C(CCl)C(=O)Nc1ccccc1	1

len(sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']))

e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == 'N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]

array([21.71, 90.83])

e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == 'O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]

array([48.74, 93.  ])

sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']).iloc[0].compound_structure_A

'N/C(=C\\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'

for i in range(24):
    
    trans = sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B'])
    # smiles:

    comp_a = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_A.values[0]

    comp_b = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_B.values[0]

    


    # wt and efflux pre
    pre =  e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == trans.iloc[i].compound_structure_A][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
    # wt and efflux post
    post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == trans.iloc[i].compound_structure_B][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
    print(i+1)

    print(trans.iloc[i].compound_structure_A)

    print('WT: {}%, tolC: {}%'.format(pre[0], pre[1]))

    print(trans.iloc[i].compound_structure_B)

    print('WT: {}%, tolC: {}%'.format(post[0], post[1]))

1
N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl
WT: 21.71%, tolC: 90.83%
O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F
WT: 48.74%, tolC: 93.0%
2
Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1
WT: 39.12%, tolC: 96.44%
O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1
WT: 93.81%, tolC: 91.72%
3
Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1
WT: 37.97%, tolC: 100.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%
4
N#Cc1c(Cl)nc(NN)c(Cl)c1Cl
WT: -0.99%, tolC: 86.71%
N#Cc1nc(Cl)c2sc(=O)sc2c1Cl
WT: 80.76%, tolC: 76.9%
5
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C45CC6CC(CC(C6)C4)C5)CC3)cc21
WT: 9.66%, tolC: 97.46%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c4c(OC)cccc4OC)CC3)cc21
WT: 92.72%, tolC: 91.71%
6
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
CC(=O)Cn1nnc([N+](=O)[O-])n1
WT: 45.0%, tolC: 77.9%
7
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Cn1cnc([N+](=O)[O-])c1S(=O)CC#N
WT: 93.87%, tolC: 90.29%
8
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-]
WT: 100.62%, tolC: 102.1%
9
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)cc4)CC3)nc21
WT: -0.57%, tolC: 80.9%
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=O)OC)cc4)CC3)nc21
WT: 96.96%, tolC: 100.34%
10
CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 33.9%, tolC: 95.53%
CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 52.44%, tolC: 96.71%
11
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCC(OC)CC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 5.08%, tolC: 100.53%
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCOCC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 100.46%, tolC: 100.31%
12
Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1
WT: 27.69%, tolC: 101.73%
O=C(CCl)C(=O)Nc1ccccc1
WT: 95.28%, tolC: 92.56%
13
O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1
WT: 40.13%, tolC: 96.13%
O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl
WT: 98.55%, tolC: 98.37%
14
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=[N+]([O-])c1ccc2no[n+]([O-])c2c1
WT: 96.24%, tolC: 94.26%
15
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-]
WT: 59.06%, tolC: 98.91%
16
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=C(O)/C=C/c1ccc([N+](=O)[O-])o1
WT: 75.57%, tolC: 98.52%
17
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%
18
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-]
WT: 96.69%, tolC: 93.83%
19
Cn1cnc([N+](=O)[O-])c1Oc1ccccc1
WT: 15.81%, tolC: 94.86%
Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C
WT: 53.09%, tolC: 100.9%
20
Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1
WT: 42.19%, tolC: 100.93%
Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1
WT: 53.27%, tolC: 100.78%
21
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.CS(=O)(=O)O
WT: 100.16%, tolC: 100.18%
22
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21
WT: 98.83%, tolC: 98.54%
23
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCn1ccc(=N)cc1
WT: 58.22%, tolC: 90.97%
24
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCCCn1ccc(=N)cc1
WT: 101.08%, tolC: 95.52%

sub_to_evader_transforms['mol_a'] = sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles)

C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1271073621.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_to_evader_transforms['mol_a'] = sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles)

sub_to_evader_transforms['mol_b'] = sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles)

[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1879633430.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_to_evader_transforms['mol_b'] = sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles)

sub_to_evader_transforms.mol_b.isna().any()

False

feat_diff, feat_left, feat_right = master_functions.calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)

H:\My Drive\co_add_jupyter
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches

feat_diff, feat_left, feat_right = calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)

Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches

#drop zeros
feat_diff = feat_diff.loc[:, (feat_diff != 0).any(axis=0)]

feat_diff

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	1 - Alkane group	1,2-Dicarbonyl not in ring	10 - Aldehyde	13 - Ether	15 - Secondary amine group	16 - Tertiary amine	17 - Aromatic amine	18 - Pyridine	19 - CCN	2 - Olefin group	...	Thionyl	Vinyl michael acceptor1	Primary amine, not amide	Primary or secondary amine, not amide.	tertiary aliphatic amine	carboxylic acid	smirks	measurement_A	measurement_B	target
0	0	0	0	0	0	0	0	0	0	-1	...	0	0	-1	-1	0	0	[:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[:1]C(=O)CC(=O...	69.12	44.26	-24.86
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(Br)cc(Cl)c1O>>[:1]c1ccc([N+](=O)[O-...	57.32	-2.09	-59.41
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)cc(I)c1O>>[:1]c1ccc([N+](=O)[O-])o1	63.01	-2.09	-65.1
3	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccc(F)cc1>>[:1]c1ncccc1O	63.01	36.45	-26.56
4	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cc(CC)ccc1O>>[:1]c1ncccc1O	53.18	36.45	-16.73
5	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(Cl)c1Cl>>[:1]c1ncccc1O	72.7	36.45	-36.25
6	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(F)c1>>[:1]c1ncccc1O	55.41	36.45	-18.96
7	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(OC)c1>>[:1]c1ncccc1O	59.53	36.45	-23.08
8	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc2ccccc12>>[:1]c1ncccc1O	64.42	36.45	-27.97
9	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccccc1I>>[:1]c1ncccc1O	62.45	36.45	-26.0
10	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccccc1OC>>[:1]c1ncccc1O	64.28	36.45	-27.83
11	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccc(CC)cc1[:2]>>[:1]c1cccnc1[:2]	53.18	36.45	-16.73
12	0	0	0	0	0	0	0	0	0	0	...	0	0	0	-1	0	0	[:1]c1c(Cl)nc(NN)c([:2])c1[:3]>>[:1]c1nc([...	87.7	-3.86	-91.56
13	0	0	0	0	0	0	0	0	0	-1	...	0	0	-1	-1	0	0	[:1]/C=C(\N)C(Cl)(Cl)Cl>>[:1]CC(=O)C(F)(F)F	69.12	44.26	-24.86
14	-1	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C12CC3CC(CC(C3)C1)C2>>[:1]c1c(OC)cccc1OC	87.8	-1.01	-88.81
15	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Cn1cc([:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn...	86.1	32.9	-53.2
16	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Cn1cc([N+](=O)[O-])c([:2])n1>>[*:1]Cn1nn...	86.1	32.9	-53.2
17	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	0	0	[:1]c1c[n+]([O-])n([:2])n1>>[:1]c1ncn([:2]...	42.16	-3.58	-45.74
18	0	0	0	0	1	0	0	0	0	1	...	0	0	0	1	0	0	[:1]c1c[n+]([O-])n([:2])n1>>[:1]c1nn([:2])...	42.16	1.48	-40.68
19	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]O[:2]>>[:1]OC([:2])=O	81.47	3.38	-78.09
20	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C1CN([:2])CC(C)N1>>[:1]C1CN([:2])CCN1	61.63	44.27	-17.36
21	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C>>[:1][H]	61.63	44.27	-17.36
22	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cn([:2])nc1[N+](=O)[O-]>>[:1]c1nnn([...	86.1	32.9	-53.2
23	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1nn([:2])cc1[N+](=O)[O-]>>[:1]c1nnn([...	86.1	32.9	-53.2
24	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N1CCC(OC)CC1>>[:1]N1CCOCC1	95.45	-0.15	-95.6
25	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C/c1cc(Br)cc([:2])c1O>>[*:2]CC(=O)C(=...	74.04	-2.72	-76.76
26	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCN1CCC(OC)CC1>>[:1]CCN1CCOCC1	95.45	-0.15	-95.6
27	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cc(CC)ccc1[:2]>>[*:1]/C=N\c1ncccc...	53.18	36.45	-16.73
28	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[:1]C([:2])C(...	56.0	-0.18	-56.18
29	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[*:2]CC(NC(=O)C...	56.0	-0.18	-56.18
30	0	0	-1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc2no[n+]([O...	55.67	-1.98	-57.65
31	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cc([N+](=O)[O...	55.67	39.85	-15.82
32	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cnc(N)c([N+](...	55.67	39.85	-15.82
33	1	0	-1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nnn(CC(C)=O)n1	55.67	32.9	-22.77
34	0	0	-1	0	0	0	0	0	0	1	...	0	1	0	0	0	1	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc(/C=C/C(=O...	55.67	22.95	-32.72
35	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nonc1N	55.67	-3.09	-58.76
36	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1ccc2no[n+]([O...	42.16	-1.98	-44.14
37	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1cc([N+](=O)[O...	42.16	39.85	-2.31
38	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1cnc(N)c([N+](...	42.16	39.85	-2.31
39	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1nnn(CC(C)=O)n1	42.16	32.9	-9.26
40	-1	0	0	0	0	0	0	0	0	1	...	0	1	0	0	0	1	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1ccc(/C=C/C(=O...	42.16	22.95	-19.21
41	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1nonc1N	42.16	-3.09	-45.25
42	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cn(CC(C)=O)nc1[:2]>>[:1]c1cnc(N)c([:...	86.1	39.85	-46.25
43	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cn(CC(C)=O)nc1[:2]>>[*:1]c1ncn(CCO)c1[...	86.1	-2.86	-88.96
44	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	0	0	[:1]Oc1ccccc1>>[:1]S(=O)CC#N	79.05	-3.58	-82.63
45	1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Oc1ccccc1>>[:1]Sc1nnnn1C	79.05	47.81	-31.24
46	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C>>[:1][H]	58.74	47.51	-11.23
47	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccc(F)cc1>>[:1]/C=N\c1ncccc1O	63.01	36.45	-26.56
48	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(Cl)c1Cl>>[:1]/C=N\c1ncccc1O	72.7	36.45	-36.25
49	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(F)c1>>[:1]/C=N\c1ncccc1O	55.41	36.45	-18.96
50	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(OC)c1>>[:1]/C=N\c1ncccc1O	59.53	36.45	-23.08
51	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccccc1I>>[:1]/C=N\c1ncccc1O	62.45	36.45	-26.0
52	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccccc1OC>>[:1]/C=N\c1ncccc1O	64.28	36.45	-27.83
53	0	0	0	0	0	0	-1	1	0	0	...	0	0	-1	-1	0	0	[:1]/C=N/c1nonc1N>>[:1]/C=N\c1ncccc1O	70.9	36.45	-34.45
54	1	0	0	0	0	1	0	0	0	0	...	0	0	0	-1	1	0	[:1]N/N=C/c1ccccc1>>[:1]N1CCN(C)CC1	83.39	0.02	-83.37
55	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N/N=C/c1ccccc1>>[:1]N1CCNCC1	83.39	-0.29	-83.68
56	0	0	0	0	1	-1	0	0	0	0	...	0	0	0	1	-1	0	[:1]N1CCN(CCO)CC1>>[:1]N1CCNC(C)C1	70.95	44.27	-26.68
57	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N1CC(C)NC(C)C1>>[:1]N1CCNC(C)C1	61.63	44.27	-17.36
58	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCCCCCC>>[:1]CCCCCCCC	56.99	32.75	-24.24
59	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCCCCCC>>[:1]CCCCCCCCCC	56.99	-5.56	-62.55

60 rows Ã— 119 columns

feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(25)

B7                                        22
18 - Pyridine                             17
NUC                                       16
sp2 hybridized carbon atoms (12)          10
Nitrogen atoms (5)                         9
sp3 hybridized carbon atoms (10)           7
B9                                         7
Nitrogen atoms (2)                         7
N6                                         7
N9                                         7
ACID                                       7
17 - Aromatic amine                        6
sp3 hybridized carbon atoms (5)            5
A33 - phenol                               5
E3 - e.g., carbonates                      5
15 - Secondary amine group                 5
sp2 hybridized carbon atoms (10)           4
Primary amine, not amide                   4
Primary or secondary amine, not amide.     4
Alpha halo carbonyl                        4
9 - Ä�Â¡arbonyl                              3
Ketone                                     3
Imines_(not_ring)                          3
sp3 hybridized carbon atoms (2)            3
Aromatic NO2                               2
dtype: object

feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(25)

4 - Aromatic carbon-alkane                               -3
B8EXC                                                    -3
N4EXC                                                    -3
Positively charged atoms                                 -3
ELEC                                                     -3
Negatively charged atoms                                 -3
13 - Ether                                               -3
Acyclic N-,=N and not N bound to carbonyl or sulfone     -3
25 - Aromatic chloro                                     -4
38 - Aromatic fluoro                                     -4
N oxide                                                  -5
sp2 hybridized carbon atoms (8)                          -5
10 - Aldehyde                                            -6
1 - Alkane group                                         -6
sp2 hybridized carbon atoms (7)                          -6
Aldehyde carbon atoms                                    -6
E1 - alkyl and aryl ketones and aldehydes                -6
Quaternary nitrogen (1)                                  -7
8 - Aromatic carbon-alcohol                             -10
32 - Iodo compounds                                     -11
Aryl iodide                                             -11
Iodine                                                  -11
sp3 hybridized carbon atoms (11)                        -14
sp2 hybridized carbon atoms (11)                        -18
3 - Aromatic carbon                                     -22
dtype: object

feat_diff

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	1 - Alkane group	1,2-Dicarbonyl not in ring	10 - Aldehyde	13 - Ether	15 - Secondary amine group	16 - Tertiary amine	17 - Aromatic amine	18 - Pyridine	19 - CCN	2 - Olefin group	...	Thionyl	Vinyl michael acceptor1	Primary amine, not amide	Primary or secondary amine, not amide.	tertiary aliphatic amine	carboxylic acid	smirks	measurement_A	measurement_B	target
0	0	0	0	0	0	0	0	0	0	-1	...	0	0	-1	-1	0	0	[:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[:1]C(=O)CC(=O...	69.12	44.26	-24.86
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(Br)cc(Cl)c1O>>[:1]c1ccc([N+](=O)[O-...	57.32	-2.09	-59.41
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)cc(I)c1O>>[:1]c1ccc([N+](=O)[O-])o1	63.01	-2.09	-65.1
3	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccc(F)cc1>>[:1]c1ncccc1O	63.01	36.45	-26.56
4	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cc(CC)ccc1O>>[:1]c1ncccc1O	53.18	36.45	-16.73
5	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(Cl)c1Cl>>[:1]c1ncccc1O	72.7	36.45	-36.25
6	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(F)c1>>[:1]c1ncccc1O	55.41	36.45	-18.96
7	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc(OC)c1>>[:1]c1ncccc1O	59.53	36.45	-23.08
8	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1cccc2ccccc12>>[:1]c1ncccc1O	64.42	36.45	-27.97
9	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccccc1I>>[:1]c1ncccc1O	62.45	36.45	-26.0
10	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccccc1OC>>[:1]c1ncccc1O	64.28	36.45	-27.83
11	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]c1ccc(CC)cc1[:2]>>[:1]c1cccnc1[:2]	53.18	36.45	-16.73
12	0	0	0	0	0	0	0	0	0	0	...	0	0	0	-1	0	0	[:1]c1c(Cl)nc(NN)c([:2])c1[:3]>>[:1]c1nc([...	87.7	-3.86	-91.56
13	0	0	0	0	0	0	0	0	0	-1	...	0	0	-1	-1	0	0	[:1]/C=C(\N)C(Cl)(Cl)Cl>>[:1]CC(=O)C(F)(F)F	69.12	44.26	-24.86
14	-1	0	0	1	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C12CC3CC(CC(C3)C1)C2>>[:1]c1c(OC)cccc1OC	87.8	-1.01	-88.81
15	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Cn1cc([:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn...	86.1	32.9	-53.2
16	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Cn1cc([N+](=O)[O-])c([:2])n1>>[*:1]Cn1nn...	86.1	32.9	-53.2
17	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	0	0	[:1]c1c[n+]([O-])n([:2])n1>>[:1]c1ncn([:2]...	42.16	-3.58	-45.74
18	0	0	0	0	1	0	0	0	0	1	...	0	0	0	1	0	0	[:1]c1c[n+]([O-])n([:2])n1>>[:1]c1nn([:2])...	42.16	1.48	-40.68
19	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]O[:2]>>[:1]OC([:2])=O	81.47	3.38	-78.09
20	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C1CN([:2])CC(C)N1>>[:1]C1CN([:2])CCN1	61.63	44.27	-17.36
21	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C>>[:1][H]	61.63	44.27	-17.36
22	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cn([:2])nc1[N+](=O)[O-]>>[:1]c1nnn([...	86.1	32.9	-53.2
23	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1nn([:2])cc1[N+](=O)[O-]>>[:1]c1nnn([...	86.1	32.9	-53.2
24	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N1CCC(OC)CC1>>[:1]N1CCOCC1	95.45	-0.15	-95.6
25	0	1	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C/c1cc(Br)cc([:2])c1O>>[*:2]CC(=O)C(=...	74.04	-2.72	-76.76
26	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCN1CCC(OC)CC1>>[:1]CCN1CCOCC1	95.45	-0.15	-95.6
27	-1	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cc(CC)ccc1[:2]>>[*:1]/C=N\c1ncccc...	53.18	36.45	-16.73
28	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[:1]C([:2])C(...	56.0	-0.18	-56.18
29	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[*:2]CC(NC(=O)C...	56.0	-0.18	-56.18
30	0	0	-1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc2no[n+]([O...	55.67	-1.98	-57.65
31	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cc([N+](=O)[O...	55.67	39.85	-15.82
32	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cnc(N)c([N+](...	55.67	39.85	-15.82
33	1	0	-1	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nnn(CC(C)=O)n1	55.67	32.9	-22.77
34	0	0	-1	0	0	0	0	0	0	1	...	0	1	0	0	0	1	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc(/C=C/C(=O...	55.67	22.95	-32.72
35	0	0	-1	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nonc1N	55.67	-3.09	-58.76
36	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1ccc2no[n+]([O...	42.16	-1.98	-44.14
37	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1cc([N+](=O)[O...	42.16	39.85	-2.31
38	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1cnc(N)c([N+](...	42.16	39.85	-2.31
39	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1nnn(CC(C)=O)n1	42.16	32.9	-9.26
40	-1	0	0	0	0	0	0	0	0	1	...	0	1	0	0	0	1	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1ccc(/C=C/C(=O...	42.16	22.95	-19.21
41	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1c[n+]([O-])n(C)n1>>[:1]c1nonc1N	42.16	-3.09	-45.25
42	-1	0	0	0	0	0	1	0	0	0	...	0	0	1	1	0	0	[:1]c1cn(CC(C)=O)nc1[:2]>>[:1]c1cnc(N)c([:...	86.1	39.85	-46.25
43	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]c1cn(CC(C)=O)nc1[:2]>>[*:1]c1ncn(CCO)c1[...	86.1	-2.86	-88.96
44	0	0	0	0	0	0	0	0	1	0	...	1	0	0	0	0	0	[:1]Oc1ccccc1>>[:1]S(=O)CC#N	79.05	-3.58	-82.63
45	1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]Oc1ccccc1>>[:1]Sc1nnnn1C	79.05	47.81	-31.24
46	-1	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]C>>[:1][H]	58.74	47.51	-11.23
47	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccc(F)cc1>>[:1]/C=N\c1ncccc1O	63.01	36.45	-26.56
48	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(Cl)c1Cl>>[:1]/C=N\c1ncccc1O	72.7	36.45	-36.25
49	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(F)c1>>[:1]/C=N\c1ncccc1O	55.41	36.45	-18.96
50	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1cccc(OC)c1>>[:1]/C=N\c1ncccc1O	59.53	36.45	-23.08
51	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccccc1I>>[:1]/C=N\c1ncccc1O	62.45	36.45	-26.0
52	0	0	0	-1	0	0	0	1	0	0	...	0	0	0	0	0	0	[:1]/C=N\c1ccccc1OC>>[:1]/C=N\c1ncccc1O	64.28	36.45	-27.83
53	0	0	0	0	0	0	-1	1	0	0	...	0	0	-1	-1	0	0	[:1]/C=N/c1nonc1N>>[:1]/C=N\c1ncccc1O	70.9	36.45	-34.45
54	1	0	0	0	0	1	0	0	0	0	...	0	0	0	-1	1	0	[:1]N/N=C/c1ccccc1>>[:1]N1CCN(C)CC1	83.39	0.02	-83.37
55	1	0	0	0	1	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N/N=C/c1ccccc1>>[:1]N1CCNCC1	83.39	-0.29	-83.68
56	0	0	0	0	1	-1	0	0	0	0	...	0	0	0	1	-1	0	[:1]N1CCN(CCO)CC1>>[:1]N1CCNC(C)C1	70.95	44.27	-26.68
57	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]N1CC(C)NC(C)C1>>[:1]N1CCNC(C)C1	61.63	44.27	-17.36
58	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCCCCCC>>[:1]CCCCCCCC	56.99	32.75	-24.24
59	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	[:1]CCCCCCC>>[:1]CCCCCCCCCC	56.99	-5.56	-62.55

60 rows Ã— 119 columns

find correlated feats:

corr_feat = feat_diff.iloc[:,:-4].astype(float)

corr = corr_feat.corr()

feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['3 - Aromatic carbon']<0)].sum().sort_values(ascending=False).head(20)

sp2 hybridized carbon atoms (12)    17
18 - Pyridine                       17
B7                                  17
NUC                                 15
N6                                  15
ACID                                13
A33 - phenol                        13
Nitrogen atoms (5)                   4
sp2 hybridized carbon atoms (10)     4
15 - Secondary amine group           3
sp3 hybridized carbon atoms (10)     3
Enamine                              3
Alpha halo carbonyl                  2
22 - CCl2                            2
5 - Alcohol                          2
Alkyl halide                         2
Nitrogen atoms (1)                   2
sp3 hybridized carbon atoms (5)      2
sp3 hybridized carbon atoms (2)      2
sp3 hybridized carbon atoms (12)     2
dtype: object

sub_to_evader_index_reset = sub_to_evader_transforms.reset_index(drop=True)

feat_diff[feat_diff['Iodine']<0]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	1 - Alkane group	10 - Aldehyde	15 - Secondary amine group	17 - Aromatic amine	18 - Pyridine	2 - Olefin group	...	Vinyl michael acceptor1	Primary amine, not amide	Primary or secondary amine, not amide.	carboxylic acid	smirks	measurement_A	measurement_B	target
2	0	0	0	0	0	0	...	0	0	0	0	[:1]c1cc(I)cc(I)c1O>>[:1]c1ccc([N+](=O)[O-])o1	63.01	-2.09	-65.1
9	0	0	0	0	1	0	...	0	0	0	0	[:1]c1ccccc1I>>[:1]c1ncccc1O	62.45	36.45	-26.0
28	1	0	1	0	0	0	...	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[:1]C([:2])C(...	56.0	-0.18	-56.18
29	1	0	1	0	0	0	...	0	0	0	0	[:1]/N=C\c1cc(I)cc(I)c1[:2]>>[*:2]CC(NC(=O)C...	56.0	-0.18	-56.18
30	0	-1	0	0	0	0	...	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc2no[n+]([O...	55.67	-1.98	-57.65
31	0	-1	0	1	0	0	...	0	1	1	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cc([N+](=O)[O...	55.67	39.85	-15.82
32	0	-1	0	1	0	0	...	0	1	1	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1cnc(N)c([N+](...	55.67	39.85	-15.82
33	1	-1	0	0	0	0	...	0	0	0	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nnn(CC(C)=O)n1	55.67	32.9	-22.77
34	0	-1	0	0	0	1	...	1	0	0	1	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1ccc(/C=C/C(=O...	55.67	22.95	-32.72
35	0	-1	0	1	0	0	...	0	1	1	0	[:1]c1cc(I)c(O)c(C=O)c1>>[:1]c1nonc1N	55.67	-3.09	-58.76
51	0	0	0	0	1	0	...	0	0	0	0	[:1]/C=N\c1ccccc1I>>[:1]/C=N\c1ncccc1O	62.45	36.45	-26.0

11 rows Ã— 119 columns

len(sub_to_evader_index_reset.iloc[feat_diff[feat_diff['Iodine']<0].index].compound_structure_B.unique())

feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)

Negatively charged atoms                                 -3
13 - Ether                                               -3
Acyclic N-,=N and not N bound to carbonyl or sulfone     -3
25 - Aromatic chloro                                     -4
38 - Aromatic fluoro                                     -4
N oxide                                                  -5
sp2 hybridized carbon atoms (8)                          -5
10 - Aldehyde                                            -6
1 - Alkane group                                         -6
sp2 hybridized carbon atoms (7)                          -6
Aldehyde carbon atoms                                    -6
E1 - alkyl and aryl ketones and aldehydes                -6
Quaternary nitrogen (1)                                  -7
8 - Aromatic carbon-alcohol                             -10
32 - Iodo compounds                                     -11
Aryl iodide                                             -11
Iodine                                                  -11
sp3 hybridized carbon atoms (11)                        -14
sp2 hybridized carbon atoms (11)                        -18
3 - Aromatic carbon                                     -22
dtype: object

feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)

B7                                        22
18 - Pyridine                             17
NUC                                       16
sp2 hybridized carbon atoms (12)          10
Nitrogen atoms (5)                         9
sp3 hybridized carbon atoms (10)           7
B9                                         7
Nitrogen atoms (2)                         7
N6                                         7
N9                                         7
ACID                                       7
17 - Aromatic amine                        6
sp3 hybridized carbon atoms (5)            5
A33 - phenol                               5
E3 - e.g., carbonates                      5
15 - Secondary amine group                 5
sp2 hybridized carbon atoms (10)           4
Primary amine, not amide                   4
Primary or secondary amine, not amide.     4
Alpha halo carbonyl                        4
dtype: object

vis

search = feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['B7']>0)].index

len(sub_to_evader_index_reset.iloc[search].compound_structure_B.unique())

feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['E1 - alkyl and aryl ketones and aldehydes']<0)].sum().sort_values(ascending=False).head(20)

Primary or secondary amine, not amide.    3
Primary amine, not amide                  3
B8EXC                                     3
17 - Aromatic amine                       3
B9                                        3
Negatively charged atoms                  3
Positively charged atoms                  3
Nitrogen atoms (2)                        3
Nitrogen atoms (7)                        3
Nitrogen atoms (4)                        2
Nitrogen atoms (5)                        2
B7                                        2
Dye 16 (1)                                2
E3 - e.g., carbonates                     2
N4EXC                                     2
Nitro group                               2
Aromatic NO2                              2
27 - Aromatic nitro                       2
sp2 hybridized carbon atoms (12)          2
Oxygen-nitrogen single bond               2
dtype: object

feat_diff = feat_diff.drop(['N9'], axis=1)

to_drop=['18 - Pyridine', 'N9']

# to_drop=[]

feat_diff = feat_diff.drop(to_drop, axis = 1)

feat_left =  feat_left.drop(to_drop, axis = 1)
feat_right =  feat_right.drop(to_drop, axis = 1)

fr_sig_descriptors_evade = master_functions.find_sig_feats_mk2(feat_left, feat_right, 0.05)

fractions_to_drop=[]

results_evader = master_functions.results_arr(feat_diff, fr_sig_descriptors_evade, feat_right, feat_left, fractions_to_drop )

Found significant fractions:  21
10 - Aldehyde has negative correlation 
percentage_loss 100
15 - Secondary amine group has positive correlation 
0/1/2 loss
[('3 - Aromatic carbon', 'Nitrogen atoms (5)', 'N4EXC'), 'sp2 hybridized carbon atoms (11)', 'Iodine']
[-60.0, -40.0, -40.0]
percentage gain under -100
17 - Aromatic amine has positive correlation 
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'E1 - alkyl and aryl ketones and aldehydes', 'Iodine']
[-57.14, -42.86, -42.86]
percentage gain under -100
25 - Aromatic chloro has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'ACID', 'N4EXC']
[50.0, 25.0, 25.0]
3 - Aromatic carbon has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'NUC', 'N6']
[73.91, 65.22, 65.22]
percentage_loss 100
32 - Iodo compounds has negative correlation 
percentage_loss 100
38 - Aromatic fluoro has negative correlation 
percentage_loss 100
8 - Aromatic carbon-alcohol has negative correlation 
all gain
[('B8EXC', 'Positively charged atoms', 'Negatively charged atoms'), 'Dye 16 (1)', 'Nitrogen atoms (2)']
[50.0, 40.0, 40.0]
percentage_loss 100
Aldehyde carbon atoms has negative correlation 
percentage_loss 100
Alpha halo carbonyl has positive correlation 
1/2/3 loss
['sp2 hybridized carbon atoms (11)', ('32 - Iodo compounds', '3 - Aromatic carbon', 'Nitrogen atoms (5)'), 'Iodine']
[-100.0, -50.0, -50.0]
percentage gain under -100
Aryl iodide has negative correlation 
percentage_loss 100
B7 has positive correlation 
percentage gain under -100
B9 has positive correlation 
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'Aldehyde carbon atoms', 'ACID']
[-50.0, -37.5, -37.5]
percentage gain under -100
E1 - alkyl and aryl ketones and aldehydes has negative correlation 
percentage_loss 100
Iodine has negative correlation 
percentage_loss 100
Nitrogen atoms (5) has positive correlation 
percentage gain under -100
NUC has positive correlation 
second double loss
['3 - Aromatic carbon', ('sp3 hybridized carbon atoms (11)', 'sp2 hybridized carbon atoms (11)'), 'sp2 hybridized carbon atoms (7)']
[-72.73, -40.91, -22.73]
percentage gain under -100
Quaternary nitrogen (1) has negative correlation 
percentage_loss 100
sp2 hybridized carbon atoms (11) has negative correlation 
first_gain
[('B7', 'sp2 hybridized carbon atoms (12)'), 'NUC', 'sp3 hybridized carbon atoms (10)']
[50.0, 36.36, 31.82]
percentage_loss 100
sp2 hybridized carbon atoms (7) has negative correlation 
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'A33 - phenol', 'NUC']
[100.0, 83.33, 83.33]
percentage_loss 100
sp3 hybridized carbon atoms (11) has negative correlation 
percentage_loss 100

results_evader.sort_values(by='dof')

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Main fraction	Correlation	$\overline{\Delta P}$	sem	std	dof	Opposite fraction 1	% of opposite 1	Opposite fraction 2	% of opposite 2	Opposite fraction 3	% of opposite 3
3	25 - Aromatic chloro	Negative	55.87	13.09	26.18	4	(sp2 hybridized carbon atoms (12), B7)	50.00	ACID	25.00	N4EXC	25.00
9	Alpha halo carbonyl	Positive	-40.52	9.04	18.08	4	sp2 hybridized carbon atoms (11)	-100.00	(32 - Iodo compounds, 3 - Aromatic carbon, Nit...	-50.00	Iodine	-50.00
6	38 - Aromatic fluoro	Negative	22.76	2.19	4.39	4	ACID	100.00	(NUC, A33 - phenol, sp2 hybridized carbon atom...	100.00	N6	100.00
1	15 - Secondary amine group	Positive	-52.68	9.50	21.25	5	(3 - Aromatic carbon, Nitrogen atoms (5), N4EXC)	-60.00	sp2 hybridized carbon atoms (11)	-40.00	Iodine	-40.00
19	sp2 hybridized carbon atoms (7)	Negative	24.81	2.85	6.98	6	(sp2 hybridized carbon atoms (12), B7)	100.00	A33 - phenol	83.33	NUC	83.33
0	10 - Aldehyde	Negative	33.92	8.09	19.80	6	Primary or secondary amine, not amide.	50.00	(Primary amine, not amide, B8EXC, 17 - Aromati...	50.00	B9	50.00
13	E1 - alkyl and aryl ketones and aldehydes	Negative	33.92	8.09	19.80	6	Primary or secondary amine, not amide.	50.00	(Primary amine, not amide, B8EXC, 17 - Aromati...	50.00	B9	50.00
8	Aldehyde carbon atoms	Negative	33.92	8.09	19.80	6	Primary or secondary amine, not amide.	50.00	(Primary amine, not amide, B8EXC, 17 - Aromati...	50.00	B9	50.00
2	17 - Aromatic amine	Positive	-26.65	8.70	23.02	7	(1 - Alkane group, ELEC, sp3 hybridized carbon...	-57.14	E1 - alkyl and aryl ketones and aldehydes	-42.86	Iodine	-42.86
17	Quaternary nitrogen (1)	Negative	24.03	7.73	20.46	7	NUC	42.86	(Primary or secondary amine, not amide., Prima...	42.86	17 - Aromatic amine	42.86
12	B9	Positive	-28.40	7.74	21.89	8	(1 - Alkane group, ELEC, sp3 hybridized carbon...	-50.00	Aldehyde carbon atoms	-37.50	ACID	-37.50
7	8 - Aromatic carbon-alcohol	Negative	42.15	7.48	23.65	10	(B8EXC, Positively charged atoms, Negatively c...	50.00	Dye 16 (1)	40.00	Nitrogen atoms (2)	40.00
5	32 - Iodo compounds	Negative	39.36	5.82	19.30	11	Nitrogen atoms (2)	45.45	(B8EXC, B7, Negatively charged atoms)	36.36	Positively charged atoms	36.36
10	Aryl iodide	Negative	39.36	5.82	19.30	11	Nitrogen atoms (2)	45.45	(B8EXC, B7, Negatively charged atoms)	36.36	Positively charged atoms	36.36
14	Iodine	Negative	39.36	5.82	19.30	11	Nitrogen atoms (2)	45.45	(B8EXC, B7, Negatively charged atoms)	36.36	Positively charged atoms	36.36
15	Nitrogen atoms (5)	Positive	-21.62	3.16	11.81	14	sp2 hybridized carbon atoms (11)	-78.57	3 - Aromatic carbon	-64.29	sp3 hybridized carbon atoms (11)	-50.00
20	sp3 hybridized carbon atoms (11)	Negative	33.56	6.73	29.34	19	B7	52.63	NUC	47.37	Nitrogen atoms (5)	36.84
16	NUC	Positive	-34.87	4.61	21.64	22	3 - Aromatic carbon	-72.73	(sp3 hybridized carbon atoms (11), sp2 hybridi...	-40.91	sp2 hybridized carbon atoms (7)	-22.73
11	B7	Positive	-23.29	2.20	10.34	22	3 - Aromatic carbon	-77.27	sp2 hybridized carbon atoms (11)	-50.00	sp3 hybridized carbon atoms (11)	-45.45
18	sp2 hybridized carbon atoms (11)	Negative	38.77	5.16	24.20	22	(B7, sp2 hybridized carbon atoms (12))	50.00	NUC	36.36	sp3 hybridized carbon atoms (10)	31.82
4	3 - Aromatic carbon	Negative	37.77	4.81	23.06	23	(sp2 hybridized carbon atoms (12), B7)	73.91	NUC	65.22	N6	65.22

master_functions.plot_feats(results_evader)

find examples visually

feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['Quaternary nitrogen (1)']<0)].sum().sort_values(ascending=False).head(20)

NUC                                                             3
Primary or secondary amine, not amide.                          3
Primary amine, not amide                                        3
B9                                                              3
17 - Aromatic amine                                             3
Nitrogen atoms (2)                                              3
Nitro group                                                     2
Nitrogen atoms (4)                                              2
Dye 16 (1)                                                      2
Nitrogen atoms (5)                                              2
27 - Aromatic nitro                                             2
sp2 hybridized carbon atoms (11)                                2
E3 - e.g., carbonates                                           2
B7                                                              2
sp3 hybridized carbon atoms (10)                                2
N4EXC                                                           2
Oxygen-nitrogen single bond                                     2
Aromatic NO2                                                    2
sp3 hybridized carbon atoms (5)                                 2
Alpha beta-unsaturated ketones; center of Michael reactivity    1
dtype: object

# get example of positive transforms

# substrates
to_fg = '17 - Aromatic amine'

from_fg = 'Quaternary nitrogen (1)'

dex = feat_diff[(feat_diff[to_fg]>0)&(feat_diff[from_fg]<0)] # multiple examples of said transformation with different smirks

print(len(dex))

print('number of unique smirks:', len(dex.smirks.unique()) )

# grab those smirks and produce examples


low=4 # take first smirk

display_arr = []
for i in range(len(dex)):
    display_lhs_sub = sub_to_evader_transforms[sub_to_evader_transforms['smirks']==dex.smirks.iloc[i]].LHS.iloc[0]
    display_rhs_sub = sub_to_evader_transforms[sub_to_evader_transforms['smirks']==dex.smirks.iloc[i]].RHS.iloc[0]
    display_arr.append(Chem.MolFromSmiles(display_lhs_sub))
    display_arr.append(Chem.MolFromSmiles(display_rhs_sub))
# Chem.Draw.MolsToGridImage([Chem.MolFromSmiles(display_lhs_sub),Chem.MolFromSmiles(display_rhs_sub)], molsPerRow=2, subImgSize=(400,400), useSVG=True)
leg=[str(x) for x in range(len(dex))]
Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)

3
number of unique smirks: 3

# choose the compounds we're interested in:

a=4

# smiles:
    
comp_a = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_A.values[0]

comp_b = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_B.values[0]

# wt and efflux pre
pre =  e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
# wt and efflux post
post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_b][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]


print(comp_a)

print('WT: {}%, tolC: {}%'.format(pre[0], pre[1]))

print(comp_b)

print('WT: {}%, tolC: {}%'.format(post[0], post[1]))

Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%

# get example of negative transforms

# Filter9_metal	Negative	47.02	6.41	21.27	11	Nitrogen atoms (2)	

# substrates
to_fg = 'B7'

from_fg = 'Iodine'

dex = feat_diff[(feat_diff[to_fg]>0)&(feat_diff[from_fg]<0)] # multiple examples of said transformation with different smirks

print(len(dex))

print('number of unique smirks:', len(dex.smirks.unique()) )

# grab those smirks and produce examples


low=4 # take first smirk

display_arr = []
for i in range(len(dex)):
    display_lhs_sub = sub_to_evader_transforms[sub_to_evader_transforms['smirks']==dex.smirks.iloc[i]].LHS.iloc[0]
    display_rhs_sub = sub_to_evader_transforms[sub_to_evader_transforms['smirks']==dex.smirks.iloc[i]].RHS.iloc[0]
    display_arr.append(Chem.MolFromSmiles(display_lhs_sub))
    display_arr.append(Chem.MolFromSmiles(display_rhs_sub))
# Chem.Draw.MolsToGridImage([Chem.MolFromSmiles(display_lhs_sub),Chem.MolFromSmiles(display_rhs_sub)], molsPerRow=2, subImgSize=(400,400), useSVG=True)
Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)

4
number of unique smirks: 4

dex

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	13 - Ether	17 - Aromatic amine	18 - Pyridine	...	Primary amine, not amide	Primary or secondary amine, not amide.	smirks	measurement_A	measurement_B	target
3	0	0	1	...	0	0	[:1]c1ccc(F)cc1>>[:1]c1ncccc1O	63.01	36.45	-26.56
5	0	0	1	...	0	0	[:1]c1cccc(Cl)c1Cl>>[:1]c1ncccc1O	72.7	36.45	-36.25
6	0	0	1	...	0	0	[:1]c1cccc(F)c1>>[:1]c1ncccc1O	55.41	36.45	-18.96
7	-1	0	1	...	0	0	[:1]c1cccc(OC)c1>>[:1]c1ncccc1O	59.53	36.45	-23.08
8	0	0	1	...	0	0	[:1]c1cccc2ccccc12>>[:1]c1ncccc1O	64.42	36.45	-27.97
9	0	0	1	...	0	0	[:1]c1ccccc1I>>[:1]c1ncccc1O	62.45	36.45	-26.0
10	-1	0	1	...	0	0	[:1]c1ccccc1OC>>[:1]c1ncccc1O	64.28	36.45	-27.83
47	0	0	1	...	0	0	[:1]/C=N\c1ccc(F)cc1>>[:1]/C=N\c1ncccc1O	63.01	36.45	-26.56
48	0	0	1	...	0	0	[:1]/C=N\c1cccc(Cl)c1Cl>>[:1]/C=N\c1ncccc1O	72.7	36.45	-36.25
49	0	0	1	...	0	0	[:1]/C=N\c1cccc(F)c1>>[:1]/C=N\c1ncccc1O	55.41	36.45	-18.96
50	-1	0	1	...	0	0	[:1]/C=N\c1cccc(OC)c1>>[:1]/C=N\c1ncccc1O	59.53	36.45	-23.08
51	0	0	1	...	0	0	[:1]/C=N\c1ccccc1I>>[:1]/C=N\c1ncccc1O	62.45	36.45	-26.0
52	-1	0	1	...	0	0	[:1]/C=N\c1ccccc1OC>>[:1]/C=N\c1ncccc1O	64.28	36.45	-27.83
53	0	-1	1	...	-1	-1	[:1]/C=N/c1nonc1N>>[:1]/C=N\c1ncccc1O	70.9	36.45	-34.45

14 rows Ã— 119 columns

# choose the compounds we're interested in:

a=18

# smiles:
    
comp_a = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_A.values[0]

comp_b = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == dex.iloc[int(a/2)].smirks].compound_structure_B.values[0]

# wt and efflux pre
pre =  e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
# wt and efflux post
post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_b][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]


print(comp_a)

print('WT: {}%, tolC: {}%'.format(pre[0], pre[1]))

print(comp_b)

print('WT: {}%, tolC: {}%'.format(post[0], post[1]))

CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1
WT: 38.8%, tolC: 91.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%

physcichemical of sub to evade transforms:

sub_and_evade_logd['Class'] = sub_and_evade_om_corrected['Class']

sub_and_evade_logd.columns

Index(['Index', 'SMILES', 'logS', 'logS @ pH7.4', 'logD', '2C9 pKi', 'logP',
       'MW', 'HBD', 'HBA', 'TPSA', 'Flexibility', 'Rotatable Bonds', 'mol',
       'Class'],
      dtype='object')

feat='Rotatable Bonds'
sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Substrate'][feat].mean(), sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Evader'][feat].mean()

(5.730560578661844, 4.859459459459459)

sub_and_evade_logd

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	Index	SMILES	logS	logS @ pH7.4	logD	2C9 pKi	logP	MW	HBD	HBA	TPSA	Flexibility	Rotatable Bonds	mol	Class
0	0	OB1OCc2ccccc21	5.188	2.2370	0.07439	4.217	0.07439	133.9	1	2	29.46	0.00000	0	<rdkit.Chem.rdchem.Mol object at 0x000002CDAA1...	Efflux Evader
1	1	BrC(/C=N/Nc1nc(N2CCOCC2)nc(N2CCOCC2)n1)=C/c1cc...	2.053	0.4994	2.27200	5.529	2.78000	474.4	1	9	88.00	0.18180	6	<rdkit.Chem.rdchem.Mol object at 0x000002CDAA1...	Efflux Evader
2	2	Clc1ccc(C(=C2CN3CCC2CC3)c2ccc(Cl)s2)s1	1.303	0.8745	3.51100	5.096	4.87400	356.3	0	1	3.24	0.08333	2	<rdkit.Chem.rdchem.Mol object at 0x000002CDAA1...	Efflux Evader
3	3	O=C(/C=C(\O)c1ccc(Br)cc1)C(F)(F)F	2.361	2.2380	1.63100	4.581	3.76600	295.1	1	2	37.30	0.18750	3	<rdkit.Chem.rdchem.Mol object at 0x000002CDAA1...	Efflux Evader
4	4	O=C(CCl)C(=O)Nc1ccccc1	4.326	2.9250	1.00300	3.932	1.00300	197.6	1	3	46.17	0.30770	4	<rdkit.Chem.rdchem.Mol object at 0x000002CDAA1...	Efflux Evader
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
735	735	c1ccc2c(c1)ccc1c2nc2ccccn21	1.606	1.6420	4.15400	4.902	4.15400	218.3	0	2	17.30	0.00000	0	<rdkit.Chem.rdchem.Mol object at 0x000002CDE31...	Efflux Substrate
736	736	O=C(CSc1ccc2ccccc2n1)N/N=C/c1ccc(O)cc1O	1.119	2.5010	2.21900	4.954	2.21900	353.4	3	6	94.81	0.22220	6	<rdkit.Chem.rdchem.Mol object at 0x000002CDE31...	Efflux Substrate
737	737	Cc1c2ccncc2c(C)c2c1[nH]c1ccccc12	1.294	0.9868	4.80000	5.346	4.80000	246.3	1	2	28.68	0.00000	0	<rdkit.Chem.rdchem.Mol object at 0x000002CDE31...	Efflux Substrate
738	738	Cc1cc(C)c(CSc2nnc(C)s2)c(C)c1	1.607	2.4660	3.86300	4.569	3.86300	264.4	0	2	25.78	0.16670	3	<rdkit.Chem.rdchem.Mol object at 0x000002CDE31...	Efflux Substrate
739	739	COc1cc([C@@H]2c3cc4c(cc3[C@@H](OC3OC5CO[C@@H](...	1.052	2.1080	1.28600	5.984	1.28600	656.7	3	13	160.80	0.11320	6	<rdkit.Chem.rdchem.Mol object at 0x000002CDE31...	Efflux Substrate

738 rows Ã— 15 columns

sub_and_evade_logd = pd.read_csv('data_curated/sub_and_evade_PE.csv')

sub_and_evade_logd['mol'] = sub_and_evade_logd['SMILES'].apply(Chem.MolFromSmiles)

[09:35:17] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:35:17] Explicit valence for atom # 17 N, 5, is greater than permitted

sub_and_evade_logd = sub_and_evade_logd.dropna(subset='mol')

sub_and_evade_logd['SMILES'] = sub_and_evade_logd['SMILES'].apply(Chem.CanonSmiles)

a_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_A')
b_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_B')

a_features= a_features.iloc[:,:-87]
b_features= b_features.iloc[:,:-87]


# sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']

Computing features: 


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 60/60 [00:00<00:00, 133.08it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 60/60 [00:00<00:00, 1890.13it/s]


Computing features: 


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 60/60 [00:00<00:00, 139.04it/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 60/60 [00:00<00:00, 1890.13it/s]

def get_change(current, previous):
    if current == previous:
        return 0
    try:
        return (abs(current - previous) / previous) * 100.0
    except ZeroDivisionError:
        return float('inf')

rets=[]
for column in a_features.columns:
    rets.append(get_change(b_features[column].mean(), a_features[column].mean()))

pd.DataFrame(rets, index=a_features.columns).sort_values(by=0).head(20)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	0
VSA_EState5	-163.547238
MinEStateIndex	-45.150857
HallKierAlpha	-14.961832
MinPartialCharge	-12.158647
NumRadicalElectrons	0.000000
EState_VSA11	0.000000
SlogP_VSA9	0.000000
SMR_VSA8	0.000000
VSA_EState8	0.066408
SlogP_VSA1	0.426011
MaxEStateIndex	1.157330
MaxAbsEStateIndex	1.157330
FpDensityMorgan1	1.441856
NumValenceElectrons	1.888042
Chi1	1.999946
Chi0	2.082437
HeavyAtomCount	2.088773
BertzCT	2.155824
FpDensityMorgan2	2.326498
EState_VSA8	2.640954

pd.DataFrame(rets, index=a_features.columns).sort_values(by=0).tail(30)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	0
PEOE_VSA7	20.914404
EState_VSA4	23.165149
SlogP_VSA2	24.599014
NumHDonors	24.615385
NumAliphaticHeterocycles	25.000000
NumSaturatedHeterocycles	25.000000
VSA_EState6	25.295822
NHOHCount	27.142857
MolLogP	29.078748
SlogP_VSA8	29.091388
SlogP_VSA12	30.354076
EState_VSA6	30.865566
SMR_VSA6	36.032338
VSA_EState10	38.687729
SMR_VSA4	38.929079
EState_VSA7	39.171792
NumAromaticCarbocycles	42.028986
PEOE_VSA6	42.891886
EState_VSA5	46.730088
EState_VSA1	55.387805
Ipc	58.987509
PEOE_VSA13	59.265545
SMR_VSA3	60.530420
SMR_VSA2	65.444545
NumAromaticHeterocycles	74.285714
SlogP_VSA7	77.553925
NumAliphaticCarbocycles	100.000000
NumSaturatedCarbocycles	100.000000
PEOE_VSA3	159.404918
PEOE_VSA11	171.605736

a_features.MolLogP.mean()

2.7694200000000015

b_features.MolLogP.mean()

1.9641073333333339

feat='MolWt'
a_features[feat].mean(), b_features[feat].mean()

feat='MolWt'
a_features[feat].mean(), b_features[feat].mean()

feat='MolWt'
a_features[feat].mean(), b_features[feat].mean()

(365.2758000000001, 330.7681666666667)

feat='TPSA'
a_features[feat].mean(), b_features[feat].mean()

(73.45366666666665, 88.39083333333335)

feat='NumRotatableBonds'
a_features[feat].mean(), b_features[feat].mean()

(3.316666666666667, 3.4166666666666665)

feat='NumHAcceptors'
a_features[feat].mean(), b_features[feat].mean()

(4.383333333333334, 5.283333333333333)

feat='NumHDonors'
a_features[feat].mean(), b_features[feat].mean()

(1.0833333333333333, 1.35)

sns.histplot(a_features.MolLogP, color='r')
sns.histplot(b_features.MolLogP, color='b')

<Axes: xlabel='MolLogP', ylabel='Count'>

cluster 8

cluster_8 = pd.read_csv('data_curated/cluster_8.csv')

cluster_8

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	SMILES	INHIB_AVE_wild	INHIB_AVE_efflux	Mol	fps	abs_diff	sub_class	wild_stds	tolc_stds	wild_class	tolc_class	Class	mol
0	O=C(NC(=S)N1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C...	90.32	88.08	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-2.24	decrease	8.862059	4.772322	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
1	CCn1cc(C(=O)O)c(=O)c2cc([N+](=O)[O-])ccc21	92.33	83.35	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-8.98	decrease	9.068579	4.495245	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
2	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c...	92.72	91.71	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-1.01	decrease	9.108650	4.984962	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
3	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)Nc4ccc(...	94.83	93.26	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-1.57	decrease	9.325446	5.075759	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
4	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc...	59.56	88.04	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	28.48	increase	5.701576	4.769979	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
5	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=...	96.96	100.34	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	3.38	increase	9.544296	5.490497	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
6	CCOC(=O)c1cn(CC)c2cc(N3CCN(C)CC3)c(F)cc2c1=O	94.15	89.71	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-4.44	decrease	9.255578	4.867805	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
7	Cc1c(NC(=O)c2cn3c4c(c(N5CCN(C)CC5)c(F)cc4c2=O)...	97.04	94.43	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-2.61	decrease	9.552515	5.144296	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
8	CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23	99.54	98.79	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-0.75	decrease	9.809382	5.399700	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
9	CCN1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC1	101.15	101.88	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	0.73	increase	9.974803	5.580708	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
10	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C...	100.16	100.18	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	0.02	increase	9.873084	5.481124	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
11	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21	98.83	98.54	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-0.29	decrease	9.736432	5.385055	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
12	CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21	100.81	101.30	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	0.49	increase	9.939870	5.546732	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
13	CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)Nc4ccc...	74.97	93.00	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	18.03	increase	7.284900	5.060529	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
14	CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn4c3c2SCC4)CC1.Cl	101.07	101.69	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	0.62	increase	9.966584	5.569578	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x00000271FF8...
15	COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC...	99.27	98.34	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-0.93	decrease	9.781640	5.373339	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
16	COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(...	99.99	99.89	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-0.10	decrease	9.855617	5.464136	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
17	C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...	99.45	98.37	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-1.08	decrease	9.800134	5.375097	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
18	C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...	100.58	100.90	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	0.32	increase	9.916238	5.523301	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
19	C[C@H]1COc2c(C3(N)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23	98.12	97.94	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-0.18	decrease	9.663482	5.349908	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
20	Cl.O=C(Nc1ccc(-c2n[nH]c(=S)o2)cc1)c1cn(C2CC2)c...	90.63	81.87	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-8.76	decrease	8.893910	4.408548	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
21	COc1c(N2CC3CCCNC3C2)c(F)cc2c(=O)c(C(=O)Nc3ccc(...	85.55	90.95	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	5.40	increase	8.371958	4.940443	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
22	Cc1ccc(S(=O)(=O)O)cc1.NC1CCN(c2nc3c(cc2F)c(=O)...	94.82	90.03	<rdkit.Chem.rdchem.Mol object at 0x00000271FF9...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	-4.79	decrease	9.324418	4.886550	active	active	Efflux Evader	<rdkit.Chem.rdchem.Mol object at 0x000002726D8...
23	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C...	9.66	97.46	<rdkit.Chem.rdchem.Mol object at 0x00000272495...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	87.80	increase	0.574526	5.321790	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
24	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c...	2.34	93.17	<rdkit.Chem.rdchem.Mol object at 0x00000272283...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	90.83	increase	-0.177578	5.070487	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
25	CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21	-2.02	81.37	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	83.39	increase	-0.625553	4.379259	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
26	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc...	-3.27	97.79	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	101.06	increase	-0.753986	5.341121	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
27	CCOc1cccc(C(=O)NC(=S)N2CCN(c3ncc4c(=O)c(C(=O)O...	-5.55	88.93	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	94.48	increase	-0.988248	4.822114	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
28	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc...	6.81	97.95	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	91.14	increase	0.281699	5.350493	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
29	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)...	-0.57	80.90	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	81.47	increase	-0.476571	4.351727	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
30	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C...	1.49	103.44	<rdkit.Chem.rdchem.Mol object at 0x00000271FF7...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	101.95	increase	-0.264913	5.672090	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...
31	CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C...	0.73	97.25	<rdkit.Chem.rdchem.Mol object at 0x00000271B69...	<rdkit.DataStructs.cDataStructs.ExplicitBitVec...	96.52	increase	-0.343000	5.309488	inactive	active	Efflux Substrate	<rdkit.Chem.rdchem.Mol object at 0x000002724F6...

small_set = main_transforms[main_transforms['compound_structure_B'].isin(cluster_8.SMILES)]

small_set_diff, small_set_left, small_set_right = calculate_fractions_mk7_new_smarts(small_set)

Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches

small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)

Dinitrobenzene_3                                         0
Dipeptide                                                0
Disulfide                                                0
Disulfides                                               0
Disulphide                                               0
Dithiocarbamate                                          0
Dithiole-2-thione                                        0
Dithiole-3-thione                                        0
Dithiomethylene_acetal                                   0
Dye 1 (1)                                                0
Dye 11                                                   0
Dye 16 (1)                                               0
E3 - e.g., carbonates                                    0
Nitrogen atoms (2)                                      -1
Adamantyl                                               -1
Primary or secondary amine, not amide.                  -1
Acyclic N-,=N and not N bound to carbonyl or sulfone    -2
N5EXC                                                   -2
N4EXC                                                   -2
Oxygen-nitrogen single bond                             -2
dtype: object

small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)

B9                                 2
N9                                 2
sp2 hybridized carbon atoms (4)    2
phenylpiperazine                   2
sp3 hybridized carbon atoms (2)    2
16 - Tertiary amine                2
NUC                                2
Nitrogen atoms (4)                 2
Sulphates                          1
B2 - secondary amine               1
S/PO3 groups                       1
5 - Alcohol                        1
41 - Acrylate                      1
B3 - tertiary amine                1
sp3 hybridized carbon atoms (9)    1
N2 - secondary amines              1
Ester                              1
sp2 hybridized carbon atoms (8)    1
ELEC                               1
Nitrogen atoms (1)                 1
dtype: object

Tripple Transforms

evader_transforms = evader_transforms.drop(columns=['idsmiles_A', 'idsmiles_B', 'measurement_A', 'measurement_B', 'measurement_delta'])

substrate_transforms = substrate_transforms.drop(columns=['idsmiles_A', 'idsmiles_B', 'measurement_A', 'measurement_B', 'measurement_delta'])

comp_a_lhs_overlap = evader_transforms.merge(substrate_transforms, on=['compound_structure_A', 'LHS', 'common_core'], suffixes=['_evader','_substrate'])

len(comp_a_lhs_overlap)

len(comp_a_lhs_overlap.compound_structure_A.unique())

len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())

len(comp_a_lhs_overlap.compound_structure_B_evader.unique())

comp_a_lhs_overlap

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B_evader	smirks_evader	common_core	LHS	RHS_evader	compound_structure_B_substrate	smirks_substrate	RHS_substrate
0	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1	[:1]c1c(C)cccc1O>>[:1]c1ccc(F)cc1	[*:1]c1ccc(F)cc1
1	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1	[:1]c1c(C)cccc1O>>[:1]c1cc(CC)ccc1O	[*:1]c1cc(CC)ccc1O
2	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl	[:1]c1c(C)cccc1O>>[:1]c1cccc(Cl)c1Cl	[*:1]c1cccc(Cl)c1Cl
3	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1	[:1]c1c(C)cccc1O>>[:1]c1cccc(F)c1	[*:1]c1cccc(F)c1
4	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1	[:1]c1c(C)cccc1O>>[:1]c1cccc(OC)c1	[*:1]c1cccc(OC)c1
...	...	...	...	...	...	...	...	...	...
120	CCCn1ccc(=N)cc1.I	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCC>>[:1]CCCCCCC	[*:1]CCCCCCC
121	CCCCn1ccc(=N)cc1.I	Br.CCCCCCCCn1ccc(=N)cc1	[:1]CCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCC	[*:1]CCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
122	CCCCn1ccc(=N)cc1.I	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
123	Br.CCCCCCn1ccc(=N)cc1	Br.CCCCCCCCn1ccc(=N)cc1	[:1]CCCCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCCCC	[*:1]CCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
124	Br.CCCCCCn1ccc(=N)cc1	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCCCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCCCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]CCCCCCC

125 rows Ã— 9 columns

comp_a_lhs_overlap[comp_a_lhs_overlap.compound_structure_B_evader.isin(comp_a_lhs_overlap.compound_structure_B_evader.unique())]

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B_evader	smirks_evader	common_core	LHS	RHS_evader	compound_structure_B_substrate	smirks_substrate	RHS_substrate
0	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1	[:1]c1c(C)cccc1O>>[:1]c1ccc(F)cc1	[*:1]c1ccc(F)cc1
1	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1	[:1]c1c(C)cccc1O>>[:1]c1cc(CC)ccc1O	[*:1]c1cc(CC)ccc1O
2	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl	[:1]c1c(C)cccc1O>>[:1]c1cccc(Cl)c1Cl	[*:1]c1cccc(Cl)c1Cl
3	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1	[:1]c1c(C)cccc1O>>[:1]c1cccc(F)c1	[*:1]c1cccc(F)c1
4	Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O	Oc1cccnc1/N=C/c1cc(I)cc(I)c1O	[:1]c1c(C)cccc1O>>[:1]c1ncccc1O	[*:1]/N=C/c1cc(I)cc(I)c1O	[*:1]c1c(C)cccc1O	[*:1]c1ncccc1O	COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1	[:1]c1c(C)cccc1O>>[:1]c1cccc(OC)c1	[*:1]c1cccc(OC)c1
...	...	...	...	...	...	...	...	...	...
120	CCCn1ccc(=N)cc1.I	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCC>>[:1]CCCCCCC	[*:1]CCCCCCC
121	CCCCn1ccc(=N)cc1.I	Br.CCCCCCCCn1ccc(=N)cc1	[:1]CCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCC	[*:1]CCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
122	CCCCn1ccc(=N)cc1.I	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
123	Br.CCCCCCn1ccc(=N)cc1	Br.CCCCCCCCn1ccc(=N)cc1	[:1]CCCCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCCCC	[*:1]CCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]CCCCCCC
124	Br.CCCCCCn1ccc(=N)cc1	Br.CCCCCCCCCCn1ccc(=N)cc1	[:1]CCCCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCCCC	[*:1]CCCCCCCCCC	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]CCCCCCC

125 rows Ã— 9 columns

mols=[]

labels=[]


for i in range(len(comp_a_lhs_overlap)):

    # compound_A
    core = Chem.MolFromSmiles(comp_a_lhs_overlap.common_core.iloc[i])
    # LHS
    lhs = Chem.MolFromSmiles(comp_a_lhs_overlap.LHS.iloc[i])
    # compound_B_evader
    RHS_evader = Chem.MolFromSmiles(comp_a_lhs_overlap.RHS_evader.iloc[i])
    # compound_B_substrate
    RHS_substrate = Chem.MolFromSmiles(comp_a_lhs_overlap.RHS_substrate.iloc[i])

#     mols=[core, lhs , RHS_substrate, RHS_evader]

    # labels
    inactive_label = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a_lhs_overlap.compound_structure_A.iloc[i]][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values

    evader_label = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a_lhs_overlap.compound_structure_B_evader.iloc[i]][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values

    substrate_label = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a_lhs_overlap.compound_structure_B_substrate.iloc[i]][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values

    lab = ['Common core no_{}'.format(i), 'Inactive\n WT: {:.1f}%; tolC: {:.1f}%'.format(inactive_label[0][0], inactive_label[0][1]), 'Substrate\n WT: {:.1f}%; tolC: {:.1f}%'.format(substrate_label[0][0], substrate_label[0][1]), 'Evader\n WT: {:.1f}%; tolC: {:.1f}%'.format(evader_label[0][0], evader_label[0][1]),]

# img = Chem.Draw.MolsToGridImage(mols, molsPerRow=4, subImgSize=(250,250), legends=lab, useSVG=True)
    mols.append(core)
    mols.append(lhs)
    mols.append(RHS_substrate)
    mols.append(RHS_evader)
    
    labels.append(lab[0])
    labels.append(lab[1])
    labels.append(lab[2])
    labels.append(lab[3])


img = Chem.Draw.MolsToGridImage(mols, molsPerRow=4, subImgSize=(250,250), legends=labels, useSVG=False, maxMols= 600, returnPNG=False)


# with open('master_transform_2' + '.svg', 'w') as f:
#     f.write(img.data)

[10:54:46] WARNING: not removing hydrogen atom with dummy atom neighbors

img

substrate_transforms

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B	smirks	common_core	LHS	RHS
2258	C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc...	C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc...	[:1]c1ccc(Br)cc1>>[:1]c1ccccc1	[*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1	[*:1]c1ccc(Br)cc1	[*:1]c1ccccc1
2259	C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc...	C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc...	[:1]c1ccc(F)cc1>>[:1]c1ccccc1	[*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1	[*:1]c1ccc(F)cc1	[*:1]c1ccccc1
3224	N#C/C(=C\c1c(F)cccc1Cl)c1nc2ccccc2[nH]1	Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1	[:1]c1c(F)cccc1Cl>>[:1]c1ccccc1[N+](=O)[O-]	[*:1]/C=C(\C#N)c1nc2ccccc2[nH]1	[*:1]c1c(F)cccc1Cl	[*:1]c1ccccc1[N+](=O)[O-]
3245	N#C/C(=C\c1cc(Br)c(O)c(Br)c1O)c1nc2ccccc2[nH]1	Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1	[:1]c1cc(Br)c(O)c(Br)c1O>>[:1]c1ccccc1[N+](=...	[*:1]/C=C(\C#N)c1nc2ccccc2[nH]1	[*:1]c1cc(Br)c(O)c(Br)c1O	[*:1]c1ccccc1[N+](=O)[O-]
3265	COc1c(Cl)cc(Cl)cc1/C=C(\C#N)c1nc2ccccc2[nH]1	Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1	[:1]c1cc(Cl)cc(Cl)c1OC>>[:1]c1ccccc1[N+](=O)...	[*:1]/C=C(\C#N)c1nc2ccccc2[nH]1	[*:1]c1cc(Cl)cc(Cl)c1OC	[*:1]c1ccccc1[N+](=O)[O-]
...	...	...	...	...	...	...
1404497	CCOC(=O)Cn1ccc(=N)cc1.Cl	CCCCCCCn1ccc(=N)cc1.I	[:1]CC(=O)OCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CC(=O)OCC	[*:1]CCCCCCC
1404504	Br.CCn1ccc(=N)cc1	CCCCCCCn1ccc(=N)cc1.I	[:1]CC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CC	[*:1]CCCCCCC
1404510	CCCn1ccc(=N)cc1.I	CCCCCCCn1ccc(=N)cc1.I	[:1]CCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCC	[*:1]CCCCCCC
1404515	CCCCn1ccc(=N)cc1.I	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCC	[*:1]CCCCCCC
1404519	Br.CCCCCCn1ccc(=N)cc1	CCCCCCCn1ccc(=N)cc1.I	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	[*:1]CCCCCC	[*:1]CCCCCCC

4900 rows Ã— 6 columns

comp_a_lhs_overlap = evader_transforms.merge(substrate_transforms, on=['compound_structure_A', 'LHS', 'common_core'], suffixes=['_evader','_substrate'])

comp_a_lhs_overlap = substrate_transforms.merge(evader_transforms, on=['compound_structure_A'], suffixes=['_substrate', '_evader'])

len(comp_a_lhs_overlap.compound_structure_A.unique())

len(comp_a_lhs_overlap.compound_structure_B_evader.unique())

len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())

substarte_to_evader_feats.iloc[:,:-4].sum().sort_values(ascending=False).head(50)

B7                                                      135
18 - Pyridine                                           135
sp2 hybridized carbon atoms (12)                        120
N5EXC                                                    59
sp3 hybridized carbon atoms (10)                         56
Alpha halo carbonyl                                      47
sp3 hybridized carbon atoms (7)                          46
Alkyl halide                                             46
15 - Secondary amine group                               41
5 - Alcohol                                              40
22 - CCl2                                                39
Enamine                                                  39
sp3 hybridized carbon atoms (12)                         37
4 - Aromatic carbon-alkane                               35
Nitrogen atoms (2)                                       22
1 - Alkane group                                         21
Nitrogen atoms (6)                                       20
33 - Bromo compounds                                     19
2 - Olefin group                                         17
I1 - Aliphatic methylene chains 7 or more long           12
Thiazolidinone                                           12
Dithiocarbamate                                          12
Thiocarbonyl group                                       12
ELEC                                                     12
Aromatic NO2                                             11
Nitrogen atoms (4)                                       11
Dye 16 (1)                                               11
27 - Aromatic nitro                                      11
Imines_(not_ring)                                        10
sp3 hybridized carbon atoms (5)                          10
Nitro group                                              10
Ketone                                                   10
E3 - e.g., carbonates                                     9
48 - CH2S                                                 9
Sulphur atom (3)                                          9
sp3 hybridized carbon atoms (4)                           9
9 - Ä�Â¡arbonyl                                             9
Filter39_imine                                            8
Acyclic N-,=N and not N bound to carbonyl or sulfone      8
Vinyl_halide                                              8
Filter64_halo_ketone_sulfone                              8
Dye 25                                                    7
Filter41_12_dicarbonyl                                    7
Sulphur atom (5)                                          7
Alpha_halo_carbonyl                                       7
Oxalyl                                                    7
Stilbene                                                  7
Diketo group                                              7
Filter26_alkyl_halide                                     7
Beta halo carbonyl                                        7
dtype: int64

substarte_to_evader_feats[substarte_to_evader_feats['B7']>0].iloc[:,:-4].sum().sort_values(ascending=False).tail(20)

Nitrogen atoms (2)                   -1
B8EXC                                -1
sp2 hybridized carbon atoms (4)      -1
Oxygen-nitrogen single bond          -1
Dye 16 (1)                           -1
Nitrogen atoms (4)                   -1
Negatively charged atoms             -1
4 - Aromatic carbon-alkane           -7
sp3 hybridized carbon atoms (7)      -7
1 - Alkane group                     -7
sp3 hybridized carbon atoms (10)     -7
4-chlorobenzene                      -8
38 - Aromatic fluoro                -19
High halogen content (>3)           -22
25 - Aromatic chloro                -30
sp2 hybridized carbon atoms (8)     -54
13 - Ether                          -54
sp3 hybridized carbon atoms (6)     -54
sp3 hybridized carbon atoms (11)    -61
sp2 hybridized carbon atoms (7)    -117
dtype: int64

comp_a_lhs_overlap

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	compound_structure_A	compound_structure_B_substrate	idsmiles_A_substrate	idsmiles_B_substrate	smirks_substrate	common_core_substrate	measurement_A_substrate	measurement_B_substrate	measurement_delta_substrate	LHS_substrate	...	smirks_evader	common_core_evader	measurement_A_evader	measurement_B_evader	measurement_delta_evader	LHS_evader	RHS_evader	mol_inactive	mol_substrate	mol_evader
0	O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	45889	45890	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O	[*:1]/C=N/c1ccc([N+](=O)[O-])cc1	2.60	56.00	53.40	[*:1]c1cc(Cl)cc(Cl)c1O	...	[:1]/N=C\c1cc([:2])cc([:3])c1O>>[:2]C([*:3...	[:2]Cl.[:3]Cl.[*:1]c1ccc([N+](=O)[O-])cc1	2.60	-0.18	-2.78	[:1]/N=C\c1cc([:2])cc([*:3])c1O	[:2]C([:3])C(=O)NC(CO)C([*:1])O	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...
1	O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	45889	45890	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O	[*:1]/C=N/c1ccc([N+](=O)[O-])cc1	2.60	56.00	53.40	[*:1]c1cc(Cl)cc(Cl)c1O	...	[:1]/N=C\c1cc([:2])cc(Cl)c1[:3]>>[:1]C([*:...	[:2]Cl.[:3]O.[*:1]c1ccc([N+](=O)[O-])cc1	2.60	-0.18	-2.78	[:1]/N=C\c1cc([:2])cc(Cl)c1[*:3]	[:1]C([:3])C(CO)NC(=O)[C@@H]([*:2])Cl	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
2	O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	45889	45890	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O	[*:1]/C=N/c1ccc([N+](=O)[O-])cc1	2.60	56.00	53.40	[*:1]c1cc(Cl)cc(Cl)c1O	...	[:1]/N=C\c1cc(Cl)cc([:2])c1[:3]>>[:1]C([*:...	[:2]Cl.[:3]O.[*:1]c1ccc([N+](=O)[O-])cc1	2.60	-0.18	-2.78	[:1]/N=C\c1cc(Cl)cc([:2])c1[*:3]	[:1]C([:3])C(CO)NC(=O)[C@@H]([*:2])Cl	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
3	O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	45889	45890	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O	[*:1]/C=N/c1ccc([N+](=O)[O-])cc1	2.60	56.00	53.40	[*:1]c1cc(Cl)cc(Cl)c1O	...	[:1]/N=C\c1cc([:2])cc(Cl)c1[:3]>>[:1]C([*:...	[:2]Cl.[:3]O.[*:1]c1ccc([N+](=O)[O-])cc1	2.60	-0.18	-2.78	[:1]/N=C\c1cc([:2])cc(Cl)c1[*:3]	[:1]C([:3])C(CO)NC(=O)[C@H]([*:2])Cl	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
4	O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1	O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1	45889	45890	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O	[*:1]/C=N/c1ccc([N+](=O)[O-])cc1	2.60	56.00	53.40	[*:1]c1cc(Cl)cc(Cl)c1O	...	[:1]/N=C\c1cc(Cl)cc([:2])c1[:3]>>[:1]C([*:...	[:2]Cl.[:3]O.[*:1]c1ccc([N+](=O)[O-])cc1	2.60	-0.18	-2.78	[:1]/N=C\c1cc(Cl)cc([:2])c1[*:3]	[:1]C([:3])C(CO)NC(=O)[C@H]([*:2])Cl	<rdkit.Chem.rdchem.Mol object at 0x000002A9F42...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	CCCn1ccc(=N)cc1.I	CCCCCCCn1ccc(=N)cc1.I	28118	28233	[:1]CCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	1.42	56.99	55.57	[*:1]CCC	...	[:1]CCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	1.42	-5.56	-6.98	[*:1]CCC	[*:1]CCCCCCCCCC	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
305	CCCCn1ccc(=N)cc1.I	CCCCCCCn1ccc(=N)cc1.I	28145	28233	[:1]CCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	-14.52	56.99	71.51	[*:1]CCCC	...	[:1]CCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	-14.52	32.75	47.27	[*:1]CCCC	[*:1]CCCCCCCC	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
306	CCCCn1ccc(=N)cc1.I	CCCCCCCn1ccc(=N)cc1.I	28145	28233	[:1]CCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	-14.52	56.99	71.51	[*:1]CCCC	...	[:1]CCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	-14.52	-5.56	8.96	[*:1]CCCC	[*:1]CCCCCCCCCC	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
307	Br.CCCCCCn1ccc(=N)cc1	CCCCCCCn1ccc(=N)cc1.I	28228	28233	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	13.72	56.99	43.27	[*:1]CCCCCC	...	[:1]CCCCCC>>[:1]CCCCCCCC	[*:1]n1ccc(=N)cc1	13.72	32.75	19.03	[*:1]CCCCCC	[*:1]CCCCCCCC	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...
308	Br.CCCCCCn1ccc(=N)cc1	CCCCCCCn1ccc(=N)cc1.I	28228	28233	[:1]CCCCCC>>[:1]CCCCCCC	[*:1]n1ccc(=N)cc1	13.72	56.99	43.27	[*:1]CCCCCC	...	[:1]CCCCCC>>[:1]CCCCCCCCCC	[*:1]n1ccc(=N)cc1	13.72	-5.56	-19.28	[*:1]CCCCCC	[*:1]CCCCCCCCCC	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...	<rdkit.Chem.rdchem.Mol object at 0x000002A9F75...

309 rows Ã— 24 columns

substarte_to_evader_feats

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	0	26	28	42	43	> 2 ester groups	1 - Alkane group	1,2-Dicarbonyl not in ring	10 - Aldehyde	11 - Acetate group	...	Vinyl michael acceptor2	Vinyl_halide	Vinyl_sulphone	Primary amine, not amide	Primary or secondary amine, not amide.	tertiary aliphatic amine	carboxylic acid	Smiles	smirks_evader	smirks_substrate
0	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	[:1]/N=C\c1cc([:2])cc([:3])c1O>>[:2]C([*:3...	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O
1	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	[:1]/N=C\c1cc([:2])cc(Cl)c1[:3]>>[:1]C([*:...	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O
2	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	[:1]/N=C\c1cc(Cl)cc([:2])c1[:3]>>[:1]C([*:...	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O
3	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	[:1]/N=C\c1cc([:2])cc(Cl)c1[:3]>>[:1]C([*:...	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O
4	0	0	0	0	0	0	1	0	0	0	...	0	0	0	0	0	0	0	0	[:1]/N=C\c1cc(Cl)cc([:2])c1[:3]>>[:1]C([*:...	[:1]c1cc(Cl)cc(Cl)c1O>>[:1]c1cc(I)cc(I)c1O
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
304	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	[:1]CCC>>[:1]CCCCCCCCCC	[:1]CCC>>[:1]CCCCCCC
305	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	[:1]CCCC>>[:1]CCCCCCCC	[:1]CCCC>>[:1]CCCCCCC
306	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	[:1]CCCC>>[:1]CCCCCCCCCC	[:1]CCCC>>[:1]CCCCCCC
307	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	[:1]CCCCCC>>[:1]CCCCCCCC	[:1]CCCCCC>>[:1]CCCCCCC
308	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	[:1]CCCCCC>>[:1]CCCCCCCCCC	[:1]CCCCCC>>[:1]CCCCCCC

309 rows Ã— 761 columns

Name		Name	Last commit message	Last commit date
Latest commit History 20 Commits
data_curated		data_curated
example_workflow_files		example_workflow_files
master_functions		master_functions
README.md		README.md

domgurvic/efflux_evaders_and_substrates

Folders and files

Latest commit

History

Repository files navigation

E. coli efflux evaders and substrates - chemical space

Initial set up

Importing master dataset

Plotting WT vs tolC

Paired t-test

Defining evaders and substartes

Resulting Number of classes

Scatter Plot

Resulting evaders and substartes

OM Bias

Resulting evaders and substartes

Re-defining inactive mols

t-SNE of evaders vs substartes

t-SNE of evader + substrate + inactive

PCA of evaders, substrates and inactives

MMPA

Evader Transforms

Substrate Transforms

Transforming substarte into evaders

find correlated feats:

vis

find examples visually

physcichemical of sub to evade transforms:

cluster 8

Tripple Transforms

About

Resources

Stars

Watchers

Forks

Languages