# load conda environment
from master_functions import master_functions
# data process
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
#chem
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, Descriptors, Descriptors3D, Draw, rdMolDescriptors, Draw, PandasTools, rdFingerprintGenerator
from rdkit.DataManip.Metric.rdMetricMatrixCalc import GetTanimotoSimMat, GetTanimotoDistMat
# from rdkit.Chem.Draw import IPythonConsole
# import curated datasets
efflux_evaders_om_corrected = pd.read_pickle('data_curated/efflux_evaders_om_corrected.pkl')
efflux_substrates_om_corrected = pd.read_pickle('data_curated/efflux_substrates_om_corrected.pkl')
inactive = pd.read_pickle('data_curated/new_inactive.pkl') # this file is too big to upload to github, you can get your inactives from the inhibition file
# import master inhibition data
inhibition = pd.read_csv('data/CO-ADD_InhibitionData_r03_01-02-2020_CSV.csv', low_memory=False)
# this dataset can be downloaded from: "https://www.co-add.org/"
# check which strains are available for organism == Escherichia coli
inhibition[inhibition['ORGANISM'] == 'Escherichia coli'].STRAIN.value_counts()
ATCC 25922 82517
lpxC; MB4902 81058
tolC; MB5747 74177
Name: STRAIN, dtype: int64
# One compound has outlying values of -213.7, -278.75 and -329.47 for WT, tolC
# and lpxC respectively; it skews the data, so it is dropped.
outlier_smiles = 'S(O)(=O)(=O)c1ccccc1\\C(\\c(cc(C)c(c2Br)O)c2)=C(\\C=C3C)/C=C(C3=O)Br'
inhibition = inhibition[inhibition.SMILES != outlier_smiles]

# Define subsets: mean INHIB_AVE per SMILES for each E. coli strain.
is_e_coli = inhibition['ORGANISM'] == 'Escherichia coli'
inhib_cols = ['SMILES', 'INHIB_AVE']

wild_rows = inhibition[is_e_coli & (inhibition['STRAIN'] == 'ATCC 25922')]
e_coli_wild = wild_rows[inhib_cols].groupby('SMILES').mean().reset_index()

efflux_rows = inhibition[is_e_coli & (inhibition['STRAIN'] == 'tolC; MB5747')]
e_coli_efflux = efflux_rows[inhib_cols].groupby('SMILES').mean().reset_index()

pore_rows = inhibition[is_e_coli & (inhibition['STRAIN'] == 'lpxC; MB4902')]
e_coli_pore = pore_rows[inhib_cols].groupby('SMILES').mean().reset_index()

# Collect overlapping data: merge on SMILES (pandas' default join), so each row
# carries the wild-type value next to the mutant-strain value.
e_coli_wild_efflux = e_coli_wild[inhib_cols].merge(
    e_coli_efflux[inhib_cols], on='SMILES', suffixes=('_wild', '_efflux'))
e_coli_wild_perm = e_coli_wild[inhib_cols].merge(
    e_coli_pore[inhib_cols], on='SMILES', suffixes=('_wild', '_lpxC'))
# e_coli_wild_efflux[['INHIB_AVE_wild', 'INHIB_AVE_efflux']].plot.hist(bins=200, alpha=0.5, figsize=[10,7])
# Overlaid histograms of growth inhibition for the tolC and wild-type columns.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(7,7))
# The theme is applied a second time, with a larger font_scale, after the
# figure has been created.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.9, rc=None)
sns.histplot(e_coli_wild_efflux[['INHIB_AVE_efflux', 'INHIB_AVE_wild']], alpha=0.5, bins=150)
# Raw string: '\D' is not a valid Python escape sequence and newer
# interpreters warn about it; the resulting text is unchanged.
# NOTE(review): the labels are matched to the plotted series by position -
# confirm that 'Wild Type' really lands on INHIB_AVE_wild.
plt.legend(labels=['Wild Type', r'$\Delta TolC$'], fontsize=15)
plt.xlim([-120, 120])
plt.xlabel('Growth Inhibition based on $OD_{600}$ (%)', fontsize=22);
plt.ylabel('Number of Compounds', fontsize=22);
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
plt.tight_layout()
sns.despine()
# we can now compute a paired t-test to see whether removing TolC made a significant difference:
stats.ttest_rel(e_coli_wild_efflux['INHIB_AVE_wild'], e_coli_wild_efflux['INHIB_AVE_efflux'])
Ttest_relResult(statistic=-44.099887587864416, pvalue=0.0)
# calculate z-score:
e_coli_wild_efflux['wild_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_wild)
e_coli_wild_efflux['tolc_stds'] = stats.zscore(e_coli_wild_efflux.INHIB_AVE_efflux)
# label each compounds according to threshold of 4
threshold = 4
def label_it(row, cutoff=None):
    """Label a compound from its wild-type z-score.

    Parameters
    ----------
    row : mapping
        Row-like object exposing ``row['wild_stds']`` (the wild-type z-score).
    cutoff : float, optional
        z-score at or above which the compound counts as active. When omitted,
        the module-level ``threshold`` is used, as before.

    Returns
    -------
    str or None
        ``'active'`` when the z-score is >= the cutoff, ``'inactive'`` when it
        is below it, and ``None`` when it compares as neither (e.g. NaN).
    """
    limit = threshold if cutoff is None else cutoff
    z_score = row['wild_stds']
    if z_score >= limit:
        return 'active'
    if z_score < limit:
        return 'inactive'
    # NaN fails both comparisons; keep the historical "no label" result explicit.
    return None
e_coli_wild_efflux['wild_class'] = e_coli_wild_efflux.apply(label_it, axis=1)
def label_it_tolc(row, cutoff=None):
    """Label a compound from its tolC-strain z-score.

    Parameters
    ----------
    row : mapping
        Row-like object exposing ``row['tolc_stds']`` (the tolC z-score).
    cutoff : float, optional
        z-score at or above which the compound counts as active. When omitted,
        the module-level ``threshold`` is used, as before.

    Returns
    -------
    str or None
        ``'active'`` when the z-score is >= the cutoff, ``'inactive'`` when it
        is below it, and ``None`` when it compares as neither (e.g. NaN).
    """
    limit = threshold if cutoff is None else cutoff
    z_score = row['tolc_stds']
    if z_score >= limit:
        return 'active'
    if z_score < limit:
        return 'inactive'
    # NaN fails both comparisons; keep the historical "no label" result explicit.
    return None
e_coli_wild_efflux['tolc_class'] = e_coli_wild_efflux.apply(label_it_tolc, axis=1)
# label compounds based on combination of activity defined above
def label_substrate(row):
    """Combine the per-strain activity labels into one efflux class.

    Reads ``row['tolc_class']`` and ``row['wild_class']`` and returns:

    * ``'Efflux Substrate'`` - active in tolC, inactive in wild type
    * ``'Efflux Evader'``    - active in both
    * ``'Inactive'``         - inactive in both
    * ``'WT-only Active'``   - inactive in tolC, active in wild type

    Any other combination of values yields ``None``.
    """
    tolc_state = row['tolc_class']

    if tolc_state == 'active':
        wild_state = row['wild_class']
        if wild_state == 'inactive':
            return 'Efflux Substrate'
        if wild_state == 'active':
            return 'Efflux Evader'
        return None

    if tolc_state == 'inactive':
        wild_state = row['wild_class']
        if wild_state == 'inactive':
            return 'Inactive'
        if wild_state == 'active':
            return 'WT-only Active'

    return None
# check the numbers of classified data
e_coli_wild_efflux['Class'] = e_coli_wild_efflux.apply(label_substrate, axis=1)
e_coli_wild_efflux.Class.value_counts()
Inactive 72730
Efflux Substrate 760
Efflux Evader 200
WT-only Active 53
Name: Class, dtype: int64
# Scatter of wild-type vs tolC inhibition, coloured by assigned class, with the
# activity cut-offs drawn as dashed red lines.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(7,7))
sns.scatterplot(data = e_coli_wild_efflux, x='INHIB_AVE_wild', y='INHIB_AVE_efflux', hue='Class', s=30)
sns.despine()
# plt.legend(fontsize=20)
# plt.xlim([-120, 120])
# Raw strings: '\i' is not a valid Python escape sequence and newer
# interpreters warn about it; the resulting text is unchanged.
plt.xlabel(r'$\it{E. coli}$ WT Growth Inhibition (%)', font='Sans serif');
plt.ylabel(r'$\it{E. coli}$ $\it{tolC}$ Growth Inhibition (%)', font='Sans serif');
# plt.yticks(fontsize=20)
# plt.xticks(fontsize=20)
# plt.axvline(x=43.02, color='red', linestyle='--', alpha=0.5)
# plt.axhline(y=74.98, color='red', linestyle='--', alpha=0.5)
# Each cut-off line sits at the lowest inhibition value that still reaches the
# z-score threshold. The shared `threshold` is used (instead of a literal 4) so
# the lines cannot drift away from the labelling step.
wild_cutoff = e_coli_wild_efflux.loc[e_coli_wild_efflux['wild_stds'] >= threshold, 'INHIB_AVE_wild'].min()
tolc_cutoff = e_coli_wild_efflux.loc[e_coli_wild_efflux['tolc_stds'] >= threshold, 'INHIB_AVE_efflux'].min()
plt.axvline(x=wild_cutoff, color='red', linestyle='--', alpha=0.5)
plt.axhline(y=tolc_cutoff, color='red', linestyle='--', alpha=0.5)
plt.legend(fontsize=15)
plt.tight_layout()
plt.savefig('figures/wild_tolc_class_scatter.png', dpi=600)
# We can save those datasets separately.
# Each subset is an explicit copy: the frames are modified further down, and
# writing into a boolean-indexed slice raises SettingWithCopyWarning.
efflux_substrate = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Substrate'].copy()
efflux_evader = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Efflux Evader'].copy()
wt_only = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='WT-only Active'].copy()
# NOTE: this rebinds `inactive`, replacing the frame loaded from
# 'data_curated/new_inactive.pkl' at the top of the file.
inactive = e_coli_wild_efflux[e_coli_wild_efflux['Class']=='Inactive'].copy()
print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evader), len(efflux_substrate)))
No. of resulting evaders: 200
No. of resulting substrates: 760
# Import permeating and non-permeating datapoints; they were obtained using the
# same process as described above.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')
om_non_permeating = pd.read_pickle('data_curated/om_non_permeating.pkl')
# To compare SMILES between the datasets, first convert them all to the same
# canonical form. assign() returns a new frame, so nothing is written into a
# slice of e_coli_wild_efflux (which raised SettingWithCopyWarning).
efflux_evader = efflux_evader.assign(SMILES=efflux_evader.SMILES.apply(Chem.CanonSmiles))
efflux_substrate = efflux_substrate.assign(SMILES=efflux_substrate.SMILES.apply(Chem.CanonSmiles))
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_evader['SMILES'] = efflux_evader.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\348032441.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_substrate['SMILES'] = efflux_substrate.SMILES.apply(Chem.CanonSmiles)
# grab only evaders that are also in OM permeating class
efflux_evaders_om_corrected = efflux_evader[efflux_evader['SMILES'].isin(om_permeating['SMILES'])]
# grab only substrates that are not in non-permeating class
efflux_substrates_om_corrected = efflux_substrate[~efflux_substrate['SMILES'].isin(om_non_permeating['SMILES'])]
print('No. of resulting evaders: {} \nNo. of resulting substrates: {}'.format(len(efflux_evaders_om_corrected), len(efflux_substrates_om_corrected)))
No. of resulting evaders: 186
No. of resulting substrates: 554
e_coli_wild_efflux['mol'] = e_coli_wild_efflux.SMILES.apply(Chem.MolFromSmiles)
[20:56:01] Explicit valence for atom # 2 C, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 B, 6, is greater than permitted
[20:56:04] Explicit valence for atom # 0 C, 6, is greater than permitted
# Drop rows that contain missing values.
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing; presumably only rows whose SMILES failed to parse ('mol' is None)
# are meant to go - confirm, and consider dropna(subset=['mol']).
e_coli_wild_efflux = e_coli_wild_efflux.dropna()
# assign() returns a new frame, which avoids the SettingWithCopyWarning that a
# column assignment on the dropna() result triggered.
e_coli_wild_efflux = e_coli_wild_efflux.assign(
    SMILES=e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles))
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1164120927.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
e_coli_wild_efflux['SMILES'] = e_coli_wild_efflux.SMILES.apply(Chem.CanonSmiles)
# Parse and canonicalise the WT-only actives. assign() returns new frames, so
# no column is written into a slice of e_coli_wild_efflux.
wt_only = wt_only.assign(mol=wt_only.SMILES.apply(Chem.MolFromSmiles))
wt_only = wt_only.dropna()
wt_only = wt_only.assign(SMILES=wt_only.SMILES.apply(Chem.CanonSmiles))
# Since the efflux evaders and substrates have changed, the inactive molecules
# must be redefined as: the original dataset without evaders, substrates and
# WT-only actives.
not_inactive = pd.concat([efflux_evaders_om_corrected, efflux_substrates_om_corrected, wt_only])
# Explicit copy: the frame is modified below, and writing into a
# boolean-indexed slice raises SettingWithCopyWarning.
inactive = e_coli_wild_efflux[~e_coli_wild_efflux['SMILES'].isin(not_inactive['SMILES'])].copy()
inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)
# Remove molecules that could not be parsed, then renumber the rows.
inactive = inactive.dropna(subset=['mol']).reset_index(drop=True)
inactive['SMILES'] = inactive.SMILES.apply(Chem.CanonSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_23268\1771852805.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
inactive['mol'] = inactive.SMILES.apply(Chem.MolFromSmiles)
inactive.to_pickle('data_curated/new_inactive.pkl')
# sample of what the dataset currently looks like
efflux_substrates_om_corrected.head(5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
SMILES | INHIB_AVE_wild | INHIB_AVE_efflux | Mol | fps | abs_diff | sub_class | wild_stds | tolc_stds | wild_class | tolc_class | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
145 | Brc1cncc(-c2cc(NCCCn3ccnc3)nc(-c3ccccc3)n2)c1 | 4.60 | 80.47 | <rdkit.Chem.rdchem.Mol object at 0x000002164E6... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 75.87 | increase | 0.054629 | 4.326538 | inactive | active | Efflux Substrate |
308 | N#C/C(=N\Nc1cccc(C(F)(F)F)c1)C(N)=S | 18.36 | 87.98 | <rdkit.Chem.rdchem.Mol object at 0x000002164E6... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 69.62 | increase | 1.468421 | 4.766464 | inactive | active | Efflux Substrate |
403 | CC(C)C(=O)/C(=C/c1ccc(Cl)cc1Cl)n1cncn1 | 5.84 | 97.31 | <rdkit.Chem.rdchem.Mol object at 0x00000215D73... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 91.47 | increase | 0.182034 | 5.313003 | inactive | active | Efflux Substrate |
585 | O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1cccc([N+](... | -3.58 | 88.80 | <rdkit.Chem.rdchem.Mol object at 0x000002164E6... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 92.38 | increase | -0.785838 | 4.814498 | inactive | active | Efflux Substrate |
589 | O=C(N/N=C(/CC(=O)c1cccs1)C(F)(F)F)c1ccc(Cl)cc1 | 20.78 | 77.14 | <rdkit.Chem.rdchem.Mol object at 0x000002164E6... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | 56.36 | increase | 1.717067 | 4.131471 | inactive | active | Efflux Substrate |
# We need to compute fingerprints from SMILES for t-SNE.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Evaders: parse SMILES, drop molecules that failed to parse, then fingerprint.
# assign()/dropna() return new frames, so nothing is written in place into a
# slice of another DataFrame (the source of the SettingWithCopyWarning noise).
efflux_evaders_om_corrected = efflux_evaders_om_corrected.assign(
    mol=efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
).dropna(subset=['mol'])
efflux_evaders_om_corrected = efflux_evaders_om_corrected.assign(
    fps=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint))

# Substrates: same three steps.
efflux_substrates_om_corrected = efflux_substrates_om_corrected.assign(
    mol=efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
).dropna(subset=['mol'])
efflux_substrates_om_corrected = efflux_substrates_om_corrected.assign(
    fps=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint))
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_evaders_om_corrected['mol'] = efflux_evaders_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:6: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_evaders_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_evaders_om_corrected['fps']=efflux_evaders_om_corrected.mol.apply(mfpgen.GetFingerprint)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:12: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_substrates_om_corrected['mol'] = efflux_substrates_om_corrected.SMILES.apply(Chem.MolFromSmiles)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:13: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_substrates_om_corrected.dropna(subset=['mol'], inplace=True)
C:\Users\domin\AppData\Local\Temp\ipykernel_4000\1058469890.py:15: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
efflux_substrates_om_corrected['fps']=efflux_substrates_om_corrected.mol.apply(mfpgen.GetFingerprint)
# combine two datasets and reset index
sub_and_evade_om_corrected = pd.concat([efflux_evaders_om_corrected,efflux_substrates_om_corrected]).reset_index(drop=True)
def tsne_no_plot(df, perp, random_state=None):
    """Embed fingerprints in 2-D with t-SNE, without plotting.

    A full Tanimoto similarity matrix is rebuilt from the lower triangle
    returned by ``GetTanimotoSimMat``, converted to distances (1 - similarity)
    and passed to scikit-learn's ``TSNE``.

    Parameters
    ----------
    df : pandas.Series
        Fingerprints; ``df.values`` is handed to ``GetTanimotoSimMat``.
    perp : float
        t-SNE perplexity.
    random_state : int or None, optional
        Forwarded to ``TSNE``. Pass an integer to make the embedding
        reproducible; the default (None) keeps the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``TC1`` and ``TC2``, one row per input fingerprint, in input
        order, with a fresh 0..n-1 index.
    """
    fingerprints = df.values
    n_mol = len(fingerprints)

    # Only the strict lower triangle comes back (as a flat sequence); the
    # diagonal stays at 1 and the upper triangle is mirrored from the lower.
    lower_triangle = GetTanimotoSimMat(fingerprints)
    similarity = np.ones([n_mol, n_mol])
    lower_idx = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    upper_idx = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity[lower_idx] = lower_triangle
    similarity[upper_idx] = similarity.T[upper_idx]

    distance = 1 - similarity

    # NOTE(review): no `metric` argument is passed, so TSNE receives the
    # distance matrix as an ordinary feature matrix rather than as precomputed
    # distances - confirm this is intended.
    embedding = TSNE(
        verbose=1,
        n_components=2,
        init='pca',
        method='barnes_hut',
        perplexity=perp,
        random_state=random_state,
    ).fit_transform(distance)
    return pd.DataFrame(data=embedding, columns=["TC1", "TC2"])
sub_and_evade_om_corrected_tsne = tsne_no_plot(sub_and_evade_om_corrected['fps'], perp=50)
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne, s=30 ,alpha=0.9, hue=sub_and_evade_om_corrected['Class'])
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))
sns.kdeplot(x='TC1',y='TC2',data=sub_and_evade_om_corrected_tsne,alpha=0.7, hue=sub_and_evade_om_corrected['Class'], levels = 4)
# plt.legend(fontsize=20)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 740 samples in 0.001s...
[t-SNE] Computed neighbors for 740 samples in 0.126s...
[t-SNE] Computed conditional probabilities for sample 740 / 740
[t-SNE] Mean sigma: 0.709102
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.515961
[t-SNE] KL divergence after 1000 iterations: 0.638264
<Axes: xlabel='TC1', ylabel='TC2'>
inactive_sample = inactive.sample(500, random_state= 42)
inactive_sample['mol'] = inactive_sample.SMILES.apply(Chem.MolFromSmiles)
inactive_sample.dropna(subset=['mol'], inplace=True)
inactive_sample['fps']=inactive_sample.mol.apply(mfpgen.GetFingerprint)
# add sample of inactive mols into the mix
sub_evade_inactive = pd.concat([sub_and_evade_om_corrected, inactive_sample])
sub_evade_inactive.reset_index(drop=True, inplace=True)
sub_evade_inactive_tsne = tsne_no_plot(sub_evade_inactive['fps'], perp=50)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1240 samples in 0.002s...
[t-SNE] Computed neighbors for 1240 samples in 0.083s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1240
[t-SNE] Computed conditional probabilities for sample 1240 / 1240
[t-SNE] Mean sigma: 0.725025
[t-SNE] KL divergence after 250 iterations with early exaggeration: 65.622444
[t-SNE] KL divergence after 1000 iterations: 0.954569
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, s=20 ,alpha=0.5, hue=sub_evade_inactive['Class'], legend=False)
sns.kdeplot(x='TC1',y='TC2',data=sub_evade_inactive_tsne, hue=sub_evade_inactive['Class'], levels = 2, linewidths=2)
sns.despine()
plt.savefig('tsne_all.svg')
We find some overlapping compounds
om_permeating = pd.read_pickle('data_curated/om_permeating.pkl')
mfpgen =rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)
om_permeating['mol'] = om_permeating.SMILES.apply(Chem.MolFromSmiles)
om_permeating.dropna(subset=['mol'], inplace=True)
om_permeating['fps']=om_permeating.mol.apply(mfpgen.GetFingerprint)
sub_evade_om_permeating = pd.concat([sub_and_evade_om_corrected, om_permeating])
sub_evade_om_permeating.reset_index(drop=True, inplace=True)
sub_evade_om_permeating_tsne = tsne_no_plot(sub_evade_om_permeating['fps'], perp=50)
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne, s=30 ,alpha=0.9, hue=sub_evade_om_permeating['Class'])
# plt.legend(fontsize=20)
fig, ax = plt.subplots(figsize=(8,8))
sns.kdeplot(x='TC1',y='TC2',data=sub_evade_om_permeating_tsne,alpha=0.7, hue=sub_evade_om_permeating['Class'], levels = 4)
# plt.legend(fontsize=20)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 961 samples in 0.001s...
[t-SNE] Computed neighbors for 961 samples in 0.068s...
[t-SNE] Computed conditional probabilities for sample 961 / 961
[t-SNE] Mean sigma: 0.854897
[t-SNE] KL divergence after 250 iterations with early exaggeration: 61.729122
[t-SNE] KL divergence after 1000 iterations: 0.519036
<Axes: xlabel='TC1', ylabel='TC2'>
We'll use the same dataset as for t-SNE.
from descriptastorus.descriptors.DescriptorGenerator import MakeGenerator
from tqdm import trange, tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def calcualte_features_single(df, col):
    """Compute descriptastorus ``rdkit2d`` descriptors for ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the molecules.
    col : str
        Name of the column whose values are passed to ``generator.process``
        (the caller in this file passes the SMILES column).

    Returns
    -------
    pandas.DataFrame
        One row per successfully processed entry, with a fresh 0..n-1 index.
        Columns are the generator's column names minus the first one, which
        is the success flag. Entries that fail are printed and skipped, so
        the result can be shorter than ``df``.
        NOTE(review): callers that align the result with ``df`` by index will
        be off by the number of skipped rows whenever any entry fails.
    """
    generator = MakeGenerator(("rdkit2d",))
    # The first generator column is the success flag, not a descriptor.
    feature_names = [column[0] for column in generator.GetColumns()][1:]

    values = df[col].values
    rows = []
    print('Computing features: ')
    for i in trange(len(values)):
        result = generator.process(values[i])
        # Guard against a missing result as well as a falsy success flag.
        if result is not None and result[0]:
            rows.append(result[1:])
        else:
            print('left: ', None if result is None else result[0])
            print(values[i])

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # was quadratic when called per row. This also yields an empty, correctly
    # labelled frame when every entry failed.
    return pd.DataFrame(rows, columns=feature_names)
sub_evade_inactive_features = calcualte_features_single(sub_evade_inactive, 'SMILES')
sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']
Computing features:
100%|██████████████████████████████████████████████████████████████████████████████| 1240/1240 [00:13<00:00, 90.43it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1240/1240 [00:00<00:00, 1435.85it/s]
sub_evade_inactive_features
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
BalabanJ | BertzCT | Chi0 | Chi0n | Chi0v | Chi1 | Chi1n | Chi1v | Chi2n | Chi2v | ... | fr_sulfone | fr_term_acetylene | fr_tetrazole | fr_thiazole | fr_thiocyan | fr_thiophene | fr_unbrch_alkane | fr_urea | qed | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.508772 | 249.116352 | 6.974691 | 5.449320 | 5.449320 | 4.877010 | 3.252155 | 3.252155 | 2.362178 | 2.362178 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.501865 | Efflux Evader |
1 | 1.508609 | 845.728650 | 20.597801 | 16.576049 | 18.162045 | 14.775990 | 9.905963 | 10.698961 | 6.767766 | 7.683442 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.504707 | Efflux Evader |
2 | 0.000001 | 653.569301 | 14.396977 | 11.850173 | 15.811520 | 10.203510 | 7.173237 | 9.562159 | 5.658176 | 9.088344 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.599582 | Efflux Evader |
3 | 2.939539 | 420.685437 | 12.344935 | 7.754071 | 9.340068 | 7.303549 | 4.082377 | 4.875376 | 2.898481 | 3.814156 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.669689 | Efflux Evader |
4 | 2.603746 | 310.650557 | 9.681798 | 6.788319 | 7.544248 | 6.236382 | 3.689747 | 4.224269 | 2.376957 | 2.644218 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.588792 | Efflux Evader |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1235 | 1.003357 | 1984.841727 | 34.329487 | 27.979443 | 27.979443 | 23.749555 | 16.372378 | 16.372378 | 12.307394 | 12.307394 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.174004 | Inactive |
1236 | 1.749666 | 1383.833437 | 21.957455 | 16.503270 | 17.259199 | 15.011570 | 9.340691 | 9.718655 | 6.803797 | 7.210998 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.427471 | Inactive |
1237 | 1.531621 | 1346.959571 | 25.070339 | 20.361266 | 21.947263 | 17.546045 | 12.317981 | 13.110979 | 9.033422 | 9.887759 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.302495 | Inactive |
1238 | 1.868993 | 1028.780943 | 15.648054 | 12.477331 | 12.477331 | 11.326500 | 7.553489 | 7.553489 | 5.475973 | 5.475973 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.490238 | Inactive |
1239 | 2.184490 | 517.236837 | 13.120956 | 10.329726 | 11.146223 | 8.592224 | 5.624243 | 6.503896 | 4.230048 | 5.177742 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.860297 | Inactive |
1240 rows × 201 columns
# Drop feature columns that contain NaNs (only 4 columns here).
sub_evade_inactive_features=sub_evade_inactive_features.dropna(axis=1)
# PCA on all physicochemical features:
table = sub_evade_inactive_features
# The non-redundant molecular descriptors chosen for PCA.
# NOTE(review): the last 87 columns are skipped by position; presumably these
# are the non-physicochemical columns (including 'Class') - confirm, since the
# count depends on how many columns the dropna above removed.
descriptors = table.iloc[:,:-87] # grab only physicochemical values
descriptors_std = StandardScaler().fit_transform(descriptors) # important to avoid scaling problems between our different descriptors
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d) # saving PCA values to a new table
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]
descriptors_pca.head(5) # displays the PCA table (notebook only)
# Scale factors: 1 / (range of the component).
scale1 = 1.0/(descriptors_pca['PC1'].max() - descriptors_pca['PC1'].min())
scale2 = 1.0/(descriptors_pca['PC2'].max() - descriptors_pca['PC2'].min())
# And we add the new values to our PCA table (vectorised, same values as the
# element-wise loop).
descriptors_pca['PC1_normalized'] = descriptors_pca['PC1'] * scale1
descriptors_pca['PC2_normalized'] = descriptors_pca['PC2'] * scale2
descriptors_pca['Class'] = sub_evade_inactive_features['Class']
# plt.rcParams['axes.linewidth'] = 1.5
cmap = sns.diverging_palette(133, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])
pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))
plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')
plt.tick_params ('both',width=2,labelsize=14)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles=handles[1:], labels=labels[1:])
#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)
plt.tight_layout()
# plt.savefig('figures/pca_evade_substrate.png', dpi=600)
plt.show()
print('same but in contours, for ease of read')
cmap = sns.diverging_palette(133, 10, as_cmap=True)
############ kdeplot
fig, ax = plt.subplots(figsize=(10,7))
sns.set_style("ticks")
# sns.set(font_scale=2)
sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)
pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))
plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)
plt.tick_params ('both',width=2,labelsize=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles=handles[1:], labels=labels[1:])
#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)
# plt.legend()
plt.tight_layout()
# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)
same but in contours, for ease of read
The explained variance is too low, so I will choose only 8 main features for PCA next:
# PCA on only 8 main physicochemical features:
table = sub_evade_inactive_features
# The non-redundant molecular descriptors chosen for PCA.
descriptors = table[['MolWt', 'MolLogP', 'RingCount','FractionCSP3', 'TPSA','NumHAcceptors', 'NumHDonors', 'NumRotatableBonds' ]].values
# descriptors = table.iloc[:,:-87]
descriptors_std = StandardScaler().fit_transform(descriptors)
pca = PCA()
descriptors_2d = pca.fit_transform(descriptors_std)
descriptors_pca= pd.DataFrame(descriptors_2d)
descriptors_pca.index = table.index
descriptors_pca.columns = ['PC{}'.format(i+1) for i in descriptors_pca.columns]
# Scale factors: 1 / (range of the component).
scale1 = 1.0/(descriptors_pca['PC1'].max() - descriptors_pca['PC1'].min())
scale2 = 1.0/(descriptors_pca['PC2'].max() - descriptors_pca['PC2'].min())
# And we add the new values to our PCA table (vectorised, same values as the
# element-wise loop).
descriptors_pca['PC1_normalized'] = descriptors_pca['PC1'] * scale1
descriptors_pca['PC2_normalized'] = descriptors_pca['PC2'] * scale2
descriptors_pca['Class'] = sub_evade_inactive_features['Class']
# plt.rcParams['axes.linewidth'] = 1.5
cmap = sns.diverging_palette(133, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(10,5))
sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.7, hue='Class', s=20)#, palette=["C0", "C1", "C2", "k"])
pca_lab = ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))
plt.xlabel(pca_lab[0],fontsize=16,fontweight='bold')
plt.ylabel(pca_lab[1],fontsize=16,fontweight='bold')
plt.tick_params ('both',width=2,labelsize=14)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles=handles[1:], labels=labels[1:])
#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)
plt.tight_layout()
# plt.savefig('figures/pca_evade_substrate.png', dpi=600)
plt.show()
print('same but in contours, for ease of read')
cmap = sns.diverging_palette(133, 10, as_cmap=True)
############ kdeplot
fig, ax = plt.subplots(figsize=(10,7))
sns.set_style("ticks")
# sns.set(font_scale=2)
sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=3,)
pca_lab= ('PC1 '+str([np.round(pca.explained_variance_ratio_[0]*100, 1)]), 'PC2 '+str([np.round(pca.explained_variance_ratio_[1]*100, 1)]))
plt.xlabel(pca_lab[0],fontweight='bold',fontsize=22)
plt.ylabel(pca_lab[1],fontweight='bold', fontsize=22)
plt.tick_params ('both',width=2,labelsize=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
handles, labels = ax.get_legend_handles_labels()
#ax.legend(handles=handles[1:], labels=labels[1:])
#plt.legend(loc='lower right',frameon=False,prop={'size': 22},ncol=1)
# plt.legend()
plt.tight_layout()
# plt.savefig('figures/pca_evade_substrate_contour.png', dpi=600)
same but in contours, for ease of read
# Combined figure: PCA scatter with per-class density contours on top.
sns.set(context='notebook', style='ticks', font='Times New Roman', font_scale=1.6, rc=None)
fig, ax = plt.subplots(figsize=(8,8))
sns.scatterplot(x='PC1',y='PC2',data=descriptors_pca, alpha=0.5, hue='Class', s=20)
sns.kdeplot(x='PC1',y='PC2',data=descriptors_pca, hue='Class' , levels=2, linewidths=2)
# Axis labels carry the explained variance of each component in percent.
# They are set once, after both layers are drawn; the earlier bracketed labels
# were overwritten by these before the figure was finished.
pca_lab= ('PC1 - '+str(np.round(pca.explained_variance_ratio_[0]*100, 1)) + '%', 'PC2 - '+str(np.round(pca.explained_variance_ratio_[1]*100, 1)) + '%')
plt.xlabel(pca_lab[0])
plt.ylabel(pca_lab[1])
sns.despine()
# plt.savefig('pca_all.svg')
Similar result: the explained variance is about 70%, but the classes are still not separated at all.
To carry out mmpa I used modified mmpdb : https://github.com/rdkit/mmpdb
publication : https://doi.org/10.1021/acs.jcim.8b00173
# import results from MMPA:
efflux_mmpa_index = pd.read_pickle('data_curated/efflux_mmpa_index.pkl')
# it contains 1.4M pairs
def split_transition(df, col):
    """Split 'LHS>>RHS' transform strings into separate columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate. It is modified in place and also returned.
    col : str
        Name of the column holding strings of the form ``'<lhs>>><rhs>'``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with two added columns: ``'LHS'`` (the text before the first
        ``'>>'``) and ``'RHS'`` (the text between the first and second
        ``'>>'``, i.e. everything after it in the usual single-separator case).

    Raises
    ------
    IndexError
        If a value in ``df[col]`` contains no ``'>>'``.
    """
    # Plain str.split replaces re.split: the separator is a literal, and `re`
    # is not imported in this file. Each value is split exactly once.
    sides = [transform.split('>>') for transform in df[col].values]
    df['LHS'] = [parts[0] for parts in sides]
    df['RHS'] = [parts[1] for parts in sides]
    return df
def mols_to_NHA(mol):
    """Return the heavy-atom count of the pattern parsed from SMARTS ``mol``.

    ``mol`` is passed to ``Chem.MolFromSmarts`` and ``GetNumHeavyAtoms()`` is
    called on the result.
    """
    pattern = Chem.MolFromSmarts(mol)
    heavy_atom_count = pattern.GetNumHeavyAtoms()
    return heavy_atom_count
def clean_mmpa_pairs_len(mmpa_df):
    """Drop transforms whose exchanged fragments are not smaller than the common core.

    For every row the heavy-atom counts of ``common_core``, ``LHS`` and ``RHS``
    are computed with ``mols_to_NHA``. A row is disqualified when BOTH the LHS
    and the RHS fragment have at least as many heavy atoms as the common core.
    A summary of how many rows were removed is printed.

    Parameters
    ----------
    mmpa_df : pandas.DataFrame
        Needs a ``'common_core'`` column and either ``'LHS'``/``'RHS'`` columns
        or a ``'smirks'`` column from which they are derived via
        ``split_transition``.

    Returns
    -------
    pandas.DataFrame
        ``mmpa_df`` without the disqualified rows.
    """
    if 'LHS' not in mmpa_df.columns:  # add LHS and RHS if not present
        mmpa_df = split_transition(mmpa_df, 'smirks')

    # The length check runs whether or not LHS/RHS had to be created first
    # (it used to be skipped when they were missing).
    temp = pd.DataFrame(index=mmpa_df.index)  # heavy-atom counts, aligned with mmpa_df
    temp['common_core_HA'] = mmpa_df['common_core'].apply(mols_to_NHA)
    temp['LHS_HA'] = mmpa_df['LHS'].apply(mols_to_NHA)
    # Count the RHS fragment itself (this was previously computed from 'LHS').
    temp['RHS_HA'] = mmpa_df['RHS'].apply(mols_to_NHA)

    failed = (temp['LHS_HA'] >= temp['common_core_HA']) & (temp['RHS_HA'] >= temp['common_core_HA'])
    n_failed = int(failed.sum())

    mmpa_df = mmpa_df.drop(temp[failed].index)  # drop index that failed length check
    n_remaining = len(mmpa_df)

    print('Initial number of transofrms: {} \nNumber fo transforms disqualified based on length discrepancy: {} \nRemaining number of transforms: {}'.format(n_failed + n_remaining, n_failed, n_remaining))
    return mmpa_df
# find evader transforms:
# rows of the MMPA index whose compound B is listed in efflux_evaders_om_corrected
# and whose compound A is listed in inactive, passed through the length filter
evader_transforms = clean_mmpa_pairs_len(
    efflux_mmpa_index[
        efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)
        & efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES)
    ]
)
Initial number of transofrms: 2468
Number fo transforms disqualified based on length discrepancy: 1856
Remaining number of transforms: 612
# Number of distinct compound B structures among the evader transforms.
len(evader_transforms.compound_structure_B.unique())
77
# Number of distinct compound A structures among the evader transforms.
len(evader_transforms.compound_structure_A.unique())
397
# find substrate transforms:
# rows of the MMPA index whose compound B is listed in efflux_substrates_om_corrected
# and whose compound A is listed in inactive, passed through the length filter
substrate_transforms = clean_mmpa_pairs_len(
    efflux_mmpa_index[
        efflux_mmpa_index['compound_structure_B'].isin(efflux_substrates_om_corrected.SMILES)
        & efflux_mmpa_index['compound_structure_A'].isin(inactive.SMILES)
    ]
)
Initial number of transofrms: 6827
Number fo transforms disqualified based on length discrepancy: 1927
Remaining number of transforms: 4900
# Number of distinct compound A structures among the substrate transforms.
len(substrate_transforms.compound_structure_A.unique())
2053
# Number of distinct compound B structures among the substrate transforms.
len(substrate_transforms.compound_structure_B.unique())
349
def calculate_fractions_mk7_new_smarts_spec(df, smirks, measurement_delta, measurement_A, measurement_B):
    """Tabulate substructure matches for both molecules of every pair in ``df``.

    The substructure patterns and their names come from ``new_smarts()``.
    The names of the ``df`` columns to read are passed in as arguments.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pair. Must contain ``'mol_a'`` and ``'mol_b'`` columns whose
        values provide ``HasSubstructMatch``, plus the columns named below.
    smirks : str
        Column copied to the ``'smirks'`` output column.
    measurement_delta : str
        Column copied to the ``'target'`` output column.
    measurement_A, measurement_B : str
        Columns copied to ``'measurement'`` of the left / right frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(diff, left, right)``, each with a fresh 0..n-1 index.
        ``left`` / ``right`` hold one 0/1 column per substructure name for
        ``mol_a`` / ``mol_b``, followed by ``'smirks'``, ``'measurement'`` and
        ``'target'``. ``diff`` holds right minus left for the substructure
        columns, followed by ``'smirks'``, ``'measurement_A'``,
        ``'measurement_B'`` and ``'target'``.
        An empty ``df`` yields three empty frames with those columns.
    """
    mol_substructures, group_names = new_smarts()
    name_substructure = group_names + ['smirks', 'measurement', 'target']

    print('Calcualting LHS+RHS matches')

    # Read every input column once instead of once per row.
    smirks_values = df[smirks].tolist()
    delta_values = df[measurement_delta].tolist()

    def match_table(mols, measurements):
        # One row per molecule: 0/1 flag per substructure name, then metadata.
        rows = []
        for mol, transform, measured, delta in zip(mols, smirks_values, measurements, delta_values):
            # Collect matches by name, so columns sharing a name get the same flag.
            hits = {name for name, sub in zip(group_names, mol_substructures)
                    if mol.HasSubstructMatch(sub)}
            flags = [int(name in hits) for name in group_names]
            rows.append(flags + [transform, measured, delta])
        # Object dtype keeps flags, strings and measurements side by side;
        # the reshape keeps the column count correct when there are no rows.
        cells = np.array(rows, dtype=object).reshape(len(rows), len(name_substructure))
        return pd.DataFrame(cells, columns=name_substructure)

    # Compare left hand side
    frame_left_df = match_table(df['mol_a'].tolist(), df[measurement_A].tolist())
    # Compare right hand side
    frame_right_df = match_table(df['mol_b'].tolist(), df[measurement_B].tolist())

    # The last three columns are metadata, not substructure flags.
    diff = frame_right_df.iloc[:, :-3] - frame_left_df.iloc[:, :-3]
    diff['smirks'] = frame_right_df['smirks']
    diff['measurement_A'] = frame_left_df['measurement']
    diff['measurement_B'] = frame_right_df['measurement']
    diff['target'] = frame_right_df['target']

    return diff.reset_index(drop=True), frame_left_df.reset_index(drop=True), frame_right_df.reset_index(drop=True)
def new_smarts():
    """Load the substructure definitions and parse their SMARTS patterns.

    Reads ``ml_mmpa/fg_smarts_2.csv`` and passes every entry of its ``SMARTS``
    column to ``Chem.MolFromSmarts``.

    Returns
    -------
    tuple
        ``(patterns, names)``: the list of parsed patterns and the ``name``
        column of the file as a list, both in file order.
    """
    # print(os.getcwd())
    func_groups = pd.read_csv('ml_mmpa/fg_smarts_2.csv')

    # Build one pattern object per substructure definition.
    print('Generating molecular objects from pre-defined substructures')
    mol_substructures = [Chem.MolFromSmarts(smarts) for smarts in func_groups.SMARTS]

    return mol_substructures, func_groups.name.to_list()
def calculate_fractions_mk7_new_smarts(df):
    """Tabulate substructure matches for both molecules of every pair in ``df``.

    The substructure patterns and their names come from ``new_smarts()``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per pair. Must contain ``'mol_a'`` and ``'mol_b'`` columns whose
        values provide ``HasSubstructMatch``, plus the columns ``'smirks'``,
        ``'measurement_delta'``, ``'measurement_A'`` and ``'measurement_B'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(diff, left, right)``, each with a fresh 0..n-1 index.
        ``left`` / ``right`` hold one 0/1 column per substructure name for
        ``mol_a`` / ``mol_b``, followed by ``'smirks'``, ``'measurement'``
        (from ``measurement_A`` / ``measurement_B``) and ``'target'`` (from
        ``measurement_delta``). ``diff`` holds right minus left for the
        substructure columns, followed by ``'smirks'``, ``'measurement_A'``,
        ``'measurement_B'`` and ``'target'``.
        An empty ``df`` yields three empty frames with those columns.
    """
    mol_substructures, group_names = new_smarts()
    name_substructure = group_names + ['smirks', 'measurement', 'target']

    print('Calcualting LHS+RHS matches')

    # Read every input column once instead of once per row.
    smirks_values = df['smirks'].tolist()
    delta_values = df['measurement_delta'].tolist()

    def match_table(mols, measurements):
        # One row per molecule: 0/1 flag per substructure name, then metadata.
        rows = []
        for mol, transform, measured, delta in zip(mols, smirks_values, measurements, delta_values):
            # Collect matches by name, so columns sharing a name get the same flag.
            hits = {name for name, sub in zip(group_names, mol_substructures)
                    if mol.HasSubstructMatch(sub)}
            flags = [int(name in hits) for name in group_names]
            rows.append(flags + [transform, measured, delta])
        # Object dtype keeps flags, strings and measurements side by side;
        # the reshape keeps the column count correct when there are no rows.
        cells = np.array(rows, dtype=object).reshape(len(rows), len(name_substructure))
        return pd.DataFrame(cells, columns=name_substructure)

    # Compare left hand side
    frame_left_df = match_table(df['mol_a'].tolist(), df['measurement_A'].tolist())
    # Compare right hand side
    frame_right_df = match_table(df['mol_b'].tolist(), df['measurement_B'].tolist())

    # The last three columns are metadata, not substructure flags.
    diff = frame_right_df.iloc[:, :-3] - frame_left_df.iloc[:, :-3]
    diff['smirks'] = frame_right_df['smirks']
    diff['measurement_A'] = frame_left_df['measurement']
    diff['measurement_B'] = frame_right_df['measurement']
    diff['target'] = frame_right_df['target']

    return diff.reset_index(drop=True), frame_left_df.reset_index(drop=True), frame_right_df.reset_index(drop=True)
def new_smarts():
    """Load the substructure definitions and parse their SMARTS patterns.

    Reads ``ml_mmpa/fg_smarts_2.csv`` and passes every entry of its ``SMARTS``
    column to ``Chem.MolFromSmarts``.

    Returns
    -------
    tuple
        ``(patterns, names)``: the list of parsed patterns and the ``name``
        column of the file as a list, both in file order.
    """
    # print(os.getcwd())
    func_groups = pd.read_csv('ml_mmpa/fg_smarts_2.csv')

    # Build one pattern object per substructure definition.
    print('Generating molecular objects from pre-defined substructures')
    mol_substructures = [Chem.MolFromSmarts(smarts) for smarts in func_groups.SMARTS]

    return mol_substructures, func_groups.name.to_list()
# find substrate to evader transforms:
# rows of the MMPA index whose compound B is listed in efflux_evaders_om_corrected
# and whose compound A is listed in efflux_substrates_om_corrected.
# .copy() makes the selection an independent frame, so columns can be added to
# it later without pandas raising SettingWithCopyWarning.
sub_to_evader_transforms = efflux_mmpa_index[
    efflux_mmpa_index['compound_structure_B'].isin(efflux_evaders_om_corrected.SMILES)
    & efflux_mmpa_index['compound_structure_A'].isin(efflux_substrates_om_corrected.SMILES)
].copy()
# (number of transforms, distinct compound A, distinct compound B)
len(sub_to_evader_transforms), len(sub_to_evader_transforms.compound_structure_A.unique()), len(sub_to_evader_transforms.compound_structure_B.unique())
(60, 26, 24)
# Show the transforms whose compound B is this particular SMILES.
sub_to_evader_transforms[sub_to_evader_transforms['compound_structure_B']=='O=[N+]([O-])c1ccc2no[n+]([O-])c2c1']
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B | idsmiles_A | idsmiles_B | smirks | common_core | measurement_A | measurement_B | measurement_delta | LHS | RHS | mol_a | mol_b | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1037285 | O=Cc1cc([N+](=O)[O-])cc(I)c1O | O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 | 43577 | 47709 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... | [*:1][N+](=O)[O-] | 55.67 | -1.98 | -57.65 | [*:1]c1cc(I)c(O)c(C=O)c1 | [*:1]c1ccc2no[n+]([O-])c2c1 | <rdkit.Chem.rdchem.Mol object at 0x000002AA5A4... | <rdkit.Chem.rdchem.Mol object at 0x000002AA5A2... |
1038977 | Cn1nc([N+](=O)[O-])c[n+]1[O-] | O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 | 47632 | 47709 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... | [*:1][N+](=O)[O-] | 42.16 | -1.98 | -44.14 | [*:1]c1c[n+]([O-])n(C)n1 | [*:1]c1ccc2no[n+]([O-])c2c1 | <rdkit.Chem.rdchem.Mol object at 0x000002AA5A4... | <rdkit.Chem.rdchem.Mol object at 0x000002AA5A2... |
# Count how many transforms link each (compound A, compound B) pair.
new_df = sub_to_evader_transforms.groupby(['compound_structure_A', 'compound_structure_B']).size().reset_index(name='Freq')
# Show one row per compound B (the first occurrence is kept).
new_df.drop_duplicates(subset=['compound_structure_B'])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B | Freq | |
---|---|---|---|
0 | CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 | CC(=O)Cn1nnc([N+](=O)[O-])n1 | 4 |
1 | CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 | Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-] | 1 |
2 | CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1 | O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-] | 1 |
3 | CCCCCCCn1ccc(=N)cc1.I | Br.CCCCCCCCCCn1ccc(=N)cc1 | 1 |
4 | CCCCCCCn1ccc(=N)cc1.I | Br.CCCCCCCCn1ccc(=N)cc1 | 1 |
5 | CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | 3 |
6 | CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O... | CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1 | 3 |
8 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C... | 1 |
9 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21 | 1 |
10 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C... | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... | 1 |
11 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)... | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=... | 1 |
12 | COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)... | COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)... | 2 |
15 | Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1 | Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1 | 1 |
16 | Cn1cnc([N+](=O)[O-])c1Oc1ccccc1 | Cn1cnc([N+](=O)[O-])c1S(=O)CC#N | 1 |
17 | Cn1cnc([N+](=O)[O-])c1Oc1ccccc1 | Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C | 1 |
18 | Cn1nc([N+](=O)[O-])c[n+]1[O-] | C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-] | 1 |
22 | Cn1nc([N+](=O)[O-])c[n+]1[O-] | Nc1nonc1[N+](=O)[O-] | 1 |
23 | Cn1nc([N+](=O)[O-])c[n+]1[O-] | O=C(O)/C=C/c1ccc([N+](=O)[O-])o1 | 1 |
24 | Cn1nc([N+](=O)[O-])c[n+]1[O-] | O=[N+]([O-])c1ccc2no[n+]([O-])c2c1 | 1 |
25 | N#Cc1c(Cl)nc(NN)c(Cl)c1Cl | N#Cc1nc(Cl)c2sc(=O)sc2c1Cl | 1 |
26 | N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl | O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F | 2 |
33 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl | 2 |
34 | Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1 | O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1 | 1 |
35 | Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1 | O=C(CCl)C(=O)Nc1ccccc1 | 1 |
# Number of transforms left after keeping one row per compound B.
len(sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']))
24
# Averaged inhibition values for one compound, from the wild-type and efflux columns.
# Raw string: the SMILES contains a backslash, which must not be read as a
# string escape sequence.
e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == r'N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
array([21.71, 90.83])
# Averaged inhibition values for one compound, from the wild-type and efflux columns.
e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == 'O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F'][['INHIB_AVE_wild', 'INHIB_AVE_efflux']].values[0]
array([48.74, 93. ])
# Compound A of the first transform after keeping one row per compound B.
sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B']).iloc[0].compound_structure_A
'N/C(=C\\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl'
# For one transform per distinct compound B, print the averaged inhibition of
# compound A and compound B as read from e_coli_wild_efflux.
# The de-duplicated frame does not change between iterations, so it is built
# once, and the loop length follows it instead of a hard-coded count.
# The unused comp_a / comp_b lookups were removed; they relied on names
# (`dex`, `a`) that this cell does not define.
trans = sub_to_evader_transforms.drop_duplicates(subset=['compound_structure_B'])
inhib_columns = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
for i in range(len(trans)):
    pair = trans.iloc[i]

    # wt and efflux pre
    pre = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == pair.compound_structure_A][inhib_columns].values[0]
    # wt and efflux post
    post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == pair.compound_structure_B][inhib_columns].values[0]

    print(i+1)
    print(pair.compound_structure_A)
    print('WT: {}%, tolC: {}%'.format(pre[0], pre[1]))
    print(pair.compound_structure_B)
    print('WT: {}%, tolC: {}%'.format(post[0], post[1]))
1
N/C(=C\C(=O)/C=C/c1cccs1)C(Cl)(Cl)Cl
WT: 21.71%, tolC: 90.83%
O=C(/C=C/c1cccs1)CC(=O)C(F)(F)F
WT: 48.74%, tolC: 93.0%
2
Oc1c(Cl)cc(Br)cc1/C=N/c1ccc(F)cc1
WT: 39.12%, tolC: 96.44%
O=[N+]([O-])c1ccc(/C=N/c2ccc(F)cc2)o1
WT: 93.81%, tolC: 91.72%
3
Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1
WT: 37.97%, tolC: 100.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%
4
N#Cc1c(Cl)nc(NN)c(Cl)c1Cl
WT: -0.99%, tolC: 86.71%
N#Cc1nc(Cl)c2sc(=O)sc2c1Cl
WT: 80.76%, tolC: 76.9%
5
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C45CC6CC(CC(C6)C4)C5)CC3)cc21
WT: 9.66%, tolC: 97.46%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c4c(OC)cccc4OC)CC3)cc21
WT: 92.72%, tolC: 91.71%
6
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
CC(=O)Cn1nnc([N+](=O)[O-])n1
WT: 45.0%, tolC: 77.9%
7
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Cn1cnc([N+](=O)[O-])c1S(=O)CC#N
WT: 93.87%, tolC: 90.29%
8
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
C=CCNc1c([N+](=O)[O-])nn(C)[n+]1[O-]
WT: 100.62%, tolC: 102.1%
9
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)cc4)CC3)nc21
WT: -0.57%, tolC: 80.9%
CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=O)OC)cc4)CC3)nc21
WT: 96.96%, tolC: 100.34%
10
CCc1ccc(OCCNc2cc(N3CC(C)NC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 33.9%, tolC: 95.53%
CCc1ccc(OCCNc2cc(N3CCNC(C)C3)ccc2[N+](=O)[O-])cc1
WT: 52.44%, tolC: 96.71%
11
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCC(OC)CC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 5.08%, tolC: 100.53%
COc1ccc(/C=C/c2ccc(C(=O)Nc3cc(C(=O)Nc4cc(C(=O)NCCN5CCOCC5)n(C)c4)n(C)c3)cn2)cc1.O=C(O)C(F)(F)F.O=C(O)C(F)(F)F
WT: 100.46%, tolC: 100.31%
12
Oc1c(Cl)cc(Br)cc1/C=N/c1ccccc1
WT: 27.69%, tolC: 101.73%
O=C(CCl)C(=O)Nc1ccccc1
WT: 95.28%, tolC: 92.56%
13
O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1
WT: 40.13%, tolC: 96.13%
O=C(NC(CO)C(O)c1ccc([N+](=O)[O-])cc1)C(Cl)Cl
WT: 98.55%, tolC: 98.37%
14
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=[N+]([O-])c1ccc2no[n+]([O-])c2c1
WT: 96.24%, tolC: 94.26%
15
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1ncc([N+](=O)[O-])cc1[N+](=O)[O-]
WT: 59.06%, tolC: 98.91%
16
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
O=C(O)/C=C/c1ccc([N+](=O)[O-])o1
WT: 75.57%, tolC: 98.52%
17
O=Cc1cc([N+](=O)[O-])cc(I)c1O
WT: 27.87%, tolC: 83.54%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%
18
CC(=O)Cn1cc([N+](=O)[O-])c([N+](=O)[O-])n1
WT: 14.09%, tolC: 100.19%
O=[N+]([O-])c1ncn(CCO)c1[N+](=O)[O-]
WT: 96.69%, tolC: 93.83%
19
Cn1cnc([N+](=O)[O-])c1Oc1ccccc1
WT: 15.81%, tolC: 94.86%
Cn1cnc([N+](=O)[O-])c1Sc1nnnn1C
WT: 53.09%, tolC: 100.9%
20
Cc1cc(C)c2nc3nc(C)cc(C)c3c(N)c2c1
WT: 42.19%, tolC: 100.93%
Cc1ccc2nc3nc(C)cc(C)c3c(N)c2c1
WT: 53.27%, tolC: 100.78%
21
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.CS(=O)(=O)O
WT: 100.16%, tolC: 100.18%
22
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21
WT: -2.02%, tolC: 81.37%
CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21
WT: 98.83%, tolC: 98.54%
23
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCn1ccc(=N)cc1
WT: 58.22%, tolC: 90.97%
24
CCCCCCCn1ccc(=N)cc1.I
WT: 38.66%, tolC: 95.65%
Br.CCCCCCCCCCn1ccc(=N)cc1
WT: 101.08%, tolC: 95.52%
# Parse the LHS fragment of every transform into a molecule object.
# assign() returns a new frame instead of writing into a slice of another
# frame, which avoids pandas' SettingWithCopyWarning.
sub_to_evader_transforms = sub_to_evader_transforms.assign(mol_a=sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles))
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1271073621.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sub_to_evader_transforms['mol_a'] = sub_to_evader_transforms.LHS.apply(Chem.MolFromSmiles)
# Parse the RHS fragment of every transform into a molecule object.
# assign() returns a new frame instead of writing into a slice of another
# frame, which avoids pandas' SettingWithCopyWarning.
sub_to_evader_transforms = sub_to_evader_transforms.assign(mol_b=sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles))
[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
[17:06:47] WARNING: not removing hydrogen atom with dummy atom neighbors
C:\Users\domin\AppData\Local\Temp\ipykernel_3876\1879633430.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sub_to_evader_transforms['mol_b'] = sub_to_evader_transforms.RHS.apply(Chem.MolFromSmiles)
# True if any entry of mol_b is missing.
# NOTE(review): only mol_b is checked here; mol_a is not.
sub_to_evader_transforms.mol_b.isna().any()
False
# Build the match tables with the implementation from the imported master_functions object.
# NOTE(review): these three results are reassigned by the later call to the
# function of the same name defined in this file.
feat_diff, feat_left, feat_right = master_functions.calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)
H:\My Drive\co_add_jupyter
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
# Build the match tables with the calculate_fractions_mk7_new_smarts defined in this file:
# feat_diff = right minus left, feat_left = mol_a matches, feat_right = mol_b matches.
feat_diff, feat_left, feat_right = calculate_fractions_mk7_new_smarts(sub_to_evader_transforms)
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
# Drop zeros: keep only the columns that contain at least one non-zero entry.
feat_diff = feat_diff.loc[:, (feat_diff != 0).any(axis=0)]
feat_diff
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 - Alkane group | 1,2-Dicarbonyl not in ring | 10 - Aldehyde | 13 - Ether | 15 - Secondary amine group | 16 - Tertiary amine | 17 - Aromatic amine | 18 - Pyridine | 19 - CCN | 2 - Olefin group | ... | Thionyl | Vinyl michael acceptor1 | Primary amine, not amide | Primary or secondary amine, not amide. | tertiary aliphatic amine | carboxylic acid | smirks | measurement_A | measurement_B | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[*:1]C(=O)CC(=O... | 69.12 | 44.26 | -24.86 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(Br)cc(Cl)c1O>>[*:1]c1ccc([N+](=O)[O-... | 57.32 | -2.09 | -59.41 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 | 63.01 | -2.09 | -65.1 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O | 63.01 | 36.45 | -26.56 |
4 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(CC)ccc1O>>[*:1]c1ncccc1O | 53.18 | 36.45 | -16.73 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O | 72.7 | 36.45 | -36.25 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O | 55.41 | 36.45 | -18.96 |
7 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O | 59.53 | 36.45 | -23.08 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O | 64.42 | 36.45 | -27.97 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1I>>[*:1]c1ncccc1O | 62.45 | 36.45 | -26.0 |
10 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1OC>>[*:1]c1ncccc1O | 64.28 | 36.45 | -27.83 |
11 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccc(CC)cc1[*:2]>>[*:1]c1cccnc1[*:2] | 53.18 | 36.45 | -16.73 |
12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | -1 | 0 | 0 | [*:1]c1c(Cl)nc(NN)c([*:2])c1[*:3]>>[*:1]c1nc([... | 87.7 | -3.86 | -91.56 |
13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]/C=C(\N)C(Cl)(Cl)Cl>>[*:1]CC(=O)C(F)(F)F | 69.12 | 44.26 | -24.86 |
14 | -1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C12CC3CC(CC(C3)C1)C2>>[*:1]c1c(OC)cccc1OC | 87.8 | -1.01 | -88.81 |
15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Cn1cc([*:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn... | 86.1 | 32.9 | -53.2 |
16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Cn1cc([N+](=O)[O-])c([*:2])n1>>[*:1]Cn1nn... | 86.1 | 32.9 | -53.2 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1ncn([*:2]... | 42.16 | -3.58 | -45.74 |
18 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1nn([*:2])... | 42.16 | 1.48 | -40.68 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]O[*:2]>>[*:1]OC([*:2])=O | 81.47 | 3.38 | -78.09 |
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C1CN([*:2])CC(C)N1>>[*:1]C1CN([*:2])CCN1 | 61.63 | 44.27 | -17.36 |
21 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C>>[*:1][H] | 61.63 | 44.27 | -17.36 |
22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cn([*:2])nc1[N+](=O)[O-]>>[*:1]c1nnn([*... | 86.1 | 32.9 | -53.2 |
23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1nn([*:2])cc1[N+](=O)[O-]>>[*:1]c1nnn([*... | 86.1 | 32.9 | -53.2 |
24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N1CCC(OC)CC1>>[*:1]N1CCOCC1 | 95.45 | -0.15 | -95.6 |
25 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C/c1cc(Br)cc([*:2])c1O>>[*:2]CC(=O)C(=... | 74.04 | -2.72 | -76.76 |
26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCN1CCC(OC)CC1>>[*:1]CCN1CCOCC1 | 95.45 | -0.15 | -95.6 |
27 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cc(CC)ccc1[*:2]>>[*:1]/C=N\c1ncccc... | 53.18 | 36.45 | -16.73 |
28 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... | 56.0 | -0.18 | -56.18 |
29 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... | 56.0 | -0.18 | -56.18 |
30 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... | 55.67 | -1.98 | -57.65 |
31 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... | 55.67 | 39.85 | -15.82 |
32 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... | 55.67 | 39.85 | -15.82 |
33 | 1 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 | 55.67 | 32.9 | -22.77 |
34 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... | 55.67 | 22.95 | -32.72 |
35 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N | 55.67 | -3.09 | -58.76 |
36 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... | 42.16 | -1.98 | -44.14 |
37 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cc([N+](=O)[O... | 42.16 | 39.85 | -2.31 |
38 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cnc(N)c([N+](... | 42.16 | 39.85 | -2.31 |
39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nnn(CC(C)=O)n1 | 42.16 | 32.9 | -9.26 |
40 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc(/C=C/C(=O... | 42.16 | 22.95 | -19.21 |
41 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nonc1N | 42.16 | -3.09 | -45.25 |
42 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1cnc(N)c([*:... | 86.1 | 39.85 | -46.25 |
43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1ncn(CCO)c1[... | 86.1 | -2.86 | -88.96 |
44 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | [*:1]Oc1ccccc1>>[*:1]S(=O)CC#N | 79.05 | -3.58 | -82.63 |
45 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Oc1ccccc1>>[*:1]Sc1nnnn1C | 79.05 | 47.81 | -31.24 |
46 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C>>[*:1][H] | 58.74 | 47.51 | -11.23 |
47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O | 63.01 | 36.45 | -26.56 |
48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O | 72.7 | 36.45 | -36.25 |
49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O | 55.41 | 36.45 | -18.96 |
50 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O | 59.53 | 36.45 | -23.08 |
51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O | 62.45 | 36.45 | -26.0 |
52 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O | 64.28 | 36.45 | -27.83 |
53 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | 1 | 0 | 0 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O | 70.9 | 36.45 | -34.45 |
54 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | -1 | 1 | 0 | [*:1]N/N=C/c1ccccc1>>[*:1]N1CCN(C)CC1 | 83.39 | 0.02 | -83.37 |
55 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N/N=C/c1ccccc1>>[*:1]N1CCNCC1 | 83.39 | -0.29 | -83.68 |
56 | 0 | 0 | 0 | 0 | 1 | -1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | -1 | 0 | [*:1]N1CCN(CCO)CC1>>[*:1]N1CCNC(C)C1 | 70.95 | 44.27 | -26.68 |
57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N1CC(C)NC(C)C1>>[*:1]N1CCNC(C)C1 | 61.63 | 44.27 | -17.36 |
58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCCC>>[*:1]CCCCCCCC | 56.99 | 32.75 | -24.24 |
59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCCC>>[*:1]CCCCCCCCCC | 56.99 | -5.56 | -62.55 |
60 rows × 119 columns
# Column sums over all transforms (the last four columns are excluded),
# sorted in descending order; show the 25 largest.
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(25)
B7 22
18 - Pyridine 17
NUC 16
sp2 hybridized carbon atoms (12) 10
Nitrogen atoms (5) 9
sp3 hybridized carbon atoms (10) 7
B9 7
Nitrogen atoms (2) 7
N6 7
N9 7
ACID 7
17 - Aromatic amine 6
sp3 hybridized carbon atoms (5) 5
A33 - phenol 5
E3 - e.g., carbonates 5
15 - Secondary amine group 5
sp2 hybridized carbon atoms (10) 4
Primary amine, not amide 4
Primary or secondary amine, not amide. 4
Alpha halo carbonyl 4
9 - Carbonyl 3
Ketone 3
Imines_(not_ring) 3
sp3 hybridized carbon atoms (2) 3
Aromatic NO2 2
dtype: object
# Column sums over all transforms (the last four columns are excluded),
# sorted in descending order; show the 25 smallest.
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(25)
4 - Aromatic carbon-alkane -3
B8EXC -3
N4EXC -3
Positively charged atoms -3
ELEC -3
Negatively charged atoms -3
13 - Ether -3
Acyclic N-,=N and not N bound to carbonyl or sulfone -3
25 - Aromatic chloro -4
38 - Aromatic fluoro -4
N oxide -5
sp2 hybridized carbon atoms (8) -5
10 - Aldehyde -6
1 - Alkane group -6
sp2 hybridized carbon atoms (7) -6
Aldehyde carbon atoms -6
E1 - alkyl and aryl ketones and aldehydes -6
Quaternary nitrogen (1) -7
8 - Aromatic carbon-alcohol -10
32 - Iodo compounds -11
Aryl iodide -11
Iodine -11
sp3 hybridized carbon atoms (11) -14
sp2 hybridized carbon atoms (11) -18
3 - Aromatic carbon -22
dtype: object
# Display the filtered difference table.
feat_diff
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 - Alkane group | 1,2-Dicarbonyl not in ring | 10 - Aldehyde | 13 - Ether | 15 - Secondary amine group | 16 - Tertiary amine | 17 - Aromatic amine | 18 - Pyridine | 19 - CCN | 2 - Olefin group | ... | Thionyl | Vinyl michael acceptor1 | Primary amine, not amide | Primary or secondary amine, not amide. | tertiary aliphatic amine | carboxylic acid | smirks | measurement_A | measurement_B | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]C(=O)/C=C(\N)C(Cl)(Cl)Cl>>[*:1]C(=O)CC(=O... | 69.12 | 44.26 | -24.86 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(Br)cc(Cl)c1O>>[*:1]c1ccc([N+](=O)[O-... | 57.32 | -2.09 | -59.41 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 | 63.01 | -2.09 | -65.1 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O | 63.01 | 36.45 | -26.56 |
4 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(CC)ccc1O>>[*:1]c1ncccc1O | 53.18 | 36.45 | -16.73 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O | 72.7 | 36.45 | -36.25 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O | 55.41 | 36.45 | -18.96 |
7 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O | 59.53 | 36.45 | -23.08 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O | 64.42 | 36.45 | -27.97 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1I>>[*:1]c1ncccc1O | 62.45 | 36.45 | -26.0 |
10 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1OC>>[*:1]c1ncccc1O | 64.28 | 36.45 | -27.83 |
11 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccc(CC)cc1[*:2]>>[*:1]c1cccnc1[*:2] | 53.18 | 36.45 | -16.73 |
12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | -1 | 0 | 0 | [*:1]c1c(Cl)nc(NN)c([*:2])c1[*:3]>>[*:1]c1nc([... | 87.7 | -3.86 | -91.56 |
13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]/C=C(\N)C(Cl)(Cl)Cl>>[*:1]CC(=O)C(F)(F)F | 69.12 | 44.26 | -24.86 |
14 | -1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C12CC3CC(CC(C3)C1)C2>>[*:1]c1c(OC)cccc1OC | 87.8 | -1.01 | -88.81 |
15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Cn1cc([*:2])c([N+](=O)[O-])n1>>[*:1]Cn1nn... | 86.1 | 32.9 | -53.2 |
16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Cn1cc([N+](=O)[O-])c([*:2])n1>>[*:1]Cn1nn... | 86.1 | 32.9 | -53.2 |
17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1ncn([*:2]... | 42.16 | -3.58 | -45.74 |
18 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n([*:2])n1>>[*:1]c1nn([*:2])... | 42.16 | 1.48 | -40.68 |
19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]O[*:2]>>[*:1]OC([*:2])=O | 81.47 | 3.38 | -78.09 |
20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C1CN([*:2])CC(C)N1>>[*:1]C1CN([*:2])CCN1 | 61.63 | 44.27 | -17.36 |
21 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C>>[*:1][H] | 61.63 | 44.27 | -17.36 |
22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cn([*:2])nc1[N+](=O)[O-]>>[*:1]c1nnn([*... | 86.1 | 32.9 | -53.2 |
23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1nn([*:2])cc1[N+](=O)[O-]>>[*:1]c1nnn([*... | 86.1 | 32.9 | -53.2 |
24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N1CCC(OC)CC1>>[*:1]N1CCOCC1 | 95.45 | -0.15 | -95.6 |
25 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C/c1cc(Br)cc([*:2])c1O>>[*:2]CC(=O)C(=... | 74.04 | -2.72 | -76.76 |
26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCN1CCC(OC)CC1>>[*:1]CCN1CCOCC1 | 95.45 | -0.15 | -95.6 |
27 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cc(CC)ccc1[*:2]>>[*:1]/C=N\c1ncccc... | 53.18 | 36.45 | -16.73 |
28 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... | 56.0 | -0.18 | -56.18 |
29 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... | 56.0 | -0.18 | -56.18 |
30 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... | 55.67 | -1.98 | -57.65 |
31 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... | 55.67 | 39.85 | -15.82 |
32 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... | 55.67 | 39.85 | -15.82 |
33 | 1 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 | 55.67 | 32.9 | -22.77 |
34 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... | 55.67 | 22.95 | -32.72 |
35 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N | 55.67 | -3.09 | -58.76 |
36 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc2no[n+]([O... | 42.16 | -1.98 | -44.14 |
37 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cc([N+](=O)[O... | 42.16 | 39.85 | -2.31 |
38 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1cnc(N)c([N+](... | 42.16 | 39.85 | -2.31 |
39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nnn(CC(C)=O)n1 | 42.16 | 32.9 | -9.26 |
40 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1ccc(/C=C/C(=O... | 42.16 | 22.95 | -19.21 |
41 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1c[n+]([O-])n(C)n1>>[*:1]c1nonc1N | 42.16 | -3.09 | -45.25 |
42 | -1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1cnc(N)c([*:... | 86.1 | 39.85 | -46.25 |
43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cn(CC(C)=O)nc1[*:2]>>[*:1]c1ncn(CCO)c1[... | 86.1 | -2.86 | -88.96 |
44 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | [*:1]Oc1ccccc1>>[*:1]S(=O)CC#N | 79.05 | -3.58 | -82.63 |
45 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]Oc1ccccc1>>[*:1]Sc1nnnn1C | 79.05 | 47.81 | -31.24 |
46 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]C>>[*:1][H] | 58.74 | 47.51 | -11.23 |
47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O | 63.01 | 36.45 | -26.56 |
48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O | 72.7 | 36.45 | -36.25 |
49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O | 55.41 | 36.45 | -18.96 |
50 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O | 59.53 | 36.45 | -23.08 |
51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O | 62.45 | 36.45 | -26.0 |
52 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O | 64.28 | 36.45 | -27.83 |
53 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | 1 | 0 | 0 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O | 70.9 | 36.45 | -34.45 |
54 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | -1 | 1 | 0 | [*:1]N/N=C/c1ccccc1>>[*:1]N1CCN(C)CC1 | 83.39 | 0.02 | -83.37 |
55 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N/N=C/c1ccccc1>>[*:1]N1CCNCC1 | 83.39 | -0.29 | -83.68 |
56 | 0 | 0 | 0 | 0 | 1 | -1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | -1 | 0 | [*:1]N1CCN(CCO)CC1>>[*:1]N1CCNC(C)C1 | 70.95 | 44.27 | -26.68 |
57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]N1CC(C)NC(C)C1>>[*:1]N1CCNC(C)C1 | 61.63 | 44.27 | -17.36 |
58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCCC>>[*:1]CCCCCCCC | 56.99 | 32.75 | -24.24 |
59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCCC>>[*:1]CCCCCCCCCC | 56.99 | -5.56 | -62.55 |
60 rows × 119 columns
# Feature-difference columns only: the trailing four columns are excluded
# (in the tables printed in this notebook those are smirks, measurement_A,
# measurement_B and target).
feature_deltas = feat_diff.iloc[:, :-4]
# Pairwise correlation between the feature-difference columns, cast to float first.
corr_feat = feature_deltas.astype(float)
corr = corr_feat.corr()
# Keep the rows whose '3 - Aromatic carbon' difference is negative, total every
# feature column over those rows and show the 20 largest totals.
aromatic_carbon_negative = feature_deltas['3 - Aromatic carbon'] < 0
feature_deltas[aromatic_carbon_negative].sum().sort_values(ascending=False).head(20)
sp2 hybridized carbon atoms (12) 17
18 - Pyridine 17
B7 17
NUC 15
N6 15
ACID 13
A33 - phenol 13
Nitrogen atoms (5) 4
sp2 hybridized carbon atoms (10) 4
15 - Secondary amine group 3
sp3 hybridized carbon atoms (10) 3
Enamine 3
Alpha halo carbonyl 2
22 - CCl2 2
5 - Alcohol 2
Alkyl halide 2
Nitrogen atoms (1) 2
sp3 hybridized carbon atoms (5) 2
sp3 hybridized carbon atoms (2) 2
sp3 hybridized carbon atoms (12) 2
dtype: object
sub_to_evader_index_reset = sub_to_evader_transforms.reset_index(drop=True)
feat_diff[feat_diff['Iodine']<0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 - Alkane group | 1,2-Dicarbonyl not in ring | 10 - Aldehyde | 13 - Ether | 15 - Secondary amine group | 16 - Tertiary amine | 17 - Aromatic amine | 18 - Pyridine | 19 - CCN | 2 - Olefin group | ... | Thionyl | Vinyl michael acceptor1 | Primary amine, not amide | Primary or secondary amine, not amide. | tertiary aliphatic amine | carboxylic acid | smirks | measurement_A | measurement_B | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)cc(I)c1O>>[*:1]c1ccc([N+](=O)[O-])o1 | 63.01 | -2.09 | -65.1 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1I>>[*:1]c1ncccc1O | 62.45 | 36.45 | -26.0 |
28 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:1]C([*:2])C(... | 56.0 | -0.18 | -56.18 |
29 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(I)cc(I)c1[*:2]>>[*:2]CC(NC(=O)C... | 56.0 | -0.18 | -56.18 |
30 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc2no[n+]([O... | 55.67 | -1.98 | -57.65 |
31 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cc([N+](=O)[O... | 55.67 | 39.85 | -15.82 |
32 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1cnc(N)c([N+](... | 55.67 | 39.85 | -15.82 |
33 | 1 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nnn(CC(C)=O)n1 | 55.67 | 32.9 | -22.77 |
34 | 0 | 0 | -1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1ccc(/C=C/C(=O... | 55.67 | 22.95 | -32.72 |
35 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 1 | 0 | 0 | [*:1]c1cc(I)c(O)c(C=O)c1>>[*:1]c1nonc1N | 55.67 | -3.09 | -58.76 |
51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O | 62.45 | 36.45 | -26.0 |
11 rows × 119 columns
len(sub_to_evader_index_reset.iloc[feat_diff[feat_diff['Iodine']<0].index].compound_structure_B.unique())
8
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Negatively charged atoms -3
13 - Ether -3
Acyclic N-,=N and not N bound to carbonyl or sulfone -3
25 - Aromatic chloro -4
38 - Aromatic fluoro -4
N oxide -5
sp2 hybridized carbon atoms (8) -5
10 - Aldehyde -6
1 - Alkane group -6
sp2 hybridized carbon atoms (7) -6
Aldehyde carbon atoms -6
E1 - alkyl and aryl ketones and aldehydes -6
Quaternary nitrogen (1) -7
8 - Aromatic carbon-alcohol -10
32 - Iodo compounds -11
Aryl iodide -11
Iodine -11
sp3 hybridized carbon atoms (11) -14
sp2 hybridized carbon atoms (11) -18
3 - Aromatic carbon -22
dtype: object
feat_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)
B7 22
18 - Pyridine 17
NUC 16
sp2 hybridized carbon atoms (12) 10
Nitrogen atoms (5) 9
sp3 hybridized carbon atoms (10) 7
B9 7
Nitrogen atoms (2) 7
N6 7
N9 7
ACID 7
17 - Aromatic amine 6
sp3 hybridized carbon atoms (5) 5
A33 - phenol 5
E3 - e.g., carbonates 5
15 - Secondary amine group 5
sp2 hybridized carbon atoms (10) 4
Primary amine, not amide 4
Primary or secondary amine, not amide. 4
Alpha halo carbonyl 4
dtype: object
search = feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['B7']>0)].index
len(sub_to_evader_index_reset.iloc[search].compound_structure_B.unique())
2
feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['E1 - alkyl and aryl ketones and aldehydes']<0)].sum().sort_values(ascending=False).head(20)
Primary or secondary amine, not amide. 3
Primary amine, not amide 3
B8EXC 3
17 - Aromatic amine 3
B9 3
Negatively charged atoms 3
Positively charged atoms 3
Nitrogen atoms (2) 3
Nitrogen atoms (7) 3
Nitrogen atoms (4) 2
Nitrogen atoms (5) 2
B7 2
Dye 16 (1) 2
E3 - e.g., carbonates 2
N4EXC 2
Nitro group 2
Aromatic NO2 2
27 - Aromatic nitro 2
sp2 hybridized carbon atoms (12) 2
Oxygen-nitrogen single bond 2
dtype: object
# Columns removed from all three feature tables before the significance test.
to_drop = ['18 - Pyridine', 'N9']
# to_drop=[]
# 'N9' is listed in to_drop, so it is dropped here once together with the other
# column instead of separately beforehand (dropping it twice raises KeyError).
# errors='ignore' skips columns that are already gone, which keeps this cell
# safe to re-run on tables that were trimmed by an earlier execution.
feat_diff = feat_diff.drop(columns=to_drop, errors='ignore')
feat_left = feat_left.drop(columns=to_drop, errors='ignore')
feat_right = feat_right.drop(columns=to_drop, errors='ignore')
# NOTE(review): 0.05 is presumably the significance threshold used by
# find_sig_feats_mk2 -- confirm against master_functions.
fr_sig_descriptors_evade = master_functions.find_sig_feats_mk2(feat_left, feat_right, 0.05)
fractions_to_drop = []
results_evader = master_functions.results_arr(feat_diff, fr_sig_descriptors_evade, feat_right, feat_left, fractions_to_drop)
Found significant fractions: 21
10 - Aldehyde has negative correlation
percentage_loss 100
15 - Secondary amine group has positive correlation
0/1/2 loss
[('3 - Aromatic carbon', 'Nitrogen atoms (5)', 'N4EXC'), 'sp2 hybridized carbon atoms (11)', 'Iodine']
[-60.0, -40.0, -40.0]
percentage gain under -100
17 - Aromatic amine has positive correlation
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'E1 - alkyl and aryl ketones and aldehydes', 'Iodine']
[-57.14, -42.86, -42.86]
percentage gain under -100
25 - Aromatic chloro has negative correlation
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'ACID', 'N4EXC']
[50.0, 25.0, 25.0]
3 - Aromatic carbon has negative correlation
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'NUC', 'N6']
[73.91, 65.22, 65.22]
percentage_loss 100
32 - Iodo compounds has negative correlation
percentage_loss 100
38 - Aromatic fluoro has negative correlation
percentage_loss 100
8 - Aromatic carbon-alcohol has negative correlation
all gain
[('B8EXC', 'Positively charged atoms', 'Negatively charged atoms'), 'Dye 16 (1)', 'Nitrogen atoms (2)']
[50.0, 40.0, 40.0]
percentage_loss 100
Aldehyde carbon atoms has negative correlation
percentage_loss 100
Alpha halo carbonyl has positive correlation
1/2/3 loss
['sp2 hybridized carbon atoms (11)', ('32 - Iodo compounds', '3 - Aromatic carbon', 'Nitrogen atoms (5)'), 'Iodine']
[-100.0, -50.0, -50.0]
percentage gain under -100
Aryl iodide has negative correlation
percentage_loss 100
B7 has positive correlation
percentage gain under -100
B9 has positive correlation
0/1/2 loss
[('1 - Alkane group', 'ELEC', 'sp3 hybridized carbon atoms (11)'), 'Aldehyde carbon atoms', 'ACID']
[-50.0, -37.5, -37.5]
percentage gain under -100
E1 - alkyl and aryl ketones and aldehydes has negative correlation
percentage_loss 100
Iodine has negative correlation
percentage_loss 100
Nitrogen atoms (5) has positive correlation
percentage gain under -100
NUC has positive correlation
second double loss
['3 - Aromatic carbon', ('sp3 hybridized carbon atoms (11)', 'sp2 hybridized carbon atoms (11)'), 'sp2 hybridized carbon atoms (7)']
[-72.73, -40.91, -22.73]
percentage gain under -100
Quaternary nitrogen (1) has negative correlation
percentage_loss 100
sp2 hybridized carbon atoms (11) has negative correlation
first_gain
[('B7', 'sp2 hybridized carbon atoms (12)'), 'NUC', 'sp3 hybridized carbon atoms (10)']
[50.0, 36.36, 31.82]
percentage_loss 100
sp2 hybridized carbon atoms (7) has negative correlation
first_gain
[('sp2 hybridized carbon atoms (12)', 'B7'), 'A33 - phenol', 'NUC']
[100.0, 83.33, 83.33]
percentage_loss 100
sp3 hybridized carbon atoms (11) has negative correlation
percentage_loss 100
results_evader.sort_values(by='dof')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Main fraction | Correlation | sem | std | dof | Opposite fraction 1 | % of opposite 1 | Opposite fraction 2 | % of opposite 2 | Opposite fraction 3 | % of opposite 3 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 25 - Aromatic chloro | Negative | 55.87 | 13.09 | 26.18 | 4 | (sp2 hybridized carbon atoms (12), B7) | 50.00 | ACID | 25.00 | N4EXC | 25.00 |
9 | Alpha halo carbonyl | Positive | -40.52 | 9.04 | 18.08 | 4 | sp2 hybridized carbon atoms (11) | -100.00 | (32 - Iodo compounds, 3 - Aromatic carbon, Nit... | -50.00 | Iodine | -50.00 |
6 | 38 - Aromatic fluoro | Negative | 22.76 | 2.19 | 4.39 | 4 | ACID | 100.00 | (NUC, A33 - phenol, sp2 hybridized carbon atom... | 100.00 | N6 | 100.00 |
1 | 15 - Secondary amine group | Positive | -52.68 | 9.50 | 21.25 | 5 | (3 - Aromatic carbon, Nitrogen atoms (5), N4EXC) | -60.00 | sp2 hybridized carbon atoms (11) | -40.00 | Iodine | -40.00 |
19 | sp2 hybridized carbon atoms (7) | Negative | 24.81 | 2.85 | 6.98 | 6 | (sp2 hybridized carbon atoms (12), B7) | 100.00 | A33 - phenol | 83.33 | NUC | 83.33 |
0 | 10 - Aldehyde | Negative | 33.92 | 8.09 | 19.80 | 6 | Primary or secondary amine, not amide. | 50.00 | (Primary amine, not amide, B8EXC, 17 - Aromati... | 50.00 | B9 | 50.00 |
13 | E1 - alkyl and aryl ketones and aldehydes | Negative | 33.92 | 8.09 | 19.80 | 6 | Primary or secondary amine, not amide. | 50.00 | (Primary amine, not amide, B8EXC, 17 - Aromati... | 50.00 | B9 | 50.00 |
8 | Aldehyde carbon atoms | Negative | 33.92 | 8.09 | 19.80 | 6 | Primary or secondary amine, not amide. | 50.00 | (Primary amine, not amide, B8EXC, 17 - Aromati... | 50.00 | B9 | 50.00 |
2 | 17 - Aromatic amine | Positive | -26.65 | 8.70 | 23.02 | 7 | (1 - Alkane group, ELEC, sp3 hybridized carbon... | -57.14 | E1 - alkyl and aryl ketones and aldehydes | -42.86 | Iodine | -42.86 |
17 | Quaternary nitrogen (1) | Negative | 24.03 | 7.73 | 20.46 | 7 | NUC | 42.86 | (Primary or secondary amine, not amide., Prima... | 42.86 | 17 - Aromatic amine | 42.86 |
12 | B9 | Positive | -28.40 | 7.74 | 21.89 | 8 | (1 - Alkane group, ELEC, sp3 hybridized carbon... | -50.00 | Aldehyde carbon atoms | -37.50 | ACID | -37.50 |
7 | 8 - Aromatic carbon-alcohol | Negative | 42.15 | 7.48 | 23.65 | 10 | (B8EXC, Positively charged atoms, Negatively c... | 50.00 | Dye 16 (1) | 40.00 | Nitrogen atoms (2) | 40.00 |
5 | 32 - Iodo compounds | Negative | 39.36 | 5.82 | 19.30 | 11 | Nitrogen atoms (2) | 45.45 | (B8EXC, B7, Negatively charged atoms) | 36.36 | Positively charged atoms | 36.36 |
10 | Aryl iodide | Negative | 39.36 | 5.82 | 19.30 | 11 | Nitrogen atoms (2) | 45.45 | (B8EXC, B7, Negatively charged atoms) | 36.36 | Positively charged atoms | 36.36 |
14 | Iodine | Negative | 39.36 | 5.82 | 19.30 | 11 | Nitrogen atoms (2) | 45.45 | (B8EXC, B7, Negatively charged atoms) | 36.36 | Positively charged atoms | 36.36 |
15 | Nitrogen atoms (5) | Positive | -21.62 | 3.16 | 11.81 | 14 | sp2 hybridized carbon atoms (11) | -78.57 | 3 - Aromatic carbon | -64.29 | sp3 hybridized carbon atoms (11) | -50.00 |
20 | sp3 hybridized carbon atoms (11) | Negative | 33.56 | 6.73 | 29.34 | 19 | B7 | 52.63 | NUC | 47.37 | Nitrogen atoms (5) | 36.84 |
16 | NUC | Positive | -34.87 | 4.61 | 21.64 | 22 | 3 - Aromatic carbon | -72.73 | (sp3 hybridized carbon atoms (11), sp2 hybridi... | -40.91 | sp2 hybridized carbon atoms (7) | -22.73 |
11 | B7 | Positive | -23.29 | 2.20 | 10.34 | 22 | 3 - Aromatic carbon | -77.27 | sp2 hybridized carbon atoms (11) | -50.00 | sp3 hybridized carbon atoms (11) | -45.45 |
18 | sp2 hybridized carbon atoms (11) | Negative | 38.77 | 5.16 | 24.20 | 22 | (B7, sp2 hybridized carbon atoms (12)) | 50.00 | NUC | 36.36 | sp3 hybridized carbon atoms (10) | 31.82 |
4 | 3 - Aromatic carbon | Negative | 37.77 | 4.81 | 23.06 | 23 | (sp2 hybridized carbon atoms (12), B7) | 73.91 | NUC | 65.22 | N6 | 65.22 |
master_functions.plot_feats(results_evader)
feat_diff.iloc[:,:-4][(feat_diff.iloc[:,:-4]['Quaternary nitrogen (1)']<0)].sum().sort_values(ascending=False).head(20)
NUC 3
Primary or secondary amine, not amide. 3
Primary amine, not amide 3
B9 3
17 - Aromatic amine 3
Nitrogen atoms (2) 3
Nitro group 2
Nitrogen atoms (4) 2
Dye 16 (1) 2
Nitrogen atoms (5) 2
27 - Aromatic nitro 2
sp2 hybridized carbon atoms (11) 2
E3 - e.g., carbonates 2
B7 2
sp3 hybridized carbon atoms (10) 2
N4EXC 2
Oxygen-nitrogen single bond 2
Aromatic NO2 2
sp3 hybridized carbon atoms (5) 2
Alpha beta-unsaturated ketones; center of Michael reactivity 1
dtype: object
# get example of positive transforms
# substrates
to_fg = '17 - Aromatic amine'
from_fg = 'Quaternary nitrogen (1)'
# Rows whose difference is positive for to_fg and negative for from_fg;
# several different smirks can satisfy this.
dex = feat_diff[(feat_diff[to_fg] > 0) & (feat_diff[from_fg] < 0)]
print(len(dex))
print('number of unique smirks:', len(dex.smirks.unique()) )
low = 4  # NOTE(review): not read anywhere in this cell; kept in case other cells rely on it
# For every selected smirks, look up the first matching transform and collect
# its LHS and RHS fragments as an adjacent pair of molecules.
display_arr = []
for smirk in dex.smirks:
    # Filter the transforms table once per smirks and reuse it for both sides.
    matching = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == smirk]
    display_lhs_sub = matching.LHS.iloc[0]
    display_rhs_sub = matching.RHS.iloc[0]
    display_arr.append(Chem.MolFromSmiles(display_lhs_sub))
    display_arr.append(Chem.MolFromSmiles(display_rhs_sub))
leg = [str(x) for x in range(len(dex))]  # NOTE(review): built but not passed to the drawing call below
# Two molecules per row, so each row of the grid shows one LHS/RHS pair.
Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)
3
number of unique smirks: 3
# choose the compounds we're interested in:
a = 4
# Row a // 2 of dex supplies the smirks.
# NOTE(review): presumably `a` is a position in the two-images-per-transform
# grid drawn above, hence the halving -- confirm.
chosen_smirks = dex.iloc[a // 2].smirks
chosen_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == chosen_smirks]
# smiles of the first transform carrying that smirks:
comp_a = chosen_rows.compound_structure_A.values[0]
comp_b = chosen_rows.compound_structure_B.values[0]
inhibition_cols = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
# wt and efflux pre
pre = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a][inhibition_cols].values[0]
# wt and efflux post
post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_b][inhibition_cols].values[0]
# Report compound A first, then compound B, each followed by its two values.
for smiles, (wild_value, efflux_value) in ((comp_a, pre), (comp_b, post)):
    print(smiles)
    print('WT: {}%, tolC: {}%'.format(wild_value, efflux_value))
Cn1nc([N+](=O)[O-])c[n+]1[O-]
WT: 36.64%, tolC: 78.8%
Nc1nonc1[N+](=O)[O-]
WT: 99.21%, tolC: 96.12%
# get example of negative transforms
# Filter9_metal Negative 47.02 6.41 21.27 11 Nitrogen atoms (2)
# substrates
to_fg = 'B7'
from_fg = 'Iodine'
# Rows whose difference is positive for to_fg and negative for from_fg;
# several different smirks can satisfy this.
dex = feat_diff[(feat_diff[to_fg] > 0) & (feat_diff[from_fg] < 0)]
print(len(dex))
print('number of unique smirks:', len(dex.smirks.unique()) )
low = 4  # NOTE(review): not read anywhere in this cell; kept in case other cells rely on it
# For every selected smirks, look up the first matching transform and collect
# its LHS and RHS fragments as an adjacent pair of molecules.
display_arr = []
for smirk in dex.smirks:
    # Filter the transforms table once per smirks and reuse it for both sides.
    matching = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == smirk]
    display_lhs_sub = matching.LHS.iloc[0]
    display_rhs_sub = matching.RHS.iloc[0]
    display_arr.append(Chem.MolFromSmiles(display_lhs_sub))
    display_arr.append(Chem.MolFromSmiles(display_rhs_sub))
# Two molecules per row, so each row of the grid shows one LHS/RHS pair.
Chem.Draw.MolsToGridImage(display_arr, molsPerRow=2, subImgSize=(400,400), useSVG=True, maxMols = 50)
4
number of unique smirks: 4
dex
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
1 - Alkane group | 1,2-Dicarbonyl not in ring | 10 - Aldehyde | 13 - Ether | 15 - Secondary amine group | 16 - Tertiary amine | 17 - Aromatic amine | 18 - Pyridine | 19 - CCN | 2 - Olefin group | ... | Thionyl | Vinyl michael acceptor1 | Primary amine, not amide | Primary or secondary amine, not amide. | tertiary aliphatic amine | carboxylic acid | smirks | measurement_A | measurement_B | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccc(F)cc1>>[*:1]c1ncccc1O | 63.01 | 36.45 | -26.56 |
5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(Cl)c1Cl>>[*:1]c1ncccc1O | 72.7 | 36.45 | -36.25 |
6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(F)c1>>[*:1]c1ncccc1O | 55.41 | 36.45 | -18.96 |
7 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc(OC)c1>>[*:1]c1ncccc1O | 59.53 | 36.45 | -23.08 |
8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1cccc2ccccc12>>[*:1]c1ncccc1O | 64.42 | 36.45 | -27.97 |
9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1I>>[*:1]c1ncccc1O | 62.45 | 36.45 | -26.0 |
10 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]c1ccccc1OC>>[*:1]c1ncccc1O | 64.28 | 36.45 | -27.83 |
47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccc(F)cc1>>[*:1]/C=N\c1ncccc1O | 63.01 | 36.45 | -26.56 |
48 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(Cl)c1Cl>>[*:1]/C=N\c1ncccc1O | 72.7 | 36.45 | -36.25 |
49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(F)c1>>[*:1]/C=N\c1ncccc1O | 55.41 | 36.45 | -18.96 |
50 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1cccc(OC)c1>>[*:1]/C=N\c1ncccc1O | 59.53 | 36.45 | -23.08 |
51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1I>>[*:1]/C=N\c1ncccc1O | 62.45 | 36.45 | -26.0 |
52 | 0 | 0 | 0 | -1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/C=N\c1ccccc1OC>>[*:1]/C=N\c1ncccc1O | 64.28 | 36.45 | -27.83 |
53 | 0 | 0 | 0 | 0 | 0 | 0 | -1 | 1 | 0 | 0 | ... | 0 | 0 | -1 | -1 | 0 | 0 | [*:1]/C=N/c1nonc1N>>[*:1]/C=N\c1ncccc1O | 70.9 | 36.45 | -34.45 |
14 rows × 119 columns
# choose the compounds we're interested in:
a = 18
# Row a // 2 of dex supplies the smirks.
# NOTE(review): presumably `a` is a position in the two-images-per-transform
# grid drawn above, hence the halving -- confirm.
chosen_smirks = dex.iloc[a // 2].smirks
chosen_rows = sub_to_evader_transforms[sub_to_evader_transforms['smirks'] == chosen_smirks]
# smiles of the first transform carrying that smirks:
comp_a = chosen_rows.compound_structure_A.values[0]
comp_b = chosen_rows.compound_structure_B.values[0]
inhibition_cols = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
# wt and efflux pre
pre = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_a][inhibition_cols].values[0]
# wt and efflux post
post = e_coli_wild_efflux[e_coli_wild_efflux['SMILES'] == comp_b][inhibition_cols].values[0]
# Report compound A first, then compound B, each followed by its two values.
for smiles, (wild_value, efflux_value) in ((comp_a, pre), (comp_b, post)):
    print(smiles)
    print('WT: {}%, tolC: {}%'.format(wild_value, efflux_value))
CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1
WT: 38.8%, tolC: 91.98%
Oc1cccnc1/N=C/c1cc(I)cc(I)c1O
WT: 60.66%, tolC: 97.11%
sub_and_evade_logd['Class'] = sub_and_evade_om_corrected['Class']
sub_and_evade_logd.columns
Index(['Index', 'SMILES', 'logS', 'logS @ pH7.4', 'logD', '2C9 pKi', 'logP',
'MW', 'HBD', 'HBA', 'TPSA', 'Flexibility', 'Rotatable Bonds', 'mol',
'Class'],
dtype='object')
feat='Rotatable Bonds'
sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Substrate'][feat].mean(), sub_and_evade_logd[sub_and_evade_logd['Class']=='Efflux Evader'][feat].mean()
(5.730560578661844, 4.859459459459459)
sub_and_evade_logd
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
Index | SMILES | logS | logS @ pH7.4 | logD | 2C9 pKi | logP | MW | HBD | HBA | TPSA | Flexibility | Rotatable Bonds | mol | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | OB1OCc2ccccc21 | 5.188 | 2.2370 | 0.07439 | 4.217 | 0.07439 | 133.9 | 1 | 2 | 29.46 | 0.00000 | 0 | <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... | Efflux Evader |
1 | 1 | BrC(/C=N/Nc1nc(N2CCOCC2)nc(N2CCOCC2)n1)=C/c1cc... | 2.053 | 0.4994 | 2.27200 | 5.529 | 2.78000 | 474.4 | 1 | 9 | 88.00 | 0.18180 | 6 | <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... | Efflux Evader |
2 | 2 | Clc1ccc(C(=C2CN3CCC2CC3)c2ccc(Cl)s2)s1 | 1.303 | 0.8745 | 3.51100 | 5.096 | 4.87400 | 356.3 | 0 | 1 | 3.24 | 0.08333 | 2 | <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... | Efflux Evader |
3 | 3 | O=C(/C=C(\O)c1ccc(Br)cc1)C(F)(F)F | 2.361 | 2.2380 | 1.63100 | 4.581 | 3.76600 | 295.1 | 1 | 2 | 37.30 | 0.18750 | 3 | <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... | Efflux Evader |
4 | 4 | O=C(CCl)C(=O)Nc1ccccc1 | 4.326 | 2.9250 | 1.00300 | 3.932 | 1.00300 | 197.6 | 1 | 3 | 46.17 | 0.30770 | 4 | <rdkit.Chem.rdchem.Mol object at 0x000002CDAA1... | Efflux Evader |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
735 | 735 | c1ccc2c(c1)ccc1c2nc2ccccn21 | 1.606 | 1.6420 | 4.15400 | 4.902 | 4.15400 | 218.3 | 0 | 2 | 17.30 | 0.00000 | 0 | <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... | Efflux Substrate |
736 | 736 | O=C(CSc1ccc2ccccc2n1)N/N=C/c1ccc(O)cc1O | 1.119 | 2.5010 | 2.21900 | 4.954 | 2.21900 | 353.4 | 3 | 6 | 94.81 | 0.22220 | 6 | <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... | Efflux Substrate |
737 | 737 | Cc1c2ccncc2c(C)c2c1[nH]c1ccccc12 | 1.294 | 0.9868 | 4.80000 | 5.346 | 4.80000 | 246.3 | 1 | 2 | 28.68 | 0.00000 | 0 | <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... | Efflux Substrate |
738 | 738 | Cc1cc(C)c(CSc2nnc(C)s2)c(C)c1 | 1.607 | 2.4660 | 3.86300 | 4.569 | 3.86300 | 264.4 | 0 | 2 | 25.78 | 0.16670 | 3 | <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... | Efflux Substrate |
739 | 739 | COc1cc([C@@H]2c3cc4c(cc3[C@@H](OC3OC5CO[C@@H](... | 1.052 | 2.1080 | 1.28600 | 5.984 | 1.28600 | 656.7 | 3 | 13 | 160.80 | 0.11320 | 6 | <rdkit.Chem.rdchem.Mol object at 0x000002CDE31... | Efflux Substrate |
738 rows × 15 columns
sub_and_evade_logd = pd.read_csv('data_curated/sub_and_evade_PE.csv')
sub_and_evade_logd['mol'] = sub_and_evade_logd['SMILES'].apply(Chem.MolFromSmiles)
[09:35:17] Explicit valence for atom # 2 N, 4, is greater than permitted
[09:35:17] Explicit valence for atom # 17 N, 5, is greater than permitted
# Discard rows with a missing 'mol' value. The explicit .copy() gives an
# independent frame, so the column assignment below does not write into a
# view of the original (avoids pandas' SettingWithCopyWarning). subset is
# passed as a list, which every pandas version accepts.
sub_and_evade_logd = sub_and_evade_logd.dropna(subset=['mol']).copy()
# Canonicalise the remaining SMILES strings with RDKit.
sub_and_evade_logd['SMILES'] = sub_and_evade_logd['SMILES'].apply(Chem.CanonSmiles)
# Feature tables for the 'A' and 'B' compound of every transform.
a_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_A')
b_features = calcualte_features_single(sub_to_evader_transforms, 'compound_structure_B')
# Keep everything except the last 87 columns.
# NOTE(review): what those 87 trailing columns hold is not visible here -- confirm
# against calcualte_features_single.
a_features = a_features.iloc[:, :-87]
b_features = b_features.iloc[:, :-87]
# sub_evade_inactive_features['Class'] = sub_evade_inactive['Class']
Computing features:
100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 133.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1890.13it/s]
Computing features:
100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 139.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1890.13it/s]
def get_change(current, previous):
    """Return the unsigned percentage change from ``previous`` to ``current``.

    The result is ``|current - previous| / |previous| * 100``. Dividing by the
    magnitude of ``previous`` keeps the result non-negative even when the
    baseline is negative.

    Args:
        current: The new value.
        previous: The baseline value the change is measured against.

    Returns:
        ``0`` when both values are equal, ``float('inf')`` when ``previous``
        is zero (and ``current`` is not), otherwise the percentage as a float.
    """
    if current == previous:
        return 0
    # Checked explicitly instead of catching ZeroDivisionError: numpy floats
    # do not raise on division by zero, so the except branch would never run
    # for values that come from DataFrame/Series means.
    if previous == 0:
        return float('inf')
    return (abs(current - previous) / abs(previous)) * 100.0
# Percentage change of every feature's mean between the A table (baseline)
# and the B table, in the column order of a_features.
rets = [
    get_change(b_features[feature_name].mean(), a_features[feature_name].mean())
    for feature_name in a_features.columns
]
# One row per feature; show the 20 smallest values.
change_table = pd.DataFrame(rets, index=a_features.columns)
change_table.sort_values(by=0).head(20)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
VSA_EState5 | -163.547238 |
MinEStateIndex | -45.150857 |
HallKierAlpha | -14.961832 |
MinPartialCharge | -12.158647 |
NumRadicalElectrons | 0.000000 |
EState_VSA11 | 0.000000 |
SlogP_VSA9 | 0.000000 |
SMR_VSA8 | 0.000000 |
VSA_EState8 | 0.066408 |
SlogP_VSA1 | 0.426011 |
MaxEStateIndex | 1.157330 |
MaxAbsEStateIndex | 1.157330 |
FpDensityMorgan1 | 1.441856 |
NumValenceElectrons | 1.888042 |
Chi1 | 1.999946 |
Chi0 | 2.082437 |
HeavyAtomCount | 2.088773 |
BertzCT | 2.155824 |
FpDensityMorgan2 | 2.326498 |
EState_VSA8 | 2.640954 |
pd.DataFrame(rets, index=a_features.columns).sort_values(by=0).tail(30)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | |
---|---|
PEOE_VSA7 | 20.914404 |
EState_VSA4 | 23.165149 |
SlogP_VSA2 | 24.599014 |
NumHDonors | 24.615385 |
NumAliphaticHeterocycles | 25.000000 |
NumSaturatedHeterocycles | 25.000000 |
VSA_EState6 | 25.295822 |
NHOHCount | 27.142857 |
MolLogP | 29.078748 |
SlogP_VSA8 | 29.091388 |
SlogP_VSA12 | 30.354076 |
EState_VSA6 | 30.865566 |
SMR_VSA6 | 36.032338 |
VSA_EState10 | 38.687729 |
SMR_VSA4 | 38.929079 |
EState_VSA7 | 39.171792 |
NumAromaticCarbocycles | 42.028986 |
PEOE_VSA6 | 42.891886 |
EState_VSA5 | 46.730088 |
EState_VSA1 | 55.387805 |
Ipc | 58.987509 |
PEOE_VSA13 | 59.265545 |
SMR_VSA3 | 60.530420 |
SMR_VSA2 | 65.444545 |
NumAromaticHeterocycles | 74.285714 |
SlogP_VSA7 | 77.553925 |
NumAliphaticCarbocycles | 100.000000 |
NumSaturatedCarbocycles | 100.000000 |
PEOE_VSA3 | 159.404918 |
PEOE_VSA11 | 171.605736 |
a_features.MolLogP.mean()
2.7694200000000015
b_features.MolLogP.mean()
1.9641073333333339
# Mean 'MolWt' of the A-compound features vs. the B-compound features.
# (This two-line cell was repeated three times verbatim; once is enough.)
feat = 'MolWt'
a_features[feat].mean(), b_features[feat].mean()
(365.2758000000001, 330.7681666666667)
feat='TPSA'
a_features[feat].mean(), b_features[feat].mean()
(73.45366666666665, 88.39083333333335)
feat='NumRotatableBonds'
a_features[feat].mean(), b_features[feat].mean()
(3.316666666666667, 3.4166666666666665)
feat='NumHAcceptors'
a_features[feat].mean(), b_features[feat].mean()
(4.383333333333334, 5.283333333333333)
feat='NumHDonors'
a_features[feat].mean(), b_features[feat].mean()
(1.0833333333333333, 1.35)
sns.histplot(a_features.MolLogP, color='r')
sns.histplot(b_features.MolLogP, color='b')
<Axes: xlabel='MolLogP', ylabel='Count'>
cluster_8 = pd.read_csv('data_curated/cluster_8.csv')
cluster_8
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
SMILES | INHIB_AVE_wild | INHIB_AVE_efflux | Mol | fps | abs_diff | sub_class | wild_stds | tolc_stds | wild_class | tolc_class | Class | mol | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O=C(NC(=S)N1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C... | 90.32 | 88.08 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -2.24 | decrease | 8.862059 | 4.772322 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
1 | CCn1cc(C(=O)O)c(=O)c2cc([N+](=O)[O-])ccc21 | 92.33 | 83.35 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -8.98 | decrease | 9.068579 | 4.495245 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
2 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... | 92.72 | 91.71 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -1.01 | decrease | 9.108650 | 4.984962 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
3 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)Nc4ccc(... | 94.83 | 93.26 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -1.57 | decrease | 9.325446 | 5.075759 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
4 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... | 59.56 | 88.04 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 28.48 | increase | 5.701576 | 4.769979 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
5 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(C(=... | 96.96 | 100.34 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 3.38 | increase | 9.544296 | 5.490497 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
6 | CCOC(=O)c1cn(CC)c2cc(N3CCN(C)CC3)c(F)cc2c1=O | 94.15 | 89.71 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -4.44 | decrease | 9.255578 | 4.867805 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
7 | Cc1c(NC(=O)c2cn3c4c(c(N5CCN(C)CC5)c(F)cc4c2=O)... | 97.04 | 94.43 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -2.61 | decrease | 9.552515 | 5.144296 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
8 | CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | 99.54 | 98.79 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -0.75 | decrease | 9.809382 | 5.399700 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
9 | CCN1CCN(c2cc3c(cc2F)c(=O)c(C(=O)O)cn3C2CC2)CC1 | 101.15 | 101.88 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 0.73 | increase | 9.974803 | 5.580708 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
10 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C)CC3)cc21.C... | 100.16 | 100.18 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 0.02 | increase | 9.873084 | 5.481124 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
11 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCNCC3)cc21 | 98.83 | 98.54 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -0.29 | decrease | 9.736432 | 5.385055 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
12 | CCn1cc(C(=O)O)c(=O)c2ccc(C)nc21 | 100.81 | 101.30 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 0.49 | increase | 9.939870 | 5.546732 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
13 | CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)Nc4ccc... | 74.97 | 93.00 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 18.03 | increase | 7.284900 | 5.060529 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
14 | CN1CCN(c2c(F)cc3c(=O)c(C(=O)O)cn4c3c2SCC4)CC1.Cl | 101.07 | 101.69 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 0.62 | increase | 9.966584 | 5.569578 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x00000271FF8... |
15 | COc1c(N2CCNC(C)C2)c(F)cc2c(=O)c(C(=O)O)cn(C3CC... | 99.27 | 98.34 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -0.93 | decrease | 9.781640 | 5.373339 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
16 | COc1c(N2C[C@@H]3CCCN[C@@H]3C2)c(F)cc2c(=O)c(C(... | 99.99 | 99.89 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -0.10 | decrease | 9.855617 | 5.464136 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
17 | C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)... | 99.45 | 98.37 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -1.08 | decrease | 9.800134 | 5.375097 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
18 | C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)... | 100.58 | 100.90 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 0.32 | increase | 9.916238 | 5.523301 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
19 | C[C@H]1COc2c(C3(N)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23 | 98.12 | 97.94 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -0.18 | decrease | 9.663482 | 5.349908 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
20 | Cl.O=C(Nc1ccc(-c2n[nH]c(=S)o2)cc1)c1cn(C2CC2)c... | 90.63 | 81.87 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -8.76 | decrease | 8.893910 | 4.408548 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
21 | COc1c(N2CC3CCCNC3C2)c(F)cc2c(=O)c(C(=O)Nc3ccc(... | 85.55 | 90.95 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 5.40 | increase | 8.371958 | 4.940443 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
22 | Cc1ccc(S(=O)(=O)O)cc1.NC1CCN(c2nc3c(cc2F)c(=O)... | 94.82 | 90.03 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF9... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | -4.79 | decrease | 9.324418 | 4.886550 | active | active | Efflux Evader | <rdkit.Chem.rdchem.Mol object at 0x000002726D8... |
23 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)C... | 9.66 | 97.46 | <rdkit.Chem.rdchem.Mol object at 0x00000272495... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 87.80 | increase | 0.574526 | 5.321790 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
24 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N3CCN(C(=S)NC(=O)c... | 2.34 | 93.17 | <rdkit.Chem.rdchem.Mol object at 0x00000272283... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 90.83 | increase | -0.177578 | 5.070487 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
25 | CCn1cc(C(=O)O)c(=O)c2cc(F)c(N/N=C/c3ccccc3)cc21 | -2.02 | 81.37 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 83.39 | increase | -0.625553 | 4.379259 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
26 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... | -3.27 | 97.79 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 101.06 | increase | -0.753986 | 5.341121 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
27 | CCOc1cccc(C(=O)NC(=S)N2CCN(c3ncc4c(=O)c(C(=O)O... | -5.55 | 88.93 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 94.48 | increase | -0.988248 | 4.822114 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
28 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)NC(=O)c4cc... | 6.81 | 97.95 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 91.14 | increase | 0.281699 | 5.350493 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
29 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccc(OC)... | -0.57 | 80.90 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 81.47 | increase | -0.476571 | 4.351727 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
30 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C... | 1.49 | 103.44 | <rdkit.Chem.rdchem.Mol object at 0x00000271FF7... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 101.95 | increase | -0.264913 | 5.672090 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
31 | CCn1cc(C(=O)O)c(=O)c2cnc(N3CCN(C(=S)Nc4ccccc4C... | 0.73 | 97.25 | <rdkit.Chem.rdchem.Mol object at 0x00000271B69... | <rdkit.DataStructs.cDataStructs.ExplicitBitVec... | 96.52 | increase | -0.343000 | 5.309488 | inactive | active | Efflux Substrate | <rdkit.Chem.rdchem.Mol object at 0x000002724F6... |
# Keep only the rows of main_transforms whose B-side compound (compound_structure_B)
# appears in cluster_8, matched on the exact SMILES string.
small_set = main_transforms[main_transforms['compound_structure_B'].isin(cluster_8.SMILES)]
# calculate_fractions_mk7_new_smarts is defined elsewhere in the project and returns three
# objects; presumably the per-transform substructure-count difference followed by the
# LHS and RHS counts — TODO confirm against its definition.
small_set_diff, small_set_left, small_set_right = calculate_fractions_mk7_new_smarts(small_set)
Generating molecular objects from pre-defined substructures
Calcualting LHS+RHS matches
small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Dinitrobenzene_3 0
Dipeptide 0
Disulfide 0
Disulfides 0
Disulphide 0
Dithiocarbamate 0
Dithiole-2-thione 0
Dithiole-3-thione 0
Dithiomethylene_acetal 0
Dye 1 (1) 0
Dye 11 0
Dye 16 (1) 0
E3 - e.g., carbonates 0
Nitrogen atoms (2) -1
Adamantyl -1
Primary or secondary amine, not amide. -1
Acyclic N-,=N and not N bound to carbonyl or sulfone -2
N5EXC -2
N4EXC -2
Oxygen-nitrogen single bond -2
dtype: object
small_set_diff.iloc[:,:-4].sum().sort_values(ascending=False).head(20)
B9 2
N9 2
sp2 hybridized carbon atoms (4) 2
phenylpiperazine 2
sp3 hybridized carbon atoms (2) 2
16 - Tertiary amine 2
NUC 2
Nitrogen atoms (4) 2
Sulphates 1
B2 - secondary amine 1
S/PO3 groups 1
5 - Alcohol 1
41 - Acrylate 1
B3 - tertiary amine 1
sp3 hybridized carbon atoms (9) 1
N2 - secondary amines 1
Ester 1
sp2 hybridized carbon atoms (8) 1
ELEC 1
Nitrogen atoms (1) 1
dtype: object
# Remove the compound-id and measurement columns from both transform tables, then pair
# every evader transform with every substrate transform that shares the same starting
# compound (compound_structure_A), the same LHS fragment and the same common core.
# The remaining same-named columns are told apart by the '_evader' / '_substrate' suffix.
dropped_cols = ['idsmiles_A', 'idsmiles_B', 'measurement_A', 'measurement_B', 'measurement_delta']
evader_transforms = evader_transforms.drop(dropped_cols, axis=1)
substrate_transforms = substrate_transforms.drop(dropped_cols, axis=1)
comp_a_lhs_overlap = pd.merge(
    evader_transforms,
    substrate_transforms,
    on=['compound_structure_A', 'LHS', 'common_core'],
    suffixes=['_evader', '_substrate'],
)
# Number of (evader transform, substrate transform) pairs found.
len(comp_a_lhs_overlap)
125
len(comp_a_lhs_overlap.compound_structure_A.unique())
52
len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())
23
len(comp_a_lhs_overlap.compound_structure_B_evader.unique())
15
comp_a_lhs_overlap
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B_evader | smirks_evader | common_core | LHS | RHS_evader | compound_structure_B_substrate | smirks_substrate | RHS_substrate | |
---|---|---|---|---|---|---|---|---|---|
0 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1 | [*:1]c1c(C)cccc1O>>[*:1]c1ccc(F)cc1 | [*:1]c1ccc(F)cc1 |
1 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cc(CC)ccc1O | [*:1]c1cc(CC)ccc1O |
2 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(Cl)c1Cl | [*:1]c1cccc(Cl)c1Cl |
3 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(F)c1 | [*:1]c1cccc(F)c1 |
4 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(OC)c1 | [*:1]c1cccc(OC)c1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
120 | CCCn1ccc(=N)cc1.I | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
121 | CCCCn1ccc(=N)cc1.I | Br.CCCCCCCCn1ccc(=N)cc1 | [*:1]CCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCC | [*:1]CCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
122 | CCCCn1ccc(=N)cc1.I | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
123 | Br.CCCCCCn1ccc(=N)cc1 | Br.CCCCCCCCn1ccc(=N)cc1 | [*:1]CCCCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCCCC | [*:1]CCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
124 | Br.CCCCCCn1ccc(=N)cc1 | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCCCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCCCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
125 rows × 9 columns
# NOTE(review): this mask cannot exclude anything — each value of
# compound_structure_B_evader is tested against that same column's own unique() values,
# so the whole table is displayed unfiltered. Presumably a different reference list
# was intended here; confirm and replace the argument of isin().
comp_a_lhs_overlap[comp_a_lhs_overlap.compound_structure_B_evader.isin(comp_a_lhs_overlap.compound_structure_B_evader.unique())]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B_evader | smirks_evader | common_core | LHS | RHS_evader | compound_structure_B_substrate | smirks_substrate | RHS_substrate | |
---|---|---|---|---|---|---|---|---|---|
0 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1ccc(F)cc1 | [*:1]c1c(C)cccc1O>>[*:1]c1ccc(F)cc1 | [*:1]c1ccc(F)cc1 |
1 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | CCc1ccc(O)c(/N=C/c2cc(I)cc(I)c2O)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cc(CC)ccc1O | [*:1]c1cc(CC)ccc1O |
2 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1cccc(Cl)c1Cl | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(Cl)c1Cl | [*:1]c1cccc(Cl)c1Cl |
3 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | Oc1c(I)cc(I)cc1/C=N/c1cccc(F)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(F)c1 | [*:1]c1cccc(F)c1 |
4 | Cc1cccc(O)c1/N=C/c1cc(I)cc(I)c1O | Oc1cccnc1/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O>>[*:1]c1ncccc1O | [*:1]/N=C/c1cc(I)cc(I)c1O | [*:1]c1c(C)cccc1O | [*:1]c1ncccc1O | COc1cccc(/N=C/c2cc(I)cc(I)c2O)c1 | [*:1]c1c(C)cccc1O>>[*:1]c1cccc(OC)c1 | [*:1]c1cccc(OC)c1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
120 | CCCn1ccc(=N)cc1.I | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
121 | CCCCn1ccc(=N)cc1.I | Br.CCCCCCCCn1ccc(=N)cc1 | [*:1]CCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCC | [*:1]CCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
122 | CCCCn1ccc(=N)cc1.I | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
123 | Br.CCCCCCn1ccc(=N)cc1 | Br.CCCCCCCCn1ccc(=N)cc1 | [*:1]CCCCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCCCC | [*:1]CCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
124 | Br.CCCCCCn1ccc(=N)cc1 | Br.CCCCCCCCCCn1ccc(=N)cc1 | [*:1]CCCCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCCCC | [*:1]CCCCCCCCCC | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]CCCCCCC |
125 rows × 9 columns
# Build one flat list of molecules and a parallel list of legends from comp_a_lhs_overlap.
# Every row contributes four consecutive entries, i.e. one grid row of four cells:
#   common core | LHS fragment of compound A | substrate RHS fragment | evader RHS fragment
# The legends quote the INHIB_AVE_wild / INHIB_AVE_efflux values found in e_coli_wild_efflux
# for compound A ("Inactive"), the substrate compound B and the evader compound B.
activity_cols = ['INHIB_AVE_wild', 'INHIB_AVE_efflux']
mols = []
labels = []
row_fields = zip(
    comp_a_lhs_overlap.common_core,
    comp_a_lhs_overlap.LHS,
    comp_a_lhs_overlap.RHS_evader,
    comp_a_lhs_overlap.RHS_substrate,
    comp_a_lhs_overlap.compound_structure_A,
    comp_a_lhs_overlap.compound_structure_B_evader,
    comp_a_lhs_overlap.compound_structure_B_substrate,
)
for i, (core_smi, lhs_smi, rhs_evader_smi, rhs_substrate_smi,
        parent_smi, evader_smi, substrate_smi) in enumerate(row_fields):
    # Parse the fragment SMILES of this transform (shared core, leaving group, new groups).
    core_mol = Chem.MolFromSmiles(core_smi)
    lhs_mol = Chem.MolFromSmiles(lhs_smi)
    rhs_evader_mol = Chem.MolFromSmiles(rhs_evader_smi)
    rhs_substrate_mol = Chem.MolFromSmiles(rhs_substrate_smi)

    # Look the three full compounds up by exact SMILES; each result is an array with one
    # row per match and the two activity columns.
    parent_act, evader_act, substrate_act = [
        e_coli_wild_efflux.loc[e_coli_wild_efflux['SMILES'] == smi, activity_cols].values
        for smi in (parent_smi, evader_smi, substrate_smi)
    ]

    # Only the first match is used; indexing [0] raises IndexError when a SMILES is absent
    # from e_coli_wild_efflux, in which case nothing is appended for this row.
    row_legends = [
        'Common core no_{}'.format(i),
        'Inactive\n WT: {:.1f}%; tolC: {:.1f}%'.format(parent_act[0][0], parent_act[0][1]),
        'Substrate\n WT: {:.1f}%; tolC: {:.1f}%'.format(substrate_act[0][0], substrate_act[0][1]),
        'Evader\n WT: {:.1f}%; tolC: {:.1f}%'.format(evader_act[0][0], evader_act[0][1]),
    ]

    # Keep molecules and legends in the same order so they line up cell by cell.
    mols.extend([core_mol, lhs_mol, rhs_substrate_mol, rhs_evader_mol])
    labels.extend(row_legends)

# Render everything as a single raster grid, four cells per row. maxMols is set to 600,
# presumably so that all 4 * len(comp_a_lhs_overlap) entries are drawn — confirm the default cap.
img = Chem.Draw.MolsToGridImage(
    mols,
    molsPerRow=4,
    subImgSize=(250, 250),
    legends=labels,
    useSVG=False,
    maxMols=600,
    returnPNG=False,
)
# with open('master_transform_2' + '.svg', 'w') as f:
# f.write(img.data)
[10:54:46] WARNING: not removing hydrogen atom with dummy atom neighbors
img
substrate_transforms
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B | smirks | common_core | LHS | RHS | |
---|---|---|---|---|---|---|
2258 | C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... | C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... | [*:1]c1ccc(Br)cc1>>[*:1]c1ccccc1 | [*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1 | [*:1]c1ccc(Br)cc1 | [*:1]c1ccccc1 |
2259 | C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... | C/C(=N/Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1)c1cc... | [*:1]c1ccc(F)cc1>>[*:1]c1ccccc1 | [*:1]/C(C)=N\Nc1nc(Nc2cccc(Br)c2)nc(N2CCOCC2)n1 | [*:1]c1ccc(F)cc1 | [*:1]c1ccccc1 |
3224 | N#C/C(=C\c1c(F)cccc1Cl)c1nc2ccccc2[nH]1 | Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 | [*:1]c1c(F)cccc1Cl>>[*:1]c1ccccc1[N+](=O)[O-] | [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 | [*:1]c1c(F)cccc1Cl | [*:1]c1ccccc1[N+](=O)[O-] |
3245 | N#C/C(=C\c1cc(Br)c(O)c(Br)c1O)c1nc2ccccc2[nH]1 | Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 | [*:1]c1cc(Br)c(O)c(Br)c1O>>[*:1]c1ccccc1[N+](=... | [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 | [*:1]c1cc(Br)c(O)c(Br)c1O | [*:1]c1ccccc1[N+](=O)[O-] |
3265 | COc1c(Cl)cc(Cl)cc1/C=C(\C#N)c1nc2ccccc2[nH]1 | Cl.N#C/C(=C\c1ccccc1[N+](=O)[O-])c1nc2ccccc2[nH]1 | [*:1]c1cc(Cl)cc(Cl)c1OC>>[*:1]c1ccccc1[N+](=O)... | [*:1]/C=C(\C#N)c1nc2ccccc2[nH]1 | [*:1]c1cc(Cl)cc(Cl)c1OC | [*:1]c1ccccc1[N+](=O)[O-] |
... | ... | ... | ... | ... | ... | ... |
1404497 | CCOC(=O)Cn1ccc(=N)cc1.Cl | CCCCCCCn1ccc(=N)cc1.I | [*:1]CC(=O)OCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CC(=O)OCC | [*:1]CCCCCCC |
1404504 | Br.CCn1ccc(=N)cc1 | CCCCCCCn1ccc(=N)cc1.I | [*:1]CC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CC | [*:1]CCCCCCC |
1404510 | CCCn1ccc(=N)cc1.I | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCC | [*:1]CCCCCCC |
1404515 | CCCCn1ccc(=N)cc1.I | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCC | [*:1]CCCCCCC |
1404519 | Br.CCCCCCn1ccc(=N)cc1 | CCCCCCCn1ccc(=N)cc1.I | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | [*:1]CCCCCC | [*:1]CCCCCCC |
4900 rows × 6 columns
# Pair substrate and evader transforms that start from the same compound A only; the LHS
# fragment and common core are NOT required to match, so every other shared column
# (common_core, LHS, RHS, smirks, ...) is kept twice with a '_substrate' / '_evader' suffix.
# An earlier merge keyed on ['compound_structure_A', 'LHS', 'common_core'] that was bound to
# the same name directly before this statement has been dropped: its result was overwritten
# here without ever being read.
comp_a_lhs_overlap = substrate_transforms.merge(evader_transforms, on=['compound_structure_A'], suffixes=['_substrate', '_evader'])
len(comp_a_lhs_overlap.compound_structure_A.unique())
67
len(comp_a_lhs_overlap.compound_structure_B_evader.unique())
23
len(comp_a_lhs_overlap.compound_structure_B_substrate.unique())
42
substarte_to_evader_feats.iloc[:,:-4].sum().sort_values(ascending=False).head(50)
B7 135
18 - Pyridine 135
sp2 hybridized carbon atoms (12) 120
N5EXC 59
sp3 hybridized carbon atoms (10) 56
Alpha halo carbonyl 47
sp3 hybridized carbon atoms (7) 46
Alkyl halide 46
15 - Secondary amine group 41
5 - Alcohol 40
22 - CCl2 39
Enamine 39
sp3 hybridized carbon atoms (12) 37
4 - Aromatic carbon-alkane 35
Nitrogen atoms (2) 22
1 - Alkane group 21
Nitrogen atoms (6) 20
33 - Bromo compounds 19
2 - Olefin group 17
I1 - Aliphatic methylene chains 7 or more long 12
Thiazolidinone 12
Dithiocarbamate 12
Thiocarbonyl group 12
ELEC 12
Aromatic NO2 11
Nitrogen atoms (4) 11
Dye 16 (1) 11
27 - Aromatic nitro 11
Imines_(not_ring) 10
sp3 hybridized carbon atoms (5) 10
Nitro group 10
Ketone 10
E3 - e.g., carbonates 9
48 - CH2S 9
Sulphur atom (3) 9
sp3 hybridized carbon atoms (4) 9
9 - �¡arbonyl 9
Filter39_imine 8
Acyclic N-,=N and not N bound to carbonyl or sulfone 8
Vinyl_halide 8
Filter64_halo_ketone_sulfone 8
Dye 25 7
Filter41_12_dicarbonyl 7
Sulphur atom (5) 7
Alpha_halo_carbonyl 7
Oxalyl 7
Stilbene 7
Diketo group 7
Filter26_alkyl_halide 7
Beta halo carbonyl 7
dtype: int64
substarte_to_evader_feats[substarte_to_evader_feats['B7']>0].iloc[:,:-4].sum().sort_values(ascending=False).tail(20)
Nitrogen atoms (2) -1
B8EXC -1
sp2 hybridized carbon atoms (4) -1
Oxygen-nitrogen single bond -1
Dye 16 (1) -1
Nitrogen atoms (4) -1
Negatively charged atoms -1
4 - Aromatic carbon-alkane -7
sp3 hybridized carbon atoms (7) -7
1 - Alkane group -7
sp3 hybridized carbon atoms (10) -7
4-chlorobenzene -8
38 - Aromatic fluoro -19
High halogen content (>3) -22
25 - Aromatic chloro -30
sp2 hybridized carbon atoms (8) -54
13 - Ether -54
sp3 hybridized carbon atoms (6) -54
sp3 hybridized carbon atoms (11) -61
sp2 hybridized carbon atoms (7) -117
dtype: int64
comp_a_lhs_overlap
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
compound_structure_A | compound_structure_B_substrate | idsmiles_A_substrate | idsmiles_B_substrate | smirks_substrate | common_core_substrate | measurement_A_substrate | measurement_B_substrate | measurement_delta_substrate | LHS_substrate | ... | smirks_evader | common_core_evader | measurement_A_evader | measurement_B_evader | measurement_delta_evader | LHS_evader | RHS_evader | mol_inactive | mol_substrate | mol_evader | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | 45889 | 45890 | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O | [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 | 2.60 | 56.00 | 53.40 | [*:1]c1cc(Cl)cc(Cl)c1O | ... | [*:1]/N=C\c1cc([*:2])cc([*:3])c1O>>[*:2]C([*:3... | [*:2]Cl.[*:3]Cl.[*:1]c1ccc([N+](=O)[O-])cc1 | 2.60 | -0.18 | -2.78 | [*:1]/N=C\c1cc([*:2])cc([*:3])c1O | [*:2]C([*:3])C(=O)NC(CO)C([*:1])O | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... |
1 | O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | 45889 | 45890 | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O | [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 | 2.60 | 56.00 | 53.40 | [*:1]c1cc(Cl)cc(Cl)c1O | ... | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... | [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 | 2.60 | -0.18 | -2.78 | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3] | [*:1]C([*:3])C(CO)NC(=O)[C@@H]([*:2])Cl | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
2 | O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | 45889 | 45890 | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O | [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 | 2.60 | 56.00 | 53.40 | [*:1]c1cc(Cl)cc(Cl)c1O | ... | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... | [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 | 2.60 | -0.18 | -2.78 | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3] | [*:1]C([*:3])C(CO)NC(=O)[C@@H]([*:2])Cl | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
3 | O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | 45889 | 45890 | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O | [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 | 2.60 | 56.00 | 53.40 | [*:1]c1cc(Cl)cc(Cl)c1O | ... | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... | [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 | 2.60 | -0.18 | -2.78 | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3] | [*:1]C([*:3])C(CO)NC(=O)[C@H]([*:2])Cl | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
4 | O=[N+]([O-])c1ccc(/N=C/c2cc(Cl)cc(Cl)c2O)cc1 | O=[N+]([O-])c1ccc(/N=C/c2cc(I)cc(I)c2O)cc1 | 45889 | 45890 | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O | [*:1]/C=N/c1ccc([N+](=O)[O-])cc1 | 2.60 | 56.00 | 53.40 | [*:1]c1cc(Cl)cc(Cl)c1O | ... | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... | [*:2]Cl.[*:3]O.[*:1]c1ccc([N+](=O)[O-])cc1 | 2.60 | -0.18 | -2.78 | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3] | [*:1]C([*:3])C(CO)NC(=O)[C@H]([*:2])Cl | <rdkit.Chem.rdchem.Mol object at 0x000002A9F42... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
304 | CCCn1ccc(=N)cc1.I | CCCCCCCn1ccc(=N)cc1.I | 28118 | 28233 | [*:1]CCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | 1.42 | 56.99 | 55.57 | [*:1]CCC | ... | [*:1]CCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | 1.42 | -5.56 | -6.98 | [*:1]CCC | [*:1]CCCCCCCCCC | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
305 | CCCCn1ccc(=N)cc1.I | CCCCCCCn1ccc(=N)cc1.I | 28145 | 28233 | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | -14.52 | 56.99 | 71.51 | [*:1]CCCC | ... | [*:1]CCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | -14.52 | 32.75 | 47.27 | [*:1]CCCC | [*:1]CCCCCCCC | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
306 | CCCCn1ccc(=N)cc1.I | CCCCCCCn1ccc(=N)cc1.I | 28145 | 28233 | [*:1]CCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | -14.52 | 56.99 | 71.51 | [*:1]CCCC | ... | [*:1]CCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | -14.52 | -5.56 | 8.96 | [*:1]CCCC | [*:1]CCCCCCCCCC | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
307 | Br.CCCCCCn1ccc(=N)cc1 | CCCCCCCn1ccc(=N)cc1.I | 28228 | 28233 | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | 13.72 | 56.99 | 43.27 | [*:1]CCCCCC | ... | [*:1]CCCCCC>>[*:1]CCCCCCCC | [*:1]n1ccc(=N)cc1 | 13.72 | 32.75 | 19.03 | [*:1]CCCCCC | [*:1]CCCCCCCC | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
308 | Br.CCCCCCn1ccc(=N)cc1 | CCCCCCCn1ccc(=N)cc1.I | 28228 | 28233 | [*:1]CCCCCC>>[*:1]CCCCCCC | [*:1]n1ccc(=N)cc1 | 13.72 | 56.99 | 43.27 | [*:1]CCCCCC | ... | [*:1]CCCCCC>>[*:1]CCCCCCCCCC | [*:1]n1ccc(=N)cc1 | 13.72 | -5.56 | -19.28 | [*:1]CCCCCC | [*:1]CCCCCCCCCC | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... | <rdkit.Chem.rdchem.Mol object at 0x000002A9F75... |
309 rows × 24 columns
substarte_to_evader_feats
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | 26 | 28 | 42 | 43 | > 2 ester groups | 1 - Alkane group | 1,2-Dicarbonyl not in ring | 10 - Aldehyde | 11 - Acetate group | ... | Vinyl michael acceptor2 | Vinyl_halide | Vinyl_sulphone | Primary amine, not amide | Primary or secondary amine, not amide. | tertiary aliphatic amine | carboxylic acid | Smiles | smirks_evader | smirks_substrate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc([*:2])cc([*:3])c1O>>[*:2]C([*:3... | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc([*:2])cc(Cl)c1[*:3]>>[*:1]C([*:... | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]/N=C\c1cc(Cl)cc([*:2])c1[*:3]>>[*:1]C([*:... | [*:1]c1cc(Cl)cc(Cl)c1O>>[*:1]c1cc(I)cc(I)c1O |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
304 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCC>>[*:1]CCCCCCCCCC | [*:1]CCC>>[*:1]CCCCCCC |
305 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCC>>[*:1]CCCCCCCC | [*:1]CCCC>>[*:1]CCCCCCC |
306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCC>>[*:1]CCCCCCCCCC | [*:1]CCCC>>[*:1]CCCCCCC |
307 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCC>>[*:1]CCCCCCCC | [*:1]CCCCCC>>[*:1]CCCCCCC |
308 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | [*:1]CCCCCC>>[*:1]CCCCCCCCCC | [*:1]CCCCCC>>[*:1]CCCCCCC |
309 rows × 761 columns