In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Column names used in the raw dose-response export, and the single
# concentration value at which compounds are called hits.
concentration_col = 'Concentration (uM)'
concentration_val = 33
ic50_col = 'PriA-SSB Dose response: IC50 (uM)'
median_inhib_col = 'Median % negative control (%)'
neg_control_col = '% negative control (%)'

# keep_default_na=False leaves blank cells as '' so the rows with an empty
# 'Control State' can be selected below (presumably the non-control wells).
res_full_df = pd.read_csv('../datasets/Zenodo/v1/enamine_PriA-SSB_dose_response_2021-06-16_run.tsv', delimiter='\t', keep_default_na=False)
res_full_df = res_full_df[res_full_df['Control State'] == ''].copy()
res_full_df[concentration_col] = res_full_df[concentration_col].astype(float)

# Keep only the measurements taken at the chosen concentration. The explicit
# copy makes the column assignments below act on an independent frame.
res_df = res_full_df.loc[
    res_full_df[concentration_col] == concentration_val,
    ['Molecule Name', 'Structure (CXSMILES)', concentration_col, neg_control_col],
].copy()

# Median of the raw readout per molecule, broadcast back onto every row of that
# molecule. Selecting the numeric column before aggregating avoids taking a
# median over the SMILES strings, which newer pandas versions reject.
res_df[median_inhib_col] = res_df.groupby('Molecule Name')[neg_control_col].transform('median')
res_df = res_df.reset_index(drop=True)

# Convert "% of negative control" into "% inhibition" and call hits on the
# per-molecule median.
inhib_col = '% Inhibition'
hit_thresh = 50.0
res_df[inhib_col] = 100.0 - res_df[median_inhib_col]
res_df['Hit_median_50_thresh'] = (res_df[inhib_col] >= hit_thresh).astype(int)

# Attach the 'Synonyms' identifier to each molecule (renamed to 'ID Enamine' below).
all_runs_df = pd.read_csv('../datasets/Zenodo/v1/enamine_dose_reponse_curves.tsv', delimiter='\t').drop_duplicates('Synonyms')
res_df = res_df.merge(all_runs_df[['Molecule Name', 'Synonyms']], on='Molecule Name')

# Sanity check: the inner merge must neither drop nor duplicate molecules.
expected_n_molecules = 68
assert res_df['Molecule Name'].nunique(dropna=False) == expected_n_molecules
assert res_df['Synonyms'].nunique(dropna=False) == expected_n_molecules

res_df = res_df.rename(columns={'Molecule Name' : 'SMSSF Molecule ID', 'Synonyms': 'ID Enamine', 'Structure (CXSMILES)': 'SMSSF SMILES'})

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.FilterCatalog import *

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Name of the flag column filled in by the frequent-hitter filter, and of the
# column holding the SMILES strings the filter is applied to.
as_frh_filter_col = 'AS FRH Filter'
smiles_col = 'rdkit SMILES'

ams_df = pd.read_csv('../datasets/Zenodo/v1/ams_order_results.csv.gz')
train_df = pd.read_csv('../datasets/Zenodo/v1/training_df_single_fold.csv.gz')
# Drop training rows that have no SMILES string.
train_df = train_df[train_df[smiles_col].notna()]

# One row per tested molecule, joined with the compound list on 'ID Enamine'.
# The list's own 'Hit' column is dropped so that only the experimental
# 'Hit_median_50_thresh' label remains.
# NOTE(review): the merged frame is expected to carry the 'rdkit SMILES' column,
# presumably from enamine_final_list.csv.gz - confirm.
real_df = res_df.drop_duplicates('SMSSF Molecule ID')
real_cpd_info_df = pd.read_csv('../datasets/Zenodo/v1/enamine_final_list.csv.gz').drop('Hit', axis=1)
real_df = real_df.merge(real_cpd_info_df, on='ID Enamine')

# Reduce every dataset to (SMILES, activity label). Explicit copies keep the
# column assignments below from being chained assignments on a slice.
train_df = train_df[[smiles_col, 'PriA-SSB AS Activity']].copy()
ams_df = ams_df[[smiles_col, 'Hit']].copy()
real_df = real_df[[smiles_col, 'Hit_median_50_thresh']].copy()

# Initialise the filter flag to 0 everywhere.
train_df[as_frh_filter_col] = 0
ams_df[as_frh_filter_col] = 0
real_df[as_frh_filter_col] = 0

# Split each dataset by label. Copies are taken because later cells assign the
# filter column on these frames directly.
train_actives = train_df[train_df['PriA-SSB AS Activity'] == 1].copy()
ams_actives = ams_df[ams_df['Hit'] == 1].copy()
real_actives = real_df[real_df['Hit_median_50_thresh'] == 1].copy()

# Everything whose label is not exactly 1 is treated as inactive.
train_inactives = train_df[train_df['PriA-SSB AS Activity'] != 1].copy()
ams_inactives = ams_df[ams_df['Hit'] != 1].copy()
real_inactives = real_df[real_df['Hit_median_50_thresh'] != 1].copy()

In [3]:
# Read the SMARTS alert file and build RDKit query molecules from it.
# Each line is split on ': ' and the second field is taken as the SMARTS string.
# A multi-character separator is only supported by the Python parser, so the
# engine is requested explicitly rather than relying on the silent fallback.
smarts_list = pd.read_csv('../datasets/patterns/ToxAlerts_Schorpp2014_SMARTS.txt', delimiter=': ', header=None, engine='python')[1].tolist()
smarts_patterns = [Chem.MolFromSmarts(smarts) for smarts in smarts_list]

# MolFromSmarts returns None when a pattern cannot be parsed; fail here with the
# offending patterns instead of erroring later during substructure matching.
unparsed_smarts = [smarts for smarts, patt in zip(smarts_list, smarts_patterns) if patt is None]
assert not unparsed_smarts, 'RDKit could not parse SMARTS patterns: {}'.format(unparsed_smarts)

# define filter function for convenience
def apply_frh_filter(smiles_series, smarts_patterns, smiles_col = 'rdkit SMILES'):
    """Flag each SMILES that matches at least one of the given SMARTS patterns.

    Parameters
    ----------
    smiles_series : iterable
        SMILES strings (e.g. a pandas Series). Every entry is converted with
        ``str`` before parsing, so a missing value becomes the text ``'nan'``.
    smarts_patterns : iterable
        RDKit query molecules, as produced by ``Chem.MolFromSmarts``.
    smiles_col : str, optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    list of bool
        One flag per input entry, in input order. An entry that RDKit cannot
        parse into a molecule is reported as ``False`` rather than raising.
    """
    def _matches_any_pattern(smiles):
        mol = Chem.MolFromSmiles(str(smiles))
        if mol is None:
            # Unparsable SMILES: there is no molecule to match against.
            return False
        # Generator form stops at the first matching pattern.
        return any(mol.HasSubstructMatch(patt) for patt in smarts_patterns)

    return [_matches_any_pattern(smiles) for smiles in smiles_series]

---
# Check Active/Hit Compounds

In [4]:
# Run the frequent-hitter filter over the active compounds of each dataset and
# store the resulting flags on the frame itself.
for active_frame in (train_actives, ams_actives, real_actives):
    active_frame[as_frh_filter_col] = apply_frh_filter(active_frame[smiles_col], smarts_patterns)

In [5]:
# Report, per dataset, how many active compounds were flagged by the filter.
active_report = [
    ('Train AS Freq. Hitters: {} out of {} active compounds.', train_actives),
    ('AMS AS Freq. Hitters:   {} out of {} active compounds.', ams_actives),
    ('REAL AS Freq. Hitters:  {} out of {} active compounds.', real_actives),
]
for message, active_frame in active_report:
    n_flagged = active_frame[as_frh_filter_col].sum()
    print(message.format(n_flagged, active_frame.shape[0]))

Train AS Freq. Hitters: 150 out of 554 active compounds.
AMS AS Freq. Hitters:   182 out of 412 active compounds.
REAL AS Freq. Hitters:  19 out of 31 active compounds.


---
# Check Inactive Compounds

In [6]:
# Run the frequent-hitter filter over the inactive compounds of each dataset and
# store the resulting flags on the frame itself.
for inactive_frame in (train_inactives, ams_inactives, real_inactives):
    inactive_frame[as_frh_filter_col] = apply_frh_filter(inactive_frame[smiles_col], smarts_patterns)

In [7]:
# Report, per dataset, how many inactive compounds were flagged by the filter.
inactive_report = [
    ('Train AS Freq. Hitters: {} out of {} inactive compounds.', train_inactives),
    ('AMS AS Freq. Hitters:   {} out of {} inactive compounds.', ams_inactives),
    ('REAL AS Freq. Hitters:  {} out of {} inactive compounds.', real_inactives),
]
for message, inactive_frame in inactive_report:
    n_flagged = inactive_frame[as_frh_filter_col].sum()
    print(message.format(n_flagged, inactive_frame.shape[0]))

Train AS Freq. Hitters: 3281 out of 426745 inactive compounds.
AMS AS Freq. Hitters:   149 out of 612 inactive compounds.
REAL AS Freq. Hitters:  4 out of 37 inactive compounds.
