In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.FilterCatalog import *

import pandas as pd
import numpy as np
import warnings
# NOTE(review): with no category/module argument this ignores every warning
# raised after this line, for all libraries -- narrow the filter if possible.
warnings.filterwarnings('ignore')

# Column names shared by the cells below.
as_frh_filter_col = 'AS FRH Filter'
smiles_col = 'rdkit SMILES'

# Raw result tables.
ams_df = pd.read_csv('../datasets/Zenodo/v1/ams_order_results.csv.gz')
train_df = pd.read_csv('../datasets/Zenodo/v1/training_df_single_fold.csv.gz')

# REAL results: keep the first row per 'SMSSF Molecule ID', then attach the
# compound info on 'ID Enamine'. The info table's own 'Hit' column is dropped
# before the merge so the merged frame carries a single, unsuffixed 'Hit'.
real_df = pd.read_csv('../datasets/enamine_results/pria_enamine_inhib_33_uM.csv').drop_duplicates('SMSSF Molecule ID')
real_cpd_info_df = pd.read_csv('../datasets/Zenodo/v1/enamine_final_list_v2.csv.gz').drop('Hit', axis=1)
real_df = real_df.merge(real_cpd_info_df, on='ID Enamine')

# Only the SMILES and the label column are needed from each table.
train_df = train_df[[smiles_col, 'PriA-SSB AS Activity']]
ams_df = ams_df[[smiles_col, 'Hit']]
real_df = real_df[[smiles_col, 'Hit']]

# Keep the rows labelled 1. The explicit .copy() makes each result an
# independent frame, so adding a column below (and in later cells) is a plain
# assignment rather than a write to a slice of the original table
# (pandas' SettingWithCopy situation).
train_df = train_df[train_df['PriA-SSB AS Activity'] == 1].copy()
ams_df = ams_df[ams_df['Hit'] == 1].copy()
real_df = real_df[real_df['Hit'] == 1].copy()

# Placeholder values for the filter column; a later cell overwrites them.
train_df[as_frh_filter_col] = 0
ams_df[as_frh_filter_col] = 0
real_df[as_frh_filter_col] = 0

  return f(*args, **kwds)


In [2]:
# Read the SMARTS patterns and compile each one into an RDKit query molecule.
# Every line of the file is split on ': ' and the second field (column 1) is
# taken as the SMARTS string. A separator longer than one character is not
# supported by pandas' C parser, so the python engine is requested explicitly
# instead of relying on the implicit fallback (which emits a ParserWarning).
smarts_list = pd.read_csv(
    '../datasets/raw/ToxAlerts_Schorpp2014_SMARTS.txt',
    delimiter=': ',
    header=None,
    engine='python',
)[1].tolist()
smarts_patterns = [Chem.MolFromSmarts(smarts) for smarts in smarts_list]

# MolFromSmarts returns None when a pattern cannot be parsed; fail here with
# the offending strings rather than later inside HasSubstructMatch.
bad_smarts = [smarts for smarts, pattern in zip(smarts_list, smarts_patterns) if pattern is None]
if bad_smarts:
    raise ValueError('Could not parse SMARTS pattern(s): {}'.format(bad_smarts))

In [4]:
def matches_any_pattern(smiles, patterns):
    """Return True if the molecule for `smiles` matches at least one query.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.
    patterns : iterable
        RDKit query molecules (e.g. from ``Chem.MolFromSmarts``).

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` (``Chem.MolFromSmiles`` returned None).
    """
    # Parse once per compound instead of once per (compound, pattern) pair.
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))
    # Generator (not a list) so any() stops at the first matching pattern.
    return any(mol.HasSubstructMatch(pattern) for pattern in patterns)


# Flag every compound in each hit set; the column assignment modifies the
# train_df / ams_df / real_df objects themselves, exactly as a direct
# `df[col] = ...` statement would.
for hits_df in (train_df, ams_df, real_df):
    hits_df[as_frh_filter_col] = (
        hits_df[smiles_col].astype(str).apply(matches_any_pattern, args=(smarts_patterns,))
    )

In [16]:
# Number of rows in each hit set whose filter column is True
# (the column is used directly as a boolean row mask).
train_frh_count, ams_frh_count, real_frh_count = (
    len(hits_df[hits_df[as_frh_filter_col]])
    for hits_df in (train_df, ams_df, real_df)
)

In [17]:
# One summary line per hit set: flagged count versus total row count.
frh_summary = (
    ('Train AS Freq. Hitters: {} out of {} compounds.', train_frh_count, train_df),
    ('AMS AS Freq. Hitters:   {} out of {} compounds.', ams_frh_count, ams_df),
    ('REAL AS Freq. Hitters:  {} out of {} compounds.', real_frh_count, real_df),
)
for template, frh_count, hits_df in frh_summary:
    print(template.format(frh_count, len(hits_df)))

Train AS Freq. Hitters: 150 out of 554 compounds.
AMS AS Freq. Hitters:   182 out of 412 compounds.
REAL AS Freq. Hitters:  9 out of 31 compounds.
