In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from glob import glob

from rogi import SARI, RoughnessIndex

from rdkit import Chem
from rdkit.Chem import Descriptors

import seaborn as sns
sns.set(style='ticks')

In [2]:
# =====================
# Parse ChEMBL datasets
# =====================

csv_files = glob('chembl_datasets/*csv')
all_pKi = []
all_smi = []

for csv_file in csv_files:
    # parse csv file
    data = pd.read_csv(csv_file, index_col=0)
    data = data.dropna()   # rm NaN entries
    smiles = data.loc[:, 'smiles']  # get smiles
    pKi =  - np.log10(data.loc[:, 'exp_mean [nM]'].to_numpy() * 10 **(-9))  # take log affinity
    all_smi.append(smiles)
    all_pKi.append(pKi)

# store results in dataframe
df = pd.DataFrame(columns=['dataset', 'sari_maccs', 'sari_morgan', 'sari_descs'])
df.loc[:, 'dataset'] = [f.split('/')[1].split('.')[0] for f in csv_files]

## SARI with MACCS

In [3]:
# ====================
# Get SARI raw scores
# ====================

raw_conts = []
raw_discs = []

for smi, pKi in zip(all_smi, all_pKi):
    sari = SARI(pKi=pKi, smiles=smi, fingerprints='maccs')
    raw_cont, raw_disc = sari.compute_raw_scores()
    raw_conts.append(raw_cont)
    raw_discs.append(raw_disc)

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

In [4]:
# ========
# Get SARI 
# ========

sari_scores = []

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

for smi, pKi in zip(all_smi, all_pKi):
    sari = SARI(pKi=pKi, smiles=smi, fingerprints='maccs')
    sari_score = sari.compute_sari(mean_raw_cont=mean_raw_cont, std_raw_cont=std_raw_cont,
                                   mean_raw_disc=mean_raw_disc, std_raw_disc=std_raw_disc)
    sari_scores.append(sari_score)

df.loc[:, 'sari_maccs'] = sari_scores

## SARI with Morgan FPs

In [5]:
# ====================
# Get SARI raw scores
# ====================

raw_conts = []
raw_discs = []

for smi, pKi in zip(all_smi, all_pKi):
    sari = SARI(pKi=pKi, smiles=smi, fingerprints='morgan')
    raw_cont, raw_disc = sari.compute_raw_scores()
    raw_conts.append(raw_cont)
    raw_discs.append(raw_disc)

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

In [6]:
# ========
# Get SARI 
# ========

sari_scores = []

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

for smi, pKi in zip(all_smi, all_pKi):
    sari = SARI(pKi=pKi, smiles=smi, fingerprints='morgan')
    sari_score = sari.compute_sari(mean_raw_cont=mean_raw_cont, std_raw_cont=std_raw_cont,
                                   mean_raw_disc=mean_raw_disc, std_raw_disc=std_raw_disc)
    sari_scores.append(sari_score)

df.loc[:, 'sari_morgan'] = sari_scores

## SARI with Descriptors

In [7]:
def smiles_to_descriptors_df(smiles, descriptors=None):
    Xdict = {}
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    for desc in Descriptors.descList:
        name = desc[0]
        f = desc[1]

        if descriptors is not None and name in descriptors:
            Xdict[name] = [f(m) for m in mols]
    return pd.DataFrame(Xdict)

        
desc_names = ['MolWt', 'FractionCSP3', 'NumHAcceptors', 'NumHDonors', 'NOCount', 'NHOHCount',
              'NumAliphaticRings', 'NumAliphaticHeterocycles', 'NumAromaticHeterocycles', 'NumAromaticRings', 
              'NumRotatableBonds', 'TPSA', 'qed', 'MolLogP']

In [8]:
# ====================
# Get SARI raw scores
# ====================

raw_conts = []
raw_discs = []

for smi, pKi in zip(all_smi, all_pKi):
    # descriptors
    df_X = smiles_to_descriptors_df(smi, descriptors=desc_names)
    # normalize features like we do for ROGI
    X = df_X.to_numpy()
    _X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    
    # use ROGI code just to get the distance matrix
    ri = RoughnessIndex(Y=pKi, X=_X, metric='euclidean', verbose=False)
    dist_matrix = squareform(ri._Dx)
    sim_matrix = 1. - dist_matrix
    
    # get SARI
    sari = SARI(pKi, sim_matrix=sim_matrix)
    raw_cont, raw_disc = sari.compute_raw_scores()
    raw_conts.append(raw_cont)
    raw_discs.append(raw_disc)

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

In [9]:
# ========
# Get SARI 
# ========

sari_scores = []

mean_raw_cont = np.mean(raw_conts)
std_raw_cont = np.std(raw_conts)
mean_raw_disc = np.mean(raw_discs)
std_raw_disc = np.std(raw_discs)

for smi, pKi in zip(all_smi, all_pKi):
    # descriptors
    df_X = smiles_to_descriptors_df(smi, descriptors=desc_names)
    # normalize features like we do for ROGI
    X = df_X.to_numpy()
    _X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
    
    # use ROGI code just to get the distance matrix
    ri = RoughnessIndex(Y=pKi, X=_X, metric='euclidean', verbose=False)
    dist_matrix = squareform(ri._Dx)
    sim_matrix = 1. - dist_matrix
    
    # get SARI
    sari = SARI(pKi, sim_matrix=sim_matrix)
    sari_score = sari.compute_sari(mean_raw_cont=mean_raw_cont, std_raw_cont=std_raw_cont,
                                   mean_raw_disc=mean_raw_disc, std_raw_disc=std_raw_disc)
    sari_scores.append(sari_score)

df.loc[:, 'sari_descs'] = sari_scores

In [10]:
df

Unnamed: 0,dataset,sari_maccs,sari_morgan,sari_descs
0,CHEMBL2034_Ki,0.344895,0.296986,0.382991
1,CHEMBL219_Ki,0.08562,0.135909,0.066079
2,CHEMBL2147_Ki,0.015804,0.080855,0.000324
3,CHEMBL236_Ki,0.022981,0.062994,0.048977
4,CHEMBL3979_EC50,0.125841,0.073727,0.047772
5,CHEMBL2835_Ki,0.000236,1.1e-05,0.43146
6,CHEMBL264_Ki,0.118019,0.214854,0.107444
7,CHEMBL238_Ki,0.333856,0.211574,0.008837
8,CHEMBL1871_Ki,0.396657,0.279095,0.201517
9,CHEMBL2971_Ki,0.014116,0.005421,0.13983


In [11]:
df.to_csv('regression_sari_scores.csv', index=False)