In [1]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from multiprocessing import  Pool,cpu_count
from itertools import product
from functools import partial
from rdkit import Chem, DataStructs
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import MACCSkeys, AllChem

In [2]:
# Load the drug table; the cells below read its `drugname` and `smile` columns.
allsmile = pd.read_csv(filepath_or_buffer='./AD_project/druglist_smiles.csv')

# Atom Pairs 

In [3]:
def _atompair_fingerprint(smiles):
    """Return the RDKit atom-pair fingerprint for `smiles`, or None if it cannot be built."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # RDKit returns None (rather than raising) for an unparsable SMILES
            return None
        return Pairs.GetAtomPairFingerprint(mol)
    except Exception:
        # e.g. a non-string cell such as a missing value read from the CSV
        return None


def atompairfu (partialframe, fullframe):
    """Score every drug in `partialframe` against every drug in `fullframe`.

    Similarity is the Dice coefficient between RDKit atom-pair fingerprints.

    Parameters
    ----------
    partialframe : DataFrame with `drugname` and `smile` columns (the "a" side).
    fullframe : DataFrame with `drugname` and `smile` columns (the "b" side).

    Returns
    -------
    DataFrame with columns `drug_a`, `drug_b`, `score`; one row per (a, b) pair,
    ordered by row of `partialframe`, then row of `fullframe`. `score` is float
    and is NaN when either fingerprint could not be built or the comparison failed.
    """
    names_b = fullframe.drugname.values
    fps_b = [_atompair_fingerprint(s) for s in fullframe.smile.values]
    # Pull the column arrays once instead of re-reading them on every iteration.
    names_a = partialframe.drugname.values
    smiles_a = partialframe.smile.values

    drugalist = []
    drugblist = []
    simscorelist = []
    for i in tqdm(range(len(partialframe))):
        druga = names_a[i]
        fp_a = _atompair_fingerprint(smiles_a[i])
        for drugb, fp_b in zip(names_b, fps_b):
            # NaN (not '') marks a missing score so the column stays numeric
            # and mean()/std() downstream skip it.
            simscore = np.nan
            if fp_a is not None and fp_b is not None:
                try:
                    simscore = DataStructs.DiceSimilarity(fp_a, fp_b)  # Dice, not Tanimoto
                except Exception:
                    simscore = np.nan
            drugalist.append(druga)
            drugblist.append(drugb)
            simscorelist.append(simscore)
    return pd.DataFrame({
        'drug_a': drugalist,
        'drug_b': drugblist,
        # explicit float dtype keeps the column numeric even for an empty chunk
        'score': np.array(simscorelist, dtype=float),
    })

In [4]:
# Never start more workers than the machine has CPUs.
n_cores = min(40, cpu_count())
# Split by row position: handing the DataFrame itself to np.array_split goes
# through DataFrame.swapaxes, which newer pandas versions deprecate.
row_chunks = np.array_split(np.arange(len(allsmile)), n_cores)
score_split = [allsmile.iloc[rows] for rows in row_chunks]
# The context manager shuts the worker processes down once map() has returned.
with Pool(n_cores) as pool:
    final_train = pd.concat(pool.map(partial(atompairfu, fullframe=allsmile), score_split))
# Force a numeric column: any non-numeric placeholder becomes NaN and is then
# ignored by mean()/std() instead of breaking the arithmetic.
final_train['score'] = pd.to_numeric(final_train['score'], errors='coerce')
final_train['zscore'] = (final_train.score - final_train.score.mean())/final_train.score.std(ddof=0)

100%|██████████| 103/103 [00:04<00:00, 21.35it/s]
 96%|█████████▌| 98/102 [00:05<00:00, 21.69it/s]]
100%|██████████| 103/103 [00:05<00:00, 18.95it/s]
100%|██████████| 102/102 [00:05<00:00, 19.67it/s]
100%|██████████| 102/102 [00:05<00:00, 19.05it/s]
100%|██████████| 102/102 [00:04<00:00, 20.86it/s]
100%|██████████| 102/102 [00:05<00:00, 18.42it/s]
100%|██████████| 103/103 [00:04<00:00, 20.62it/s]

100%|██████████| 102/102 [00:04<00:00, 20.79it/s]
100%|██████████| 102/102 [00:05<00:00, 19.28it/s]
100%|██████████| 103/103 [00:05<00:00, 20.25it/s]
100%|██████████| 103/103 [00:05<00:00, 20.30it/s]
100%|██████████| 102/102 [00:05<00:00, 20.04it/s]
100%|██████████| 102/102 [00:05<00:00, 20.27it/s]
 95%|█████████▌| 98/103 [00:05<00:00, 19.26it/s]
100%|██████████| 102/102 [00:05<00:00, 19.78it/s]
100%|██████████| 102/102 [00:05<00:00, 19.56it/s]
100%|██████████| 102/102 [00:05<00:00, 19.87it/s]
100%|██████████| 102/102 [00:05<00:00, 19.52it/s]
 99%|█████████▉| 101/102 [00:05<00:00, 22.39it/s]


# MACCS Keys

In [5]:
def _maccs_fingerprint(smiles):
    """Return the RDKit MACCS-keys fingerprint for `smiles`, or None if it cannot be built."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # RDKit returns None (rather than raising) for an unparsable SMILES
            return None
        return MACCSkeys.GenMACCSKeys(mol)
    except Exception:
        # e.g. a non-string cell such as a missing value read from the CSV
        return None


def MACCSfu (partialframe, fullframe):
    """Score every drug in `partialframe` against every drug in `fullframe`.

    Similarity is `DataStructs.FingerprintSimilarity` (called with its default
    metric) between RDKit MACCS-keys fingerprints.

    Parameters
    ----------
    partialframe : DataFrame with `drugname` and `smile` columns (the "a" side).
    fullframe : DataFrame with `drugname` and `smile` columns (the "b" side).

    Returns
    -------
    DataFrame with columns `drug_a`, `drug_b`, `score`; one row per (a, b) pair,
    ordered by row of `partialframe`, then row of `fullframe`. `score` is float
    and is NaN when either fingerprint could not be built or the comparison failed.
    """
    names_b = fullframe.drugname.values
    fps_b = [_maccs_fingerprint(s) for s in fullframe.smile.values]
    # Pull the column arrays once instead of re-reading them on every iteration.
    names_a = partialframe.drugname.values
    smiles_a = partialframe.smile.values

    drugalist = []
    drugblist = []
    simscorelist = []
    for i in tqdm(range(len(partialframe))):
        druga = names_a[i]
        fp_a = _maccs_fingerprint(smiles_a[i])
        for drugb, fp_b in zip(names_b, fps_b):
            # NaN (not '') marks a missing score so the column stays numeric
            # and mean()/std() downstream skip it.
            simscore = np.nan
            if fp_a is not None and fp_b is not None:
                try:
                    simscore = DataStructs.FingerprintSimilarity(fp_a, fp_b)  # default metric: Tanimoto
                except Exception:
                    simscore = np.nan
            drugalist.append(druga)
            drugblist.append(drugb)
            simscorelist.append(simscore)
    return pd.DataFrame({
        'drug_a': drugalist,
        'drug_b': drugblist,
        # explicit float dtype keeps the column numeric even for an empty chunk
        'score': np.array(simscorelist, dtype=float),
    })

In [6]:
# Never start more workers than the machine has CPUs.
n_cores = min(40, cpu_count())
# Split by row position: handing the DataFrame itself to np.array_split goes
# through DataFrame.swapaxes, which newer pandas versions deprecate.
row_chunks = np.array_split(np.arange(len(allsmile)), n_cores)
score_split = [allsmile.iloc[rows] for rows in row_chunks]
# The context manager shuts the worker processes down once map() has returned.
with Pool(n_cores) as pool:
    final_train1 = pd.concat(pool.map(partial(MACCSfu, fullframe=allsmile), score_split))
# Force a numeric column: any non-numeric placeholder becomes NaN and is then
# ignored by mean()/std() instead of breaking the arithmetic.
final_train1['score'] = pd.to_numeric(final_train1['score'], errors='coerce')
final_train1['zscore'] = (final_train1.score - final_train1.score.mean())/final_train1.score.std(ddof=0)

100%|██████████| 103/103 [00:02<00:00, 35.11it/s]
 39%|███▉      | 40/102 [00:01<00:02, 29.16it/s]]
100%|██████████| 102/102 [00:03<00:00, 33.21it/s]
100%|██████████| 102/102 [00:03<00:00, 30.50it/s]
100%|██████████| 103/103 [00:03<00:00, 30.27it/s]
100%|██████████| 102/102 [00:02<00:00, 34.61it/s]
100%|██████████| 103/103 [00:03<00:00, 33.96it/s]
100%|██████████| 103/103 [00:03<00:00, 30.84it/s]
100%|██████████| 102/102 [00:02<00:00, 34.92it/s]
100%|██████████| 102/102 [00:03<00:00, 31.70it/s]
100%|██████████| 102/102 [00:02<00:00, 37.07it/s]
100%|██████████| 102/102 [00:03<00:00, 31.59it/s]
100%|██████████| 103/103 [00:02<00:00, 36.07it/s]
 85%|████████▌ | 87/102 [00:02<00:00, 29.45it/s]]
 94%|█████████▍| 96/102 [00:03<00:00, 31.73it/s]]
100%|██████████| 102/102 [00:03<00:00, 32.77it/s]
100%|██████████| 102/102 [00:03<00:00, 32.13it/s]
100%|██████████| 103/103 [00:03<00:00, 32.76it/s]
100%|██████████| 102/102 [00:03<00:00, 31.05it/s]
100%|██████████| 102/102 [00:02<00:00, 34.04it/s]


# Morgan Fingerprint Similarity (set radius = 2)

In [7]:
# Set radius = 2
def _morgan_fingerprint(smiles):
    """Return the radius-2 RDKit Morgan fingerprint for `smiles`, or None if it cannot be built."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # RDKit returns None (rather than raising) for an unparsable SMILES
            return None
        return AllChem.GetMorganFingerprint(mol, 2)
    except Exception:
        # e.g. a non-string cell such as a missing value read from the CSV
        return None


def Morganfu (partialframe, fullframe):
    """Score every drug in `partialframe` against every drug in `fullframe`.

    Similarity is the Dice coefficient between radius-2 RDKit Morgan fingerprints.

    Parameters
    ----------
    partialframe : DataFrame with `drugname` and `smile` columns (the "a" side).
    fullframe : DataFrame with `drugname` and `smile` columns (the "b" side).

    Returns
    -------
    DataFrame with columns `drug_a`, `drug_b`, `score`; one row per (a, b) pair,
    ordered by row of `partialframe`, then row of `fullframe`. `score` is float
    and is NaN when either fingerprint could not be built or the comparison failed.
    """
    names_b = fullframe.drugname.values
    fps_b = [_morgan_fingerprint(s) for s in fullframe.smile.values]
    # Pull the column arrays once instead of re-reading them on every iteration.
    names_a = partialframe.drugname.values
    smiles_a = partialframe.smile.values

    drugalist = []
    drugblist = []
    simscorelist = []
    for i in tqdm(range(len(partialframe))):
        druga = names_a[i]
        fp_a = _morgan_fingerprint(smiles_a[i])
        for drugb, fp_b in zip(names_b, fps_b):
            # NaN (not '') marks a missing score so the column stays numeric
            # and mean()/std() downstream skip it.
            simscore = np.nan
            if fp_a is not None and fp_b is not None:
                try:
                    simscore = DataStructs.DiceSimilarity(fp_a, fp_b)  # Dice, not Tanimoto
                except Exception:
                    simscore = np.nan
            drugalist.append(druga)
            drugblist.append(drugb)
            simscorelist.append(simscore)
    return pd.DataFrame({
        'drug_a': drugalist,
        'drug_b': drugblist,
        # explicit float dtype keeps the column numeric even for an empty chunk
        'score': np.array(simscorelist, dtype=float),
    })

In [8]:
# Never start more workers than the machine has CPUs.
n_cores = min(40, cpu_count())
# Split by row position: handing the DataFrame itself to np.array_split goes
# through DataFrame.swapaxes, which newer pandas versions deprecate.
row_chunks = np.array_split(np.arange(len(allsmile)), n_cores)
score_split = [allsmile.iloc[rows] for rows in row_chunks]
# The context manager shuts the worker processes down once map() has returned.
with Pool(n_cores) as pool:
    final_train2 = pd.concat(pool.map(partial(Morganfu, fullframe=allsmile), score_split))
# Force a numeric column: any non-numeric placeholder becomes NaN and is then
# ignored by mean()/std() instead of breaking the arithmetic.
final_train2['score'] = pd.to_numeric(final_train2['score'], errors='coerce')
final_train2['zscore'] = (final_train2.score - final_train2.score.mean())/final_train2.score.std(ddof=0)

100%|██████████| 103/103 [00:01<00:00, 57.32it/s]
100%|██████████| 102/102 [00:01<00:00, 58.97it/s]
 68%|██████▊   | 69/102 [00:01<00:00, 44.81it/s]]
 65%|██████▌   | 67/103 [00:01<00:00, 40.67it/s]]
100%|██████████| 102/102 [00:02<00:00, 49.88it/s]
100%|██████████| 102/102 [00:02<00:00, 47.48it/s]
100%|██████████| 102/102 [00:02<00:00, 50.32it/s]
 87%|████████▋ | 89/102 [00:01<00:00, 44.59it/s]]
 74%|███████▍  | 76/103 [00:01<00:00, 39.13it/s]]
100%|██████████| 102/102 [00:02<00:00, 45.85it/s]
100%|██████████| 102/102 [00:02<00:00, 44.44it/s]
 92%|█████████▏| 94/102 [00:01<00:00, 45.47it/s]]
 86%|████████▋ | 88/102 [00:01<00:00, 44.58it/s]]

 84%|████████▍ | 86/102 [00:01<00:00, 45.65it/s]]
100%|██████████| 102/102 [00:02<00:00, 50.11it/s]
100%|██████████| 102/102 [00:02<00:00, 46.03it/s]
100%|██████████| 102/102 [00:02<00:00, 49.33it/s]
100%|██████████| 102/102 [00:02<00:00, 48.87it/s]
100%|██████████| 102/102 [00:02<00:00, 44.72it/s]
 96%|█████████▌| 99/103 [00:02<00:00, 46.74it/s]]

# Topological Fingerprints

In [9]:
def _topological_fingerprint(smiles):
    """Return `Chem.RDKFingerprint` for `smiles`, or None if it cannot be built."""
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # RDKit returns None (rather than raising) for an unparsable SMILES
            return None
        return Chem.RDKFingerprint(mol)
    except Exception:
        # e.g. a non-string cell such as a missing value read from the CSV
        return None


def Topofu (partialframe, fullframe):
    """Score every drug in `partialframe` against every drug in `fullframe`.

    Similarity is `DataStructs.FingerprintSimilarity` (called with its default
    metric) between `Chem.RDKFingerprint` topological fingerprints.

    Parameters
    ----------
    partialframe : DataFrame with `drugname` and `smile` columns (the "a" side).
    fullframe : DataFrame with `drugname` and `smile` columns (the "b" side).

    Returns
    -------
    DataFrame with columns `drug_a`, `drug_b`, `score`; one row per (a, b) pair,
    ordered by row of `partialframe`, then row of `fullframe`. `score` is float
    and is NaN when either fingerprint could not be built or the comparison failed.
    """
    names_b = fullframe.drugname.values
    fps_b = [_topological_fingerprint(s) for s in fullframe.smile.values]
    # Pull the column arrays once instead of re-reading them on every iteration.
    names_a = partialframe.drugname.values
    smiles_a = partialframe.smile.values

    drugalist = []
    drugblist = []
    simscorelist = []
    for i in tqdm(range(len(partialframe))):
        druga = names_a[i]
        fp_a = _topological_fingerprint(smiles_a[i])
        for drugb, fp_b in zip(names_b, fps_b):
            # NaN (not '') marks a missing score so the column stays numeric
            # and mean()/std() downstream skip it.
            simscore = np.nan
            if fp_a is not None and fp_b is not None:
                try:
                    simscore = DataStructs.FingerprintSimilarity(fp_a, fp_b)  # default metric: Tanimoto
                except Exception:
                    simscore = np.nan
            drugalist.append(druga)
            drugblist.append(drugb)
            simscorelist.append(simscore)
    return pd.DataFrame({
        'drug_a': drugalist,
        'drug_b': drugblist,
        # explicit float dtype keeps the column numeric even for an empty chunk
        'score': np.array(simscorelist, dtype=float),
    })

In [10]:
# Never start more workers than the machine has CPUs.
n_cores = min(40, cpu_count())
# Split by row position: handing the DataFrame itself to np.array_split goes
# through DataFrame.swapaxes, which newer pandas versions deprecate.
row_chunks = np.array_split(np.arange(len(allsmile)), n_cores)
score_split = [allsmile.iloc[rows] for rows in row_chunks]
# The context manager shuts the worker processes down once map() has returned.
with Pool(n_cores) as pool:
    final_train4 = pd.concat(pool.map(partial(Topofu, fullframe=allsmile), score_split))
# Force a numeric column: any non-numeric placeholder becomes NaN and is then
# ignored by mean()/std() instead of breaking the arithmetic.
final_train4['score'] = pd.to_numeric(final_train4['score'], errors='coerce')
final_train4['zscore'] = (final_train4.score - final_train4.score.mean())/final_train4.score.std(ddof=0)

100%|██████████| 103/103 [00:02<00:00, 42.60it/s]
100%|██████████| 102/102 [00:02<00:00, 42.97it/s]
100%|██████████| 102/102 [00:02<00:00, 41.72it/s]
100%|██████████| 102/102 [00:02<00:00, 42.50it/s]
100%|██████████| 103/103 [00:02<00:00, 42.07it/s]
100%|██████████| 102/102 [00:02<00:00, 35.91it/s]
100%|██████████| 102/102 [00:03<00:00, 33.11it/s]
 41%|████      | 42/102 [00:01<00:02, 27.06it/s]]
100%|██████████| 102/102 [00:03<00:00, 32.78it/s]
100%|██████████| 103/103 [00:02<00:00, 34.72it/s]
100%|██████████| 102/102 [00:03<00:00, 31.21it/s]
100%|██████████| 103/103 [00:03<00:00, 32.31it/s]
100%|██████████| 102/102 [00:03<00:00, 30.53it/s]
 41%|████      | 42/103 [00:01<00:01, 34.78it/s]]
100%|██████████| 102/102 [00:02<00:00, 38.66it/s]
100%|██████████| 102/102 [00:03<00:00, 31.29it/s]
100%|██████████| 102/102 [00:03<00:00, 31.43it/s]
 81%|████████▏ | 83/102 [00:02<00:00, 36.56it/s]]
100%|██████████| 103/103 [00:02<00:00, 36.05it/s]
100%|██████████| 103/103 [00:02<00:00, 35.16it/s]


# Comparison

In [11]:
# Set threshold of zscore >= 3. Use only atom-pair, MACCS, Morgan_2, Topology
# z-score cut-off applied to each similarity table (atom-pair, MACCS, Morgan radius 2, topological)
i = 3
path = '/home/AD_project/similarity_tables/'
# Filter every table by its OWN zscore column. Previously the MACCS and Morgan
# tables were masked with the atom-pair z-scores, so their own scores were ignored.
score_tables = [final_train, final_train1, final_train2, final_train4]
df = pd.concat([table[table.zscore >= i] for table in score_tables])
# A pair that passes the threshold in more than one table is kept once.
df = df[['drug_a', 'drug_b']].drop_duplicates()
for col in ('drug_a', 'drug_b'):
    df[col] = 'drug_' + df[col].astype(str)
#df.to_csv('./AD_project/drug_similarity.csv')