In [1]:
import sys, os
import numpy as np
import pandas as pd
from rdkit import Chem

### Find analogs from Zinc20 instock dataset

Because of the size of the Zinc20 in-stock dataset, Zinc_instock.txt is not included in this directory; the script used to calculate the similarities is provided instead (at dataset/script/cal_similar.py).

In [2]:
# Read Zinc_instock.txt into a DataFrame and preview its first two rows.
df = pd.read_csv(filepath_or_buffer='dataset/curated_Zinc20_instock/Zinc_instock.txt')
df.head(n=2)

Unnamed: 0,zinc_id,new_smile,tanimoto_sim,cluster
0,4371221,CO[C@H]1OC[C@@H](O)[C@H](O)[C@H]1O,0.081,16
1,34310585,C[S@@](=O)CC(N)=O,0.091,10


1. The new_smile column is the canonical SMILES string generated by RDKit;
2. The tanimoto_sim column is the maximum Tanimoto similarity between the query molecule and the 25 known binders (from the 1.3 clustering step);
3. The cluster column is the label of the known binder that is most similar to the query molecule.

In [3]:
# Number of rows in the loaded table.
df.shape[0]

10309353

The Zinc20 in-stock dataset contains about 10.3 million compounds in total.

In [4]:
# Keep only the rows whose tanimoto_sim is at least 0.3, then report how many remain.
df1 = df.loc[df['tanimoto_sim'].ge(0.3)]
df1.shape[0]

16474

After applying the cutoff (tanimoto_sim >= 0.3), only 16474 compounds are selected for further analysis.

In [5]:
# For each cluster label, count the non-null entries in every other column.
df1.groupby(by='cluster').count()

Unnamed: 0_level_0,zinc_id,new_smile,tanimoto_sim
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,227,227,227
1,2419,2419,2419
2,43,43,43
3,65,65,65
4,54,54,54
5,24,24,24
6,1,1,1
7,366,366,366
8,22,22,22
9,1692,1692,1692


As shown in the table above, each known binder (clusters 0 to 24) has at least one analog.

In [6]:
# Swap the index carried over from the filtered frame for a fresh 0..N-1 index
# (the old labels are discarded), then show the first ten rows.
df1 = df1.reset_index(drop=True)
df1.iloc[:10]

Unnamed: 0,zinc_id,new_smile,tanimoto_sim,cluster
0,20258057,O=C(c1c[nH]c(=O)[nH]1)N1CCNCC1,0.333,13
1,42420199,CCOC(=O)C(=O)N1CCNCC1,0.337,13
2,2456218,O=C(O)C(=O)N1CCNCC1,0.338,13
3,8700976,NC(=O)C(=O)N1CCNCC1,0.321,13
4,36949583,C[C@@H](O)C(=O)N1CCNCC1,0.304,13
5,34978177,O=C(CO)N1CCNCC1,0.312,13
6,2456219,O=C(O)CC(=O)N1CCNCC1,0.338,13
7,39570052,O=C(c1nc[nH]n1)N1CCNCC1,0.31,13
8,36949582,C[C@H](O)C(=O)N1CCNCC1,0.304,13
9,4219193,CC(C)NC(=O)C(=O)N1CCNCC1,0.314,13


In [7]:
# Write the selected analogs to CSV; the row index is not written to the file.
df1.to_csv(
    path_or_buf='dataset/curated_Zinc20_instock/curated_Zinc_analogs.csv',
    index=False,
)

### Blood-brain-barrier permeability (BBBP) prediction

LightBBB is used to predict the BBBP. It provides an online server where SMILES strings can be uploaded to obtain BBBP predictions.

The file curated_Zinc_analogs_BBBP.csv already contains the BBBP predictions.

In [2]:
# Load the analogs together with their BBBP labels, keeping only the first row
# for each distinct new_smile value, then preview two rows.
df = (
    pd.read_csv('dataset/curated_Zinc20_instock/curated_Zinc_analogs_BBBP.csv')
    .drop_duplicates(subset='new_smile')
)
df.head(n=2)

Unnamed: 0,zinc_id,new_smile,tanimoto_sim,cluster,BBBP
0,20258057,O=C(c1c[nH]c(=O)[nH]1)N1CCNCC1,0.333,13,Permeable
1,42420199,CCOC(=O)C(=O)N1CCNCC1,0.337,13,Permeable


In [3]:
# For each BBBP label, count the non-null entries in every other column.
df.groupby(by='BBBP').count()

Unnamed: 0_level_0,zinc_id,new_smile,tanimoto_sim,cluster
BBBP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Non-Permeable,3166,3166,3166,3166
Permeable,13302,13302,13302,13302


In [4]:
# Retain only the rows labelled 'Permeable' and report how many there are.
df1 = df.loc[df['BBBP'].eq('Permeable')]
df1.shape[0]

13302

In [5]:
# Renumber the rows 0..N-1 (old index labels are dropped), then count the
# non-null entries per cluster in the remaining columns.
df1 = df1.reset_index(drop=True)
df1.groupby(by='cluster').count()

Unnamed: 0_level_0,zinc_id,new_smile,tanimoto_sim,BBBP
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,164,164,164,164
1,1891,1891,1891,1891
2,41,41,41,41
3,51,51,51,51
4,35,35,35,35
5,13,13,13,13
6,1,1,1,1
7,81,81,81,81
8,20,20,20,20
9,1206,1206,1206,1206


In [6]:
# Give every row a sequential ligand_id, starting at 1 and following the
# current row order, then preview the first three rows.
df1['ligand_id'] = range(1, len(df1) + 1)
df1.head(n=3)

Unnamed: 0,zinc_id,new_smile,tanimoto_sim,cluster,BBBP,ligand_id
0,20258057,O=C(c1c[nH]c(=O)[nH]1)N1CCNCC1,0.333,13,Permeable,1
1,42420199,CCOC(=O)C(=O)N1CCNCC1,0.337,13,Permeable,2
2,2456218,O=C(O)C(=O)N1CCNCC1,0.338,13,Permeable,3


In [7]:
# Write the permeable, ligand_id-numbered analogs to CSV without the row index.
df1.to_csv(
    path_or_buf='dataset/curated_Zinc20_instock/curated_Zinc_analogs_BBBP_select.csv',
    index=False,
)