In [1]:
import pandas as pd
import numpy as np
import warnings

In [10]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem, PandasTools, MACCSkeys, AtomPairs, rdFingerprintGenerator
from rdkit import DataStructs
from rdkit.Chem.rdmolops import PatternFingerprint
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprintAsBitVect

# Notebook-wide settings.
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Lift pandas' row cap so displayed frames are never truncated.
pd.set_option('display.max_rows', None)

In [11]:
# Read the tab-separated drug/SMILES table; the first line holds the column names.
df = pd.read_table('NetGP_data/gdsc_drug_smiles_data.tsv', header=0)
df.head()

Unnamed: 0,drug_name,smiles
0,(5Z)-7-Oxozeaenol,CC1CC=CC(=O)C(C(CC=CC2=C(C(=CC(=C2)OC)O)C(=O)O...
1,5-Fluorouracil,C1=C(C(=O)NC(=O)N1)F
2,Afatinib,CN(C)CC=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(...
3,AICA Ribonucleotide,C1=NC(=C(N1C2C(C(C(O2)COP(=O)(O)O)O)O)N)C(=O)N
4,AKT inhibitor VIII,C1CN(CCC1N2C3=CC=CC=C3NC2=O)CC4=CC=C(C=C4)C5=C...


In [12]:
# Parse each entry of the 'smiles' column into an RDKit molecule, stored in a new 'mol' column
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='mol')



# Morgan Fingerprint

In [15]:
# Encode every molecule as a 2048-bit Morgan fingerprint (radius 1), one array per drug.
df_mf = []
for mol in df['mol']:
    mf_bitvector = AllChem.GetMorganFingerprintAsBitVect(mol, radius=1, nBits=2048)
    # Zero-length placeholder: ConvertToNumpyArray resizes it to the fingerprint length.
    arr = np.zeros((0,), dtype=np.int8)

    # Convert the rdkit explicit vectors into numpy arrays
    DataStructs.ConvertToNumpyArray(mf_bitvector, arr)
    df_mf.append(arr)

# pd.concat(axis=1) aligns on index labels, so the bit columns must carry df's own
# index; a fresh RangeIndex would misalign rows (and inject NaNs) whenever df has
# been filtered or otherwise lacks the default 0..n-1 index.
MF = pd.concat([df, pd.DataFrame(df_mf, index=df.index)], axis=1)

# Substructure generation using Tanimoto similarity

In [2]:
import pandas as pd
import numpy as np
import warnings

# Reload the tab-separated drug/SMILES table for the substructure analysis.
data = pd.read_table('NetGP_data/gdsc_drug_smiles_data.tsv', header=0)

# Preview the first rows, then report (rows, columns).
print(data.head(), data.shape, sep='\n')


             drug_name                                             smiles
0    (5Z)-7-Oxozeaenol  CC1CC=CC(=O)C(C(CC=CC2=C(C(=CC(=C2)OC)O)C(=O)O...
1       5-Fluorouracil                               C1=C(C(=O)NC(=O)N1)F
2             Afatinib  CN(C)CC=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(...
3  AICA Ribonucleotide     C1=NC(=C(N1C2C(C(C(O2)COP(=O)(O)O)O)O)N)C(=O)N
4   AKT inhibitor VIII  C1CN(CCC1N2C3=CC=CC=C3NC2=O)CC4=CC=C(C=C4)C5=C...
(454, 2)


1. SMILES --> Fingerprint

In [3]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

# remove 'None' molecule
# Collect index *labels* (not enumerate positions) of SMILES that RDKit cannot parse:
# DataFrame.drop is label-based, so positions only match on a pristine 0..n-1 index
# and would target the wrong rows if this cell is re-run after rows were removed.
idxs = []
for label, smiles in data["smiles"].items():
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:  # MolFromSmiles signals a parse failure by returning None
        print(label, mol)
        idxs.append(label)
data = data.drop(index=idxs)

print(data.shape)

(454, 2)




In [4]:
# Fingerprint
# Build one 2048-bit Morgan fingerprint (radius 2) per SMILES and keep the RDKit bit
# vector itself in the 'fp' column. The previous per-row numpy conversion was dropped:
# its result was never stored or read, so it only cost time.
fps = []
for smiles in data["smiles"]:
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    fps.append(fp)

data['fp'] = fps
data.head()



Unnamed: 0,drug_name,smiles,fp
0,(5Z)-7-Oxozeaenol,CC1CC=CC(=O)C(C(CC=CC2=C(C(=CC(=C2)OC)O)C(=O)O...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,5-Fluorouracil,C1=C(C(=O)NC(=O)N1)F,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Afatinib,CN(C)CC=CC(=O)NC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC(...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,AICA Ribonucleotide,C1=NC(=C(N1C2C(C(C(O2)COP(=O)(O)O)O)O)N)C(=O)N,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,AKT inhibitor VIII,C1CN(CCC1N2C3=CC=CC=C3NC2=O)CC4=CC=C(C=C4)C5=C...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [30]:
# Worked example: compare one drug against one candidate substructure.
drug_smiles = "Cc1cc(Oc2nccc(CCC)c2)ccc1"
substructure_smiles = "c1ccc(Oc2ccccn2)cc1"

# Tanimoto similarity threshold
threshold = 0.7

# SMILES --> molecule, for both inputs
drug_molecule, substructure_molecule = (
    Chem.MolFromSmiles(smi) for smi in (drug_smiles, substructure_smiles)
)

# molecule --> 2048-bit Morgan fingerprint (radius 2)
drug_fp, substructure_fp = (
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)
    for molecule in (drug_molecule, substructure_molecule)
)

# fingerprint --> numpy array; the one-element placeholders are resized on conversion
drug_arr = np.zeros((1,))
substructure_arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(drug_fp, drug_arr)
DataStructs.ConvertToNumpyArray(substructure_fp, substructure_arr)

# Show both bit arrays, then how many bits each one has set
print(drug_arr, substructure_arr, sep="\n")
print(drug_arr.sum(), substructure_arr.sum(), sep="\n")

# Tanimoto similarity
similarity = DataStructs.TanimotoSimilarity(drug_fp, substructure_fp)
print(similarity)


[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
35.0
21.0
0.3333333333333333
