# Calculating molecular fingerprints (RDKit path-based):
- For Reference drugs
- For Training drugs

In [17]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import SimilarityMaps

## RDKit path-based fingerprint calculation: Reference Drugs

In [20]:
# Load the reference drug table; its standardized_SMILES column is the input
# for the molecule parsing in the next cell.
df_ref = pd.read_csv('gdsc_drugs+SMILES.csv')
# Bare last expression so the notebook renders the frame as a table.
df_ref

Unnamed: 0,DRUG_NAME,canonical_SMILES,standardized_SMILES
0,123138,C[CH2],[CH2]C
1,123829,CC(C[Se]CC(=O)NCCS(=O)(=O)O)C1CCC2C1(C(CC3C2C(...,CC(C[Se]CC(=O)NCCS(=O)(=O)O)C1CCC2C3C(O)CC4CC(...
2,150412,CC(CO)(CO)NCC1=CC2=C(C=C1)C3=C(C=C2)C4=CC=CC=C...,CC(CO)(CO)NCc1ccc2c(ccc3c4ccccc4ccc23)c1
3,50869,C1=CC=C(C=C1)OCCCCCCNN,NNCCCCCCOc1ccccc1
4,615590,CNC(=O)CSC1=C(NC2=CC=CC=C21)C3=CC=CS3,CNC(=O)CSc1c(-c2cccs2)[nH]c2ccccc12
...,...,...,...
202,Wee1 Inhibitor,C1=CC=C(C(=C1)C2=CC3=C(C4=C(N3)C=CC(=C4)O)C5=C...,O=C1NC(=O)c2c1c(-c1ccccc1Cl)cc1[nH]c3ccc(O)cc3c21
203,Wnt-C59,CC1=NC=CC(=C1)C2=CC=C(C=C2)CC(=O)NC3=CC=C(C=C3...,Cc1cc(-c2ccc(CC(=O)Nc3ccc(-c4cccnc4)cc3)cc2)ccn1
204,XAV939,C1CSCC2=C1N=C(NC2=O)C3=CC=C(C=C3)C(F)(F)F,O=c1[nH]c(-c2ccc(C(F)(F)F)cc2)nc2c1CSCC2
205,ZM447439,COC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=C(C=C3)NC(=O)...,COc1cc2c(Nc3ccc(NC(=O)c4ccccc4)cc3)ncnc2cc1OCC...


In [21]:
# Parse every standardized SMILES string into an RDKit Mol object. The list is
# kept aligned row-for-row with df_ref, so nothing is filtered out here.
# Iterating the column directly (instead of unpacking itertuples() into `_`)
# also avoids rebinding `_`, which IPython uses for the previous cell output.
molecules_ref = [
    Chem.MolFromSmiles(smiles) for smiles in df_ref["standardized_SMILES"]
]

# Chem.MolFromSmiles signals an unparsable SMILES by returning None rather than
# raising. Stop here and name the offending rows instead of handing None on to
# the fingerprint generator in the next cell.
unparsed_ref = [
    row for row, mol in zip(df_ref.index, molecules_ref) if mol is None
]
if unparsed_ref:
    raise ValueError(
        f"Could not parse standardized_SMILES for df_ref rows: {unparsed_ref}"
    )

In [54]:
# Build one RDKit fingerprint per reference molecule, using a generator
# configured with maxPath=7 and 3072-bit vectors.
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(
    maxPath=7,
    fpSize=3072,
)
fgrps_ref = list(map(rdkit_gen.GetFingerprint, molecules_ref))

In [55]:
# Sanity check: (number of reference molecules, bits per fingerprint).
np.shape(fgrps_ref)

(207, 3072)

In [23]:
# Count the reference fingerprints; one is generated per entry of molecules_ref,
# so this should equal the number of rows in df_ref.
nfgrps_ref = len(fgrps_ref)
print("Number of fingerprints:", nfgrps_ref)

Number of fingerprints: 207


## RDKit path-based fingerprint calculation: Training Drugs

In [5]:
# Load the training drug table; its standardized_SMILES column is the input
# for the molecule parsing in the next cell.
df_train = pd.read_csv('../../../data/gen_exp_drugs_st_SMILES.csv')
# Show the table dimensions, then preview the first rows.
print(df_train.shape)
df_train.head()

(425, 2)


Unnamed: 0,drug_name,standardized_SMILES
0,5-methoxytryptamine,COc1ccc2[nH]cc(CCN)c2c1
1,acalabrutinib,CC#CC(=O)N1CCC[C@H]1c1nc(-c2ccc(C(=O)Nc3ccccn3...
2,acemetacin,COc1ccc2c(c1)c(CC(=O)OCC(=O)O)c(C)n2C(=O)c1ccc...
3,acipimox,Cc1cnc(C(=O)O)c[n+]1[O-]
4,adapalene,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...


In [24]:
# Parse every standardized SMILES string into an RDKit Mol object. The list is
# kept aligned row-for-row with df_train, so nothing is filtered out here.
# Iterating the column directly (instead of unpacking itertuples() into `_`)
# also avoids rebinding `_`, which IPython uses for the previous cell output.
molecules_train = [
    Chem.MolFromSmiles(smiles) for smiles in df_train["standardized_SMILES"]
]

# Chem.MolFromSmiles signals an unparsable SMILES by returning None rather than
# raising. Stop here and name the offending rows instead of handing None on to
# the fingerprint generator in the next cell.
unparsed_train = [
    row for row, mol in zip(df_train.index, molecules_train) if mol is None
]
if unparsed_train:
    raise ValueError(
        f"Could not parse standardized_SMILES for df_train rows: {unparsed_train}"
    )

In [52]:
# Build one RDKit fingerprint per training molecule, using a generator
# configured with maxPath=7 and 3072-bit vectors (same settings as the
# reference drugs, so the two sets are directly comparable).
rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(
    maxPath=7,
    fpSize=3072,
)
fgrps_train = list(map(rdkit_gen.GetFingerprint, molecules_train))

In [53]:
# Sanity check: (number of training molecules, bits per fingerprint).
np.shape(fgrps_train)

(425, 3072)

In [26]:
# Count the training fingerprints; one is generated per entry of molecules_train,
# so this should equal the number of rows in df_train.
nfgrps_train = len(fgrps_train)
print("Number of fingerprints:", nfgrps_train)

Number of fingerprints: 425


## Calculate bitwise similarity - Training drugs vs Reference drugs