# Calculating Pubchem Fingerprints:
- For Reference drugs
- For Training drugs

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from rdkit import Chem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import SimilarityMaps

## Pubchem fingerprints calculation: Reference Drugs

In [2]:
df_ref = pd.read_csv('gdsc_drugs+SMILES.csv')
df_ref

Unnamed: 0,DRUG_NAME,canonical_SMILES,standardized_SMILES
0,123138,C[CH2],[CH2]C
1,123829,CC(C[Se]CC(=O)NCCS(=O)(=O)O)C1CCC2C1(C(CC3C2C(...,CC(C[Se]CC(=O)NCCS(=O)(=O)O)C1CCC2C3C(O)CC4CC(...
2,150412,CC(CO)(CO)NCC1=CC2=C(C=C1)C3=C(C=C2)C4=CC=CC=C...,CC(CO)(CO)NCc1ccc2c(ccc3c4ccccc4ccc23)c1
3,50869,C1=CC=C(C=C1)OCCCCCCNN,NNCCCCCCOc1ccccc1
4,615590,CNC(=O)CSC1=C(NC2=CC=CC=C21)C3=CC=CS3,CNC(=O)CSc1c(-c2cccs2)[nH]c2ccccc12
...,...,...,...
202,Wee1 Inhibitor,C1=CC=C(C(=C1)C2=CC3=C(C4=C(N3)C=CC(=C4)O)C5=C...,O=C1NC(=O)c2c1c(-c1ccccc1Cl)cc1[nH]c3ccc(O)cc3c21
203,Wnt-C59,CC1=NC=CC(=C1)C2=CC=C(C=C2)CC(=O)NC3=CC=C(C=C3...,Cc1cc(-c2ccc(CC(=O)Nc3ccc(-c4cccnc4)cc3)cc2)ccn1
204,XAV939,C1CSCC2=C1N=C(NC2=O)C3=CC=C(C=C3)C(F)(F)F,O=c1[nH]c(-c2ccc(C(F)(F)F)cc2)nc2c1CSCC2
205,ZM447439,COC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=C(C=C3)NC(=O)...,COc1cc2c(Nc3ccc(NC(=O)c4ccccc4)cc3)ncnc2cc1OCC...


In [3]:
molecules_ref = []
for _, smiles in df_ref[[ "standardized_SMILES"]].itertuples():
    molecules_ref.append((Chem.MolFromSmiles(smiles)))

In [4]:
# Creating fingerprints for all molecules

rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=7, fpSize=3072)
fgrps_ref = [rdkit_gen.GetFingerprint(mol) for mol in molecules_ref]

In [5]:
np.array(fgrps_ref).shape

(207, 3072)

In [51]:
df_fgrps_ref = pd.DataFrame(np.array(fgrps_ref))
df_fgrps_ref.columns = ["FP"+str(i) for i in range(np.array(fgrps_ref).shape[1])]
df_fgrps_ref.to_csv('fgrps_ref.csv', index=False)
print(df_fgrps_ref.shape)
df_fgrps_ref.head()

(207, 3072)


Unnamed: 0,FP0,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,FP3062,FP3063,FP3064,FP3065,FP3066,FP3067,FP3068,FP3069,FP3070,FP3071
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,0,1,1,0,0,0,...,1,0,1,0,0,0,1,0,0,0


In [6]:
# Calculating number of fingerprints
nfgrps_ref = len(fgrps_ref)
print("Number of fingerprints:", nfgrps_ref)

Number of fingerprints: 207


## Pubchem fingerprints calculation: Training Drugs

In [7]:
df_train = pd.read_csv('../../../data/gen_exp_drugs_st_SMILES.csv')
print(df_train.shape)
df_train.head()

(425, 2)


Unnamed: 0,drug_name,standardized_SMILES
0,5-methoxytryptamine,COc1ccc2[nH]cc(CCN)c2c1
1,acalabrutinib,CC#CC(=O)N1CCC[C@H]1c1nc(-c2ccc(C(=O)Nc3ccccn3...
2,acemetacin,COc1ccc2c(c1)c(CC(=O)OCC(=O)O)c(C)n2C(=O)c1ccc...
3,acipimox,Cc1cnc(C(=O)O)c[n+]1[O-]
4,adapalene,COc1ccc(-c2ccc3cc(C(=O)O)ccc3c2)cc1C12CC3CC(CC...


In [8]:
molecules_train = []
for _, smiles in df_train[[ "standardized_SMILES"]].itertuples():
    molecules_train.append((Chem.MolFromSmiles(smiles)))

In [9]:
# Creating fingerprints for all molecules

rdkit_gen = rdFingerprintGenerator.GetRDKitFPGenerator(maxPath=7, fpSize=3072)
fgrps_train = [rdkit_gen.GetFingerprint(mol) for mol in molecules_train]

In [10]:
np.array(fgrps_train).shape

(425, 3072)

In [50]:
df_fgrps_train = pd.DataFrame(np.array(fgrps_train))
df_fgrps_train.columns = ["FP"+str(i) for i in range(np.array(fgrps_train).shape[1])]
df_fgrps_train.to_csv('fgrps_train.csv', index=False)
print(df_fgrps_train.shape)
df_fgrps_train.head()

(425, 3072)


Unnamed: 0,FP0,FP1,FP2,FP3,FP4,FP5,FP6,FP7,FP8,FP9,...,FP3062,FP3063,FP3064,FP3065,FP3066,FP3067,FP3068,FP3069,FP3070,FP3071
0,0,0,1,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,1,1,1,1,0,0,...,0,1,1,0,0,0,1,1,0,1
2,1,0,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0,0,1,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Calculating number of fingerprints
nfgrps_train = len(fgrps_train)
print("Number of fingerprints:", nfgrps_train)

Number of fingerprints: 425


## Calculate bitwise similarity - Training drugs vs Reference drugs

In [62]:
a = np.array(([1,2,3], 
              [4,5,6], 
              [7,8,9]))

b = np.array(([1,2,3], 
              [4,5,6]))

clist = []

for i in range(a.shape[0]):
    c = np.zeros((b.shape[0], b.shape[1]))
    for j in range(b.shape[0]):
        for m in range(a.shape[1]):
            for n in range(b.shape[1]):
                if a[i][m] == b[j][n]:
                    c[j][n] = 1
    clist.append(c)
    
print(len(clist))
print(clist)

3
[array([[1., 1., 1.],
       [0., 0., 0.]]), array([[0., 0., 0.],
       [1., 1., 1.]]), array([[0., 0., 0.],
       [0., 0., 0.]])]


In [63]:
cstack = np.stack(clist)
cstack.shape

(3, 2, 3)

In [60]:
# Convert dataframes to numpy arrays for efficient computation
# Convert dataframes to numpy arrays for efficient computation
train = df_fgrps_train.values
ref = df_fgrps_ref.values

# Create a list of dataframes where each dataframe is a comparison of a row in train with all rows in ref
ssp_list = [(train[i, np.newaxis] == ref).astype(int) for i in range(train.shape[0])]

# Convert list of dataframes to a 3D numpy array
ssp_array = np.array([df.values for df in ssp_list])

print(ssp_array.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [46]:
import numpy as np

ssp_list = []

# Convert lists to NumPy arrays
fgrps_train_np = np.array(fgrps_train)
fgrps_ref_np = np.array(fgrps_ref)

for i in range(fgrps_train_np.shape[0]):
    ssp = np.equal.outer(fgrps_train_np[i], fgrps_ref_np).astype(int)
    ssp_list.append(ssp)

MemoryError: Unable to allocate 7.28 GiB for an array with shape (3072, 207, 3072) and data type int32

In [45]:
ssp_list = []

for i in range(np.array(fgrps_train).shape[0]):
    ssp = np.zeros((np.array(fgrps_ref).shape[0], np.array(fgrps_ref).shape[1]))
    for j in range(np.array(fgrps_ref).shape[0]):
        for m in range(np.array(fgrps_train).shape[1]):
            for n in range(np.array(fgrps_ref).shape[1]):
                if fgrps_train[i][m] == fgrps_ref[j][n]:
                    ssp[j][n] = 1
    ssp_list.append(ssp)
                    

KeyboardInterrupt: 

In [None]:
ssp_stack = np.stack(ssp_list)
ssp_stack.shape

In [None]:
# # Split train, test
# perm = np.arange(len(score))
# np.random.shuffle(perm)
# data = array_reg[perm]
# labels = score[perm]
TEST_SIZE = 75
ssp_train = ssp_stack[TEST_SIZE:]
# labels_train = labels[TEST_SIZE:]
ssp_test = ssp_stack[:TEST_SIZE]
# labels_test = labels[:TEST_SIZE]

In [None]:
# Save dataset
import h5py

h5f = h5py.File('ssp_data_train.h5', 'w')
h5f.create_dataset('data', data=ssp_train)
h5f.close()

# h5f = h5py.File('gene_exp_labels_train.h5', 'w')
# h5f.create_dataset('data', data=labels_train)
# h5f.close()

h5f = h5py.File('ssp_data_test.h5', 'w')
h5f.create_dataset('data', data=ssp_test)
h5f.close()

# h5f = h5py.File('gene_exp_labels_test.h5', 'w')
# h5f.create_dataset('data', data=labels_test)
# h5f.close()

In [None]:
h5f = h5py.File('ssp_data_train.h5', 'r')
ssp_train = h5f['data'][:]
# h5f2 = h5py.File('../dleps/gene_exp_labels_train.h5', 'r')
# labels_train = h5f2['data'][:]
h5f = h5py.File('ssp_data_test.h5', 'r')
ssp_test = h5f['data'][:]
# h5f2 = h5py.File('../dleps/gene_exp_labels_test.h5', 'r')
# labels_test = h5f2['data'][:]

print(ssp_train.shape)
# print(labels_train.shape)
print(ssp_test.shape)
# print(labels_test.shape)