### MACCS Keys

In [1]:
# MACCS keys (167 bits)
# Used for substructure searching.
# They capture essential features of a molecule by checking for the absence or presence of specific structural elements.
# The MACCS (Molecular ACCess System) keys are a set of structural keys used for substructure searching and similarity analysis.
# They encode specific chemical substructures or patterns.

In [13]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
import numpy as np

In [3]:
# Load the dataset from a CSV file located in the working directory.
CSV_PATH = 'df_fin1.csv'
df = pd.read_csv(CSV_PATH)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Molecule ChEMBL ID,Standard Value,Smiles,Receptor_class,MW,LogP,NumHAcceptors,NumHDonors,pIC50
0,0,CHEMBL257197,26.0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,active,473.562,4.0834,9.0,1.0,7.585027
1,1,CHEMBL257196,140.0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,active,445.508,3.4805,9.0,2.0,6.853872
2,2,CHEMBL256996,130.0,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,active,473.562,4.0834,9.0,1.0,6.886057
3,3,CHEMBL497697,361.0,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,active,390.426,2.8928,8.0,3.0,6.442493
4,4,CHEMBL271668,200.0,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,active,473.533,4.05334,8.0,3.0,6.69897


In [5]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Molecule ChEMBL ID', 'Standard Value', 'Smiles',
       'Receptor_class', 'MW', 'LogP', 'NumHAcceptors', 'NumHDonors', 'pIC50'],
      dtype='object')

In [6]:
# Columns to remove from the frame, one label per line.
columns_to_drop = [
    'Molecule ChEMBL ID',
    'Unnamed: 0',
    'Standard Value',
    'Receptor_class',
    'MW',
    'LogP',
    'NumHAcceptors',
    'NumHDonors',
    'pIC50',
]

In [7]:
# Remove every column listed in columns_to_drop.
# `columns=` already targets the column axis, so no separate `axis` argument is passed.
df = df.drop(columns=columns_to_drop)

In [8]:
# Preview the first five rows after the columns in columns_to_drop were removed.
df.head(n=5)

Unnamed: 0,Smiles
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...


In [9]:
# Parse each SMILES string in the 'Smiles' column into an RDKit molecule and
# store the result in a new 'mol' column (df is modified in place).
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Smiles', molCol='mol')

In [10]:
# Preview the first five rows, which now include the 'mol' column.
df.head(n=5)

Unnamed: 0,Smiles,mol
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x14214ce40>
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x14214ceb0>
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,<rdkit.Chem.rdchem.Mol object at 0x14214d000>
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,<rdkit.Chem.rdchem.Mol object at 0x14214d070>
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,<rdkit.Chem.rdchem.Mol object at 0x14214d0e0>


In [19]:
# Compute the MACCS fingerprint of every molecule in df['mol'] and collect
# each one as a NumPy array (one array per molecule, one element per bit).
df_maccs = []
for mol in df['mol']:
    maccs_bitvector = MACCSkeys.GenMACCSKeys(mol)
    # ConvertToNumpyArray resizes the destination array to the fingerprint
    # length, so an empty array is a sufficient starting point.
    my_array = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(maccs_bitvector, my_array)
    df_maccs.append(my_array)

# Expand the fingerprints into one column per bit and attach them to df.
# concat(axis=1) aligns rows on index labels, so the bit table is built on
# df's own index; a fresh RangeIndex would misalign (and NaN-pad) the rows
# whenever df has been filtered or re-indexed.
MACCS = pd.concat([df, pd.DataFrame(df_maccs, index=df.index)], axis=1)


In [None]:
# Round-trip check: rebuild an RDKit bit vector from a stored MACCS array.
# NOTE(review): `arr` was never defined in the notebook; the first molecule's
# array is used here as the example input — confirm this is the intended row.
arr = df_maccs[0]

# The stored arrays are float (0.0 / 1.0). Cast to int first so the joined
# string contains only '0' and '1' characters, one per bit.
bitstring = "".join(arr.astype(int).astype(str))
fp2 = DataStructs.cDataStructs.CreateFromBitString(bitstring)

# Indices of the bits that are set in the reconstructed fingerprint
# (the original referenced an undefined `fp` instead of `fp2`).
list(fp2.GetOnBits())

In [20]:
# Display the combined frame: the columns of df followed by one column per MACCS bit (0-166).
MACCS

Unnamed: 0,Smiles,mol,maccs,0,1,2,3,4,5,6,...,157,158,159,160,161,162,163,164,165,166
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x14214ce40>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x14214ceb0>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,<rdkit.Chem.rdchem.Mol object at 0x14214d000>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,<rdkit.Chem.rdchem.Mol object at 0x14214d070>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,<rdkit.Chem.rdchem.Mol object at 0x14214d0e0>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,<rdkit.Chem.rdchem.Mol object at 0x1422f2340>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3092,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x1422f23b0>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3093,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x1422f2420>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3094,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x1422f2490>,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [12]:
# Store each molecule's MACCS array in a single 'maccs' column of df.
# NOTE(review): the ValueError recorded under this cell reports 0 values, i.e.
# df_maccs was still empty when the cell ran — presumably it was executed
# before the fingerprint loop; confirm with Restart Kernel -> Run All.
df['maccs'] = df_maccs

ValueError: Length of values (0) does not match length of index (3096)