### MACCS Keys

In [1]:
# The MACCS (Molecular ACCess System) keys are a 167-bit fingerprint
# used for substructure searching and similarity analysis.
# Each bit encodes a specific chemical substructure or pattern.
# The fingerprint captures essential features of a molecule by recording the absence or presence of specific structural elements.


In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import MACCSkeys
from rdkit import DataStructs
import numpy as np

In [3]:
# Load the dataset from a CSV file located in the current working directory.
df = pd.read_csv('df_fin1.csv')

In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Molecule ChEMBL ID,Standard Value,Smiles,Receptor_class,MW,LogP,NumHAcceptors,NumHDonors,pIC50
0,0,CHEMBL257197,26.0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,active,473.562,4.0834,9.0,1.0,7.585027
1,1,CHEMBL257196,140.0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,active,445.508,3.4805,9.0,2.0,6.853872
2,2,CHEMBL256996,130.0,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,active,473.562,4.0834,9.0,1.0,6.886057
3,3,CHEMBL497697,361.0,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,active,390.426,2.8928,8.0,3.0,6.442493
4,4,CHEMBL271668,200.0,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,active,473.533,4.05334,8.0,3.0,6.69897


In [5]:
# List the column names of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Molecule ChEMBL ID', 'Standard Value', 'Smiles',
       'Receptor_class', 'MW', 'LogP', 'NumHAcceptors', 'NumHDonors', 'pIC50'],
      dtype='object')

In [6]:
# Columns that are not needed for fingerprint generation; only 'Smiles'
# is kept once these are dropped.
columns_to_drop = [
    'Molecule ChEMBL ID',
    'Unnamed: 0',
    'Standard Value',
    'Receptor_class',
    'MW',
    'LogP',
    'NumHAcceptors',
    'NumHDonors',
    'pIC50',
]

In [7]:
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
# NOTE: this rebinds `df`; re-running the cell on the already-trimmed frame
# raises KeyError because drop() defaults to errors='raise'.
df = df.drop(columns=columns_to_drop)

In [8]:
# Check that only the 'Smiles' column is left after the drop.
df.head()

Unnamed: 0,Smiles
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...


In [9]:
# Build an RDKit molecule from every SMILES string and store it in a new
# 'mol' column; the frame is modified in place (nothing is reassigned).
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Smiles', molCol='mol')

In [10]:
# Confirm that the new 'mol' column is present.
df.head()

Unnamed: 0,Smiles,mol
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x137bd9000>
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x137bd9070>
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,<rdkit.Chem.rdchem.Mol object at 0x137bd91c0>
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,<rdkit.Chem.rdchem.Mol object at 0x137bd9230>
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,<rdkit.Chem.rdchem.Mol object at 0x137bd92a0>


In [11]:
# Compute one MACCS fingerprint per molecule and collect them as numeric rows.
df_maccs = []
for mol in df['mol']:
    maccs_bitvector = MACCSkeys.GenMACCSKeys(mol)
    # ConvertToNumpyArray resizes the target to the fingerprint length, so an
    # empty float array is enough as the starting buffer.
    my_array = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(maccs_bitvector, my_array)
    df_maccs.append(my_array)

# pd.concat(axis=1) aligns rows on the index. Reuse df's index for the
# fingerprint frame so each fingerprint stays attached to its own molecule
# even if df does not carry a default 0..n-1 index (e.g. after row filtering).
MACCS = pd.concat([df, pd.DataFrame(df_maccs, index=df.index)], axis=1)


In [12]:
# Display the combined frame: the original columns followed by one column
# per fingerprint bit (labelled 0-166).
MACCS

Unnamed: 0,Smiles,mol,0,1,2,3,4,5,6,7,...,157,158,159,160,161,162,163,164,165,166
0,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4nccs4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x137bd9000>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)...,<rdkit.Chem.rdchem.Mol object at 0x137bd9070>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,C[C@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c...,<rdkit.Chem.rdchem.Mol object at 0x137bd91c0>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,CN/N=C/c1c(N)ncnc1Nc1ccc2c(cnn2Cc2cccc(F)c2)c1,<rdkit.Chem.rdchem.Mol object at 0x137bd9230>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0
4,Cc1ccc(Oc2ccc(Nc3ncnc4cccc(O[C@H](C)C(=O)NCCO)...,<rdkit.Chem.rdchem.Mol object at 0x137bd92a0>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3091,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,<rdkit.Chem.rdchem.Mol object at 0x137d82490>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3092,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x137d82500>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3093,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x137d82570>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3094,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x137d825e0>,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


### Getting On Bits

Let's take one SMILES string from the dataframe as an example: the molecule at position 1 (the second row, since `iloc` is zero-based). I will use `iloc` here.

In [13]:
# Show SMILES strings in full instead of truncating them with '...'.
pd.set_option('display.max_colwidth', None)  # Display full column width

# iloc is zero-based: 1:2 selects the row at position 1 (the second molecule)
# and 0:1 keeps only the first column ('Smiles'). Slicing on both axes keeps
# the result a one-cell DataFrame rather than a plain string.
first_smiles_df = df.iloc[1:2, 0:1]
print(first_smiles_df)

                                                     Smiles
1  C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c12)C(N)=O


It can also be done this way:

In [14]:
# Same molecule as a plain Python string: take the 'Smiles' column, then the
# entry at position 1.
smiles_column = df['Smiles']
first_smiles = smiles_column.iloc[1]
first_smiles

'C[C@@H](Oc1cccc2ncnc(Nc3ccc4c(cnn4Cc4cscn4)c3)c12)C(N)=O'

In [15]:
# Parse the SMILES held in `first_smiles` instead of a pasted copy of it, so
# the example molecule always matches what was read from the dataframe.
sample = Chem.MolFromSmiles(first_smiles)
# MolFromSmiles returns None (it does not raise) when the string cannot be
# parsed; fail here with a clear message rather than later in GenMACCSKeys.
if sample is None:
    raise ValueError(f"Could not parse SMILES: {first_smiles!r}")

In [16]:
# Generate the MACCS keys fingerprint (a bit vector) for the example molecule.
maccs_sample = MACCSkeys.GenMACCSKeys(sample)

In [17]:
# Expand the fingerprint into a plain Python list of 0/1 values, one entry per bit.
list(maccs_sample)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0]

In [18]:
# Total number of bits in the fingerprint.
len(maccs_sample)

167

In [19]:
# Collect the indices of the bits that are set to 1 (the "on" bits).
maccs_on = list(maccs_sample.GetOnBits())

In [20]:
# Show the on-bit indices.
maccs_on

[36,
 38,
 47,
 52,
 65,
 72,
 75,
 77,
 79,
 80,
 83,
 84,
 88,
 92,
 94,
 95,
 96,
 97,
 98,
 100,
 101,
 105,
 110,
 111,
 113,
 117,
 120,
 121,
 122,
 124,
 125,
 126,
 127,
 131,
 133,
 135,
 137,
 142,
 143,
 145,
 148,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165]

In [21]:
# Count how many bits are on for this molecule.
len(maccs_on)

56

There are 56 on bits for this molecule.