In [None]:
#Get Manhattan, Tanimoto distance between the bit vectors of the compounds

In [137]:
import os
import csv

import pandas as pd
import numpy as np
import enlighten

from IPython.core.display import HTML
from IPython.display import SVG, Image, display

from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit import DataStructs


In [5]:
def show(df):
    return HTML(df.to_html(notebook=True))

In [2]:
assay = 1511

In [3]:
#load sdf
sdfpath = os.path.join(os.getcwd(), f'{assay}_compounds.sdf')
dataset = PandasTools.LoadSDF(sdfpath)



In [6]:
show(dataset.head(4))

Unnamed: 0,PUBCHEM_ASSAY_ID,PUBCHEM_SID,PUBCHEM_CID,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_CANONICAL_SMILES,ID,ROMol
0,1511,56314828,1988,Inactive,CCC1=CC=CC(=C1N(COCC)C(=O)CCl)C,,
1,1511,56321651,425322,Inactive,C1CCC(CC1)NC(=O)C2COC3=CC=CC=C3O2,,
2,1511,56322145,24981957,Inactive,C1CN(CCN1CC2=CC=CC=C2)S(=O)(=O)C3=CC=CC(=C3)C(...,,
3,1511,51090143,7384392,Inactive,CCS(=O)(=O)N1CCC2(CC1)N(CCO2)S(=O)(=O)C3=CC=C(...,,


In [13]:
all_mol = dataset.copy()



In [18]:
all_mol["MACCS_keys"] = np.nan


In [24]:
#get MACCS keys and add them to the dataframe
pbar = enlighten.Counter(total=len(all_mol.index), desc='Getting MACCS keys...', unit='tick')

for index, row in all_mol.iterrows():
    mol = row.ROMol
    maccs = Chem.GetMACCSKeysFingerprint(mol)
    all_mol.loc[index, "MACCS_keys"] = maccs
    pbar.update()

In [76]:
actives = all_mol[all_mol["PUBCHEM_ACTIVITY_OUTCOME"] == "Active"]

In [44]:
active_MACCS = actives.MACCS_keys.to_list()
all_MACCS = all_mol.MACCS_keys.to_list()

In [45]:
#Setup empty matrix
tanimoto_coefficients = np.zeros([len(active_MACCS), len(all_MACCS)])
tanimoto_coefficients.shape

(1552, 305679)

In [47]:
#calculate the tanimoto coefficents for each molecule pair
TM_bar = enlighten.Counter(total=len(actives.index), desc="Calculating Tanimoto similarities...", unit='tick')

for i_ID, i in enumerate(active_MACCS):
    for j_ID, j in enumerate(all_MACCS):
        #metric is unnecessary, 
        TM_sim = DataStructs.FingerprintSimilarity(i, j, metric=DataStructs.TanimotoSimilarity)
        tanimoto_coefficients[i_ID, j_ID] = TM_sim
    TM_bar.update()


In [None]:
#setup actives dataframe
# actives = actives.assign(mol_1=np.nan, mol_2=np.nan, mol_3=np.nan, mol_4=np.nan, mol_5=np.nan)

In [138]:
cols = ['PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CANONICAL_SMILES']
new_cols = [
    'mol1', 'activity1', 'cid_1',
    'mol2', 'activity2', 'cid_2',
    'mol3', 'activity3', 'cid_3',
    'mol4', 'activity4', 'cid_4',
    'mol5', 'activity5', 'cid_5',
]
cols = cols + new_cols

In [100]:
print(all_cols)

['PUBCHEM_ASSAY_ID', 'PUBCHEM_SID', 'PUBCHEM_CID', 'PUBCHEM_ACTIVITY_OUTCOME', 'PUBCHEM_CANONICAL_SMILES', 'ID', 'ROMol', 'MACCS_keys']


In [139]:
all_cols = all_mol.columns.values.tolist()
col_indexes = [ID for ID, x in enumerate(all_cols) if x in cols]


In [None]:
# similarities = actives.copy()
# similarities.reindex(columns=cols)

In [156]:
#create CSV
headers = ['ACTIVE_COMPOUND_ID', 'ACTIVITY', 'ACTIVE_SMILES', 'CID1', 'ACTIVITY1', 'SMILES1', 'CID2', 'ACTIVITY2', 'SMILES2', 'CID3', 'ACTIVITY3', 'SMILES3', 'CID4', 'ACTIVITY4', 'SMILES4', 'CID5', 'ACTIVITY5', 'SMILES5']

# with open(f'{assay}_TM_actives.csv', 'w', newline='') as csvfile:
TM_fname = f'{assay}_TM_actives.csv'

In [157]:
csv_pbar = enlighten.Counter(total=tanimoto_coefficients.shape[0], desc=f'Writing similar molecules to {TM_fname}...', unit='ticks')

with open(TM_fname, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    for ID, i in enumerate(tanimoto_coefficients):
        mol_idx = np.argsort(i)
        mol_idx = mol_idx[::-1]
        a,b,c,d,e = mol_idx[:5]
        # actives.loc[ID]
        # show(all_mol.iloc[a]) 
        # print(all_mol.iloc[a])
        # mol1, activity1, cid_1 = all_mol.iloc[a, [col_indexes]]

        active_row = actives.iloc[ID].values.flatten().tolist()
        active_cid, activity_act, active_SMI = active_row[col_indexes[0]], active_row[col_indexes[1]], active_row[col_indexes[2]]

        # print(active_cid)
        # print(activity_act)
        # print(active_SMI)

        #I hate how dumb this looks




        row1 = all_mol.iloc[a].values.flatten().tolist()
        mol_cid1, activity1, mol_smi1 = row1[col_indexes[0]], row1[col_indexes[1]], row1[col_indexes[2]]

        row2 = all_mol.iloc[b].values.flatten().tolist()
        mol_cid2, activity2, mol_smi2 = row2[col_indexes[0]], row2[col_indexes[1]], row2[col_indexes[2]]

        row3 = all_mol.iloc[c].values.flatten().tolist()
        mol_cid3, activity3, mol_smi3 = row3[col_indexes[0]], row3[col_indexes[1]], row3[col_indexes[2]]

        row4 = all_mol.iloc[d].values.flatten().tolist()
        mol_cid4, activity4, mol_smi4 = row4[col_indexes[0]], row4[col_indexes[1]], row4[col_indexes[2]]

        row5 = all_mol.iloc[e].values.flatten().tolist()
        mol_cid5, activity5, mol_smi5 = row5[col_indexes[0]], row5[col_indexes[1]], row5[col_indexes[2]]

        line = [active_cid, activity_act, active_SMI, mol_cid1, activity1, mol_smi1, mol_cid2, activity2, mol_smi2, mol_cid3, activity3, mol_smi3, mol_cid4, activity4, mol_smi4, mol_cid5, activity5, mol_smi5]
        writer.writerow(line)
        csv_pbar.update()

In [158]:
similarities = pd.read_csv(TM_fname)

In [159]:
PandasTools.AddMoleculeColumnToFrame(similarities, 'ACTIVE_SMILES', 'ROMol_Act', includeFingerprints=True)

PandasTools.AddMoleculeColumnToFrame(similarities, 'SMILES1', 'ROMol1', includeFingerprints=True)
PandasTools.AddMoleculeColumnToFrame(similarities, 'SMILES2', 'ROMol2', includeFingerprints=True)
PandasTools.AddMoleculeColumnToFrame(similarities, 'SMILES3', 'ROMol3', includeFingerprints=True)
PandasTools.AddMoleculeColumnToFrame(similarities, 'SMILES4', 'ROMol4', includeFingerprints=True)
PandasTools.AddMoleculeColumnToFrame(similarities, 'SMILES5', 'ROMol5', includeFingerprints=True)

In [163]:
#remove unnecessary columns
comp_cols = ['ACTIVE_COMPOUND_ID','ROMol_Act', 'ACTIVITY','ROMol2','CID2' 'ACTIVITY2', 'ROMol3','CID3', 'ACTIVITY3','ROMol4','CID4', 'ACTIVITY4','ROMol5','CID5' 'ACTIVITY5']

In [164]:
comp_mols = similarities[comp_cols]

In [165]:
show(comp_mols.head(3))

Unnamed: 0,ACTIVE_COMPOUND_ID,ROMol_Act,ACTIVITY,ROMol2,ACTIVITY2,ROMol3,ACTIVITY3,ROMol4,ACTIVITY4,ROMol5,ACTIVITY5
0,17178962,,Active,,Inactive,,Inactive,,Inactive,,Inactive
1,24687034,,Active,,Inactive,,Inactive,,Inactive,,Inactive
2,24687422,,Active,,Active,,Inactive,,Inactive,,Inactive
