In [1]:

from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import MolFromSmiles, MolToSmiles
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import os
import pickle
from multiprocessing import Pool
from rdkit import RDLogger
import numpy as np
import itertools
RDLogger.DisableLog('rdApp.error')
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

In [2]:
# Read the five pickled dataset frames from the ECFP6_v2 diversity-picking
# directory; the names on the left line up one-to-one with the paths below.
enamine_protein, chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/diversity_picking/ECFP6_v2/enamine_protein_ECFP6_MaxMin_v3.pkl',
        '../data/diversity_picking/ECFP6_v2/chemdiv_rna_ECFP6_MaxMin_v3.pkl',
        '../data/diversity_picking/ECFP6_v2/enamine_rna_ECFP6_MaxMin_v3.pkl',
        '../data/diversity_picking/ECFP6_v2/life_chemicals_rna_ECFP6_MaxMin_v3.pkl',
        '../data/diversity_picking/ECFP6_v2/robin_rna_ECFP6_MaxMin_v3.pkl',
    )
]


In [3]:
# Display the column labels of the enamine_protein frame (shown as this cell's output).
enamine_protein.columns

Index(['mol', 'smiles', 'ECFP6', 'source'], dtype='object')

In [4]:
# Preview the first row of enamine_protein as loaded from the pickle.
enamine_protein.head(1)

Unnamed: 0,mol,smiles,ECFP6,source
170822,<rdkit.Chem.rdchem.Mol object at 0x7f4ce71dec50>,Cc1nnc(NS(=O)(=O)c2ccc3c(c2)CCC3)s1,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",enamine_protein


In [5]:
# Report (rows, columns) for every loaded dataset, one line per dataset.
for dataset_label, dataset in (
    ('Enamine Protein:', enamine_protein),
    ('ChemDiv RNA:', chemdiv_rna),
    ('Enamine RNA:', enamine_rna),
    ('LifeChemicals RNA:', life_chemicals_rna),
    ('Robin RNA:', robin_rna),
):
    print(dataset_label, dataset.shape)


Enamine Protein: (38710, 4)
ChemDiv RNA: (19908, 4)
Enamine RNA: (11502, 4)
LifeChemicals RNA: (5308, 4)
Robin RNA: (1992, 4)


In [6]:
# Gather the five dataset frames in one list so a single loop can process all of them.
# The list holds references to the same DataFrame objects, not copies.
df_list = [enamine_protein, chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna]


In [7]:
# Remove the stored 'ECFP6' column from every frame in df_list.
# The drop is done in place so the module-level names (enamine_protein, ...)
# keep referring to the updated objects.
# errors='ignore' makes this cell safe to re-run: a frame that no longer has
# the column is left unchanged instead of raising KeyError.
for df in df_list:
    df.drop(columns=['ECFP6'], inplace=True, errors='ignore')
    

In [8]:
def compute_ECFP_with_bit_info(df, smiles_column):
    """Add Morgan (radius 3, 2048-bit) fingerprints and their bit info to ``df``.

    Every SMILES string in ``df[smiles_column]`` is parsed with RDKit and
    fingerprinted, and two columns are written to ``df``:

    * ``'ecfp6'`` -- the fingerprint as a string of '0'/'1' characters
      (the result of ``ToBitString()``).
    * ``'bit_info_map'`` -- a dict copied from the ``bitInfo`` mapping that
      RDKit fills in, with every value converted to a list.

    Args:
        df: DataFrame to annotate. It is modified in place.
        smiles_column: Name of the column that holds the SMILES strings.

    Returns:
        The same ``df`` object, with the two columns added (or overwritten).
        An empty frame simply gains two empty columns.

    Raises:
        ValueError: If a SMILES string cannot be parsed into a molecule.
    """
    def get_fingerprint_and_bit_info(smiles):
        """Return ``(bit_string, bit_info_dict)`` for a single SMILES string."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Fail with the offending input instead of letting the fingerprint
            # call reject a None molecule with an opaque error.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        # Use a fresh dict per molecule so one molecule's entries can never end
        # up in another's map, whether or not the library resets the dict it
        # is handed.
        bit_info_map = {}
        # Calculate ECFP6 with radius=3 and capture bit info
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=2048, bitInfo=bit_info_map)
        # Convert the bit info values to plain lists for storage
        bit_info_storable = {bit: list(info) for bit, info in bit_info_map.items()}
        return fp.ToBitString(), bit_info_storable

    # Compute everything first, then assign whole columns at once; this avoids
    # building a pd.Series per row and also works when df has no rows.
    results = [get_fingerprint_and_bit_info(smiles) for smiles in df[smiles_column]]
    df['ecfp6'] = [bit_string for bit_string, _ in results]
    df['bit_info_map'] = [bit_info for _, bit_info in results]

    return df



In [9]:
# Run the fingerprint + bit-info computation on all five datasets, reading the
# SMILES from the 'smiles' column, and rebind each name to the returned frame.
enamine_protein, chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna = [
    compute_ECFP_with_bit_info(dataset, 'smiles')
    for dataset in (enamine_protein, chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna)
]

In [10]:
# Preview the first row of enamine_protein after the fingerprint computation.
enamine_protein.head(1)

Unnamed: 0,mol,smiles,source,ecfp6,bit_info_map
170822,<rdkit.Chem.rdchem.Mol object at 0x7f4ce71dec50>,Cc1nnc(NS(=O)(=O)c2ccc3c(c2)CCC3)s1,enamine_protein,0000100001000000000000000000000000000000000000...,"{4: [(6, 2)], 9: [(18, 2)], 162: [(18, 1)], 16..."


In [11]:
# Label every dataset with an 'rna' column:
# 0 for enamine_protein, 1 for each of the four *_rna frames.
enamine_protein['rna'] = 0
for rna_frame in (chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna):
    rna_frame['rna'] = 1

In [12]:
# Save the datasets (with fingerprint, bit info and 'rna' label) as pickle
# files under the relative 'data_for_ml' directory.
# Create the directory first so the cell also works on a fresh checkout;
# exist_ok=True keeps it safe to re-run.
os.makedirs('data_for_ml', exist_ok=True)
enamine_protein.to_pickle('data_for_ml/enamine_protein_df_ml.pkl')
chemdiv_rna.to_pickle('data_for_ml/chemdiv_rna_df_ml.pkl')
enamine_rna.to_pickle('data_for_ml/enamine_rna_df_ml.pkl')
life_chemicals_rna.to_pickle('data_for_ml/life_chemicals_rna_df_ml.pkl')
robin_rna.to_pickle('data_for_ml/robin_rna_df_ml.pkl')

In [13]:
# Read the five saved frames back from 'data_for_ml'; the names on the left
# line up one-to-one with the paths below.
enamine_protein, chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data_for_ml/enamine_protein_df_ml.pkl',
        'data_for_ml/chemdiv_rna_df_ml.pkl',
        'data_for_ml/enamine_rna_df_ml.pkl',
        'data_for_ml/life_chemicals_rna_df_ml.pkl',
        'data_for_ml/robin_rna_df_ml.pkl',
    )
]

# Stack the four RNA frames into one (original row labels are kept) and
# display the combined (rows, columns) shape.
rna_df = pd.concat((chemdiv_rna, enamine_rna, life_chemicals_rna, robin_rna))
rna_df.shape


(38710, 6)