In [1]:

from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import MolFromSmiles, MolToSmiles
from tqdm.auto import tqdm
tqdm.pandas()
import pandas as pd
import os
import pickle
from multiprocessing import Pool
from rdkit import RDLogger
import numpy as np
import itertools
RDLogger.DisableLog('rdApp.error')
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd

In [2]:


# Read the four deduplicated datasets; the order of the paths matches the
# order of the names being assigned.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling,
# so these files should only come from a trusted pipeline.
rna_binders, rna_non_binders, protein_binders, protein_non_binders = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data_dataset2/standardized_data/deduplicated_data/rna_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/rna_non_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/protein_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/protein_non_binders_dedup.pkl',
    )
)


In [3]:
# Stack the four datasets and display any row whose SMILES already appeared
# earlier in the stacked frame (an empty result means no SMILES is shared).
combined = pd.concat(
    [rna_binders, rna_non_binders, protein_binders, protein_non_binders]
)
combined[combined.duplicated(subset='smiles')]

Unnamed: 0,mol,source,smiles


In [4]:
# Display rows whose SMILES contains a literal "." -- presumably to catch
# multi-fragment entries (SMILES uses "." between disconnected components).
# regex=False matches the character itself, which avoids the invalid '\.'
# escape sequence that non-raw string literals warn about.
combined[combined.smiles.str.contains('.', regex=False)]

Unnamed: 0,mol,source,smiles


In [5]:
# Column index of each of the four dataframes, in loading order
tuple(
    frame.columns
    for frame in (rna_binders, rna_non_binders, protein_binders, protein_non_binders)
)

(Index(['mol', 'source', 'smiles'], dtype='object'),
 Index(['mol', 'source', 'smiles'], dtype='object'),
 Index(['mol', 'source', 'smiles'], dtype='object'),
 Index(['mol', 'source', 'smiles'], dtype='object'))

In [6]:
# Report the (rows, columns) shape of every dataset, one line per dataset
print('\n'.join(
    f'{caption} {frame.shape}'
    for caption, frame in (
        ('RNA binders:', rna_binders),
        ('RNA non-binders:', rna_non_binders),
        ('Probes & Drugs:', protein_binders),
        ('Zinc Dark Matter:', protein_non_binders),
    )
))


RNA binders: (1961, 3)
RNA non-binders: (21883, 3)
Probes & Drugs: (2276, 3)
Zinc Dark Matter: (24945, 3)


In [7]:
# Group the four dataframes in one list (same objects, not copies)
df_list = [
    rna_binders,
    rna_non_binders,
    protein_binders,
    protein_non_binders,
]


In [8]:
def compute_ECFP_with_bit_info(df, smiles_column):
    """Add Morgan (radius 3, 2048-bit) fingerprints and their bit info to ``df``.

    Two columns are written to ``df`` in place:

    * ``'ecfp6'``: the fingerprint as a bit string (``fp.ToBitString()``).
    * ``'bit_info_map'``: a dict mapping each set bit to a list of the entries
      RDKit reported for that bit through its ``bitInfo`` argument.

    Args:
        df: DataFrame holding one molecule per row. It is modified in place.
        smiles_column: Name of the column in ``df`` that holds SMILES strings.

    Returns:
        The same ``df`` object, with the two columns added (or overwritten).

    Raises:
        ValueError: If a SMILES string cannot be parsed by RDKit. The message
            names the offending string.
    """

    def get_fingerprint_and_bit_info(smiles):
        """Return ``(bit_string, bit_info)`` for a single SMILES string."""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending string instead of inside the RDKit call.
            raise ValueError(f"Could not parse SMILES: {smiles!r}")
        # A fresh dict per molecule, so the stored bit info never depends on
        # RDKit resetting a dict shared between calls.
        bit_info_map = {}
        # Calculate ECFP6 with radius=3 and capture bit info
        fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=3, nBits=2048, bitInfo=bit_info_map
        )
        # Copy the per-bit entries into plain lists for storage
        bit_info_storable = {bit: list(info) for bit, info in bit_info_map.items()}
        return fp.ToBitString(), bit_info_storable

    # One pass over the SMILES column; this also handles an empty dataframe,
    # where a row-wise apply yields no result columns to assign.
    results = [get_fingerprint_and_bit_info(smiles) for smiles in df[smiles_column]]

    # Object-dtype Series on df's own index keep each row's result in place
    # and store the dicts as-is.
    df['ecfp6'] = pd.Series(
        [bit_string for bit_string, _ in results], index=df.index, dtype=object
    )
    df['bit_info_map'] = pd.Series(
        [bit_info for _, bit_info in results], index=df.index, dtype=object
    )

    return df



In [9]:
# Add the 'ecfp6' and 'bit_info_map' columns to each of the four datasets,
# reading the SMILES from the 'smiles' column.
(
    rna_binders,
    rna_non_binders,
    protein_binders,
    protein_non_binders,
) = (
    compute_ECFP_with_bit_info(frame, 'smiles')
    for frame in (rna_binders, rna_non_binders, protein_binders, protein_non_binders)
)


In [10]:
# Binary class labels: RNA binders are the only positive class; the other
# three datasets all serve as negatives in the models built further down.
rna_binders['label'] = 1  # Positive class: RNA binders
rna_non_binders['label'] = 0  # Negative class: RNA non-binders
protein_binders['label'] = 0  # Negative class: protein binders
protein_non_binders['label'] = 0  # Negative class: protein non-binders

In [11]:
# Create the output directories for the three model datasets
# (exist_ok=True makes re-running this cell safe)
os.makedirs('data_for_ml/ml_datasets_set2', exist_ok=True)
os.makedirs('data_for_ml/ml_datasets_set2/model1_rna_bin_non_rna_bin', exist_ok=True)
os.makedirs('data_for_ml/ml_datasets_set2/model2_rna_bin_protein_bin', exist_ok=True)
os.makedirs('data_for_ml/ml_datasets_set2/model3_binders_nonbinders', exist_ok=True)

In [12]:
# Draw a fixed-size random sample from each negative dataset.
# The three sizes add up to 654 + 653 + 654 = 1961 rows.
protein_binders_sampled, rna_non_binders_sampled, protein_non_binders_sampled = (
    frame.sample(n=sample_size, random_state=42)
    for frame, sample_size in (
        (protein_binders, 654),
        (rna_non_binders, 653),
        (protein_non_binders, 654),
    )
)



In [13]:
# Report how many rows each sample holds, one line per sample
print("\n".join(
    f"{caption} {len(frame)}"
    for caption, frame in (
        ("Protein Binders Sampled:", protein_binders_sampled),
        ("RNA Non-Binders Sampled:", rna_non_binders_sampled),
        ("Protein Non-Binders Sampled:", protein_non_binders_sampled),
    )
))


Protein Binders Sampled: 654
RNA Non-Binders Sampled: 653
Protein Non-Binders Sampled: 654


In [14]:
# Balance the datasets: every model pairs all RNA binders (label 1) with an
# equally sized set of negatives (label 0).
n_rna_binders = len(rna_binders)

# Model 3 negatives: split the quota as evenly as possible across the three
# negative sources. Any remainder goes to protein non-binders first, then to
# protein binders (1961 -> 653 / 654 / 654).
# The samples are drawn here instead of reusing the *_sampled names from an
# earlier cell: those names are rebound to full-size samples just below, so
# re-running this cell would otherwise concatenate three full-size samples.
base_quota, leftover = divmod(n_rna_binders, 3)
non_binders_sampled = pd.concat([
    rna_non_binders.sample(n=base_quota, random_state=42),
    protein_non_binders.sample(n=base_quota + int(leftover >= 1), random_state=42),
    protein_binders.sample(n=base_quota + int(leftover >= 2), random_state=42),
])
non_binders_sampled = non_binders_sampled.sample(frac=1, random_state=42).reset_index(drop=True)
assert len(non_binders_sampled) == n_rna_binders, "model 3 negatives must match the RNA binder count"

# Full-size negative samples for models 1 and 2 (one negative per RNA binder)
rna_non_binders_sampled = rna_non_binders.sample(n=n_rna_binders, random_state=42)
protein_binders_sampled = protein_binders.sample(n=n_rna_binders, random_state=42)
protein_non_binders_sampled = protein_non_binders.sample(n=n_rna_binders, random_state=42)


# For Model 1: RNA-Binders vs. Non-RNA-Binders
rna_model1_df = pd.concat([rna_binders, rna_non_binders_sampled]).reset_index(drop=True)
rna_model1_df = rna_model1_df.sample(frac=1, random_state=42)
rna_model1_df.to_pickle('data_for_ml/ml_datasets_set2/model1_rna_bin_non_rna_bin/data.pkl')

# For Model 2: RNA-Binders vs. Protein-Binders
rna_model2_df = pd.concat([rna_binders, protein_binders_sampled]).reset_index(drop=True)
rna_model2_df = rna_model2_df.sample(frac=1, random_state=42)
rna_model2_df.to_pickle('data_for_ml/ml_datasets_set2/model2_rna_bin_protein_bin/data.pkl')

# For Model 3: Binders vs. Non-Binders  (RNA binders vs RNA non-binders, protein binders, protein non-binders)
# NOTE(review): unlike models 1 and 2, the rows are not shuffled after the
# concat, so all RNA binders come first in the saved frame -- confirm that
# downstream splitting shuffles.
rna_model3_df = pd.concat([rna_binders, non_binders_sampled]).reset_index(drop=True)
rna_model3_df.to_pickle('data_for_ml/ml_datasets_set2/model3_binders_nonbinders/data.pkl')


print("All datasets have been balanced and saved.")


All datasets have been balanced and saved.


In [15]:
# Row count of the combined negative set used for model 3
print("Total Non-Binders Sampled:", non_binders_sampled.shape[0])

Total Non-Binders Sampled: 1961


In [16]:
# (rows, columns) of the three model datasets, in model order
tuple(frame.shape for frame in (rna_model1_df, rna_model2_df, rna_model3_df))

((3922, 6), (3922, 6), (3922, 6))

In [17]:
# Class balance (count per label value) of the three model datasets
tuple(
    frame['label'].value_counts()
    for frame in (rna_model1_df, rna_model2_df, rna_model3_df)
)

(label
 0    1961
 1    1961
 Name: count, dtype: int64,
 label
 0    1961
 1    1961
 Name: count, dtype: int64,
 label
 1    1961
 0    1961
 Name: count, dtype: int64)