In [1]:
# RDKit helpers: Morgan (ECFP-style) fingerprints and SMILES <-> Mol conversion.
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import MolFromSmiles, MolToSmiles
from tqdm.auto import tqdm
# Hook tqdm into pandas (adds the progress_* variants of apply/map).
tqdm.pandas()
import pandas as pd
import os
import pickle
from multiprocessing import Pool
from rdkit import RDLogger
import numpy as np
import itertools
# Disable RDKit's 'rdApp.error' log channel.
# NOTE(review): with this off, SMILES that fail to parse are presumably not reported
# on the console — confirm that silent failures are acceptable downstream.
RDLogger.DisableLog('rdApp.error')

In [2]:
# Read the four de-duplicated input tables (RNA / protein, binders / non-binders).
# The order of the paths below matches the order of the names on the left-hand side.
# Note: pickle files can execute code on load; only read files from a trusted source.
rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'data_dataset2/standardized_data/deduplicated_data/rna_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/rna_non_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/protein_binders_dedup.pkl',
        'data_dataset2/standardized_data/deduplicated_data/protein_non_binders_dedup.pkl',
    )
)


In [3]:
# Pool the per-class frames: everything together, RNA only, and protein only.
# Each pooled table gets a fresh 0..n-1 index.
rna_parts = [rna_binders_df, rna_non_binders_df]
protein_parts = [protein_binders_df, protein_non_binders_df]

all_df = pd.concat(rna_parts + protein_parts, ignore_index=True)
rna_df = pd.concat(rna_parts, ignore_index=True)
protein_df = pd.concat(protein_parts, ignore_index=True)


In [4]:

# Row counts for the pooled tables. RNA and protein totals are the sums of their
# binder and non-binder frames.
rna_total = len(rna_binders_df) + len(rna_non_binders_df)
protein_total = len(protein_binders_df) + len(protein_non_binders_df)

print(f'Total ALL: {len(all_df)}')
print(f'Total RNA: {rna_total}')
print(f'Total Protein: {protein_total}')

# Same counts broken down by originating data source:
# ROBIN supplies both RNA frames, Probes & Drugs the protein binders,
# Zinc Dark Matter the protein non-binders.
print(f'ROBIN: {rna_total}')
print(f'Probes & Drugs: {len(protein_binders_df)}')
print(f'Zinc Dark Matter: {len(protein_non_binders_df)}')




Total ALL: 51065
Total RNA: 23844
Total Protein: 27221
ROBIN: 23844
Probes & Drugs: 2276
Zinc Dark Matter: 24945


In [5]:
# Positional list of the four per-class frames
# (RNA binders, RNA non-binders, protein binders, protein non-binders).
# NOTE(review): `datasets` is rebound to a name -> frame dict in a later cell before
# this list is read anywhere in the visible notebook — confirm whether it is still needed.
datasets = [rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df]


In [6]:

# Which sources contribute repeated SMILES in the pooled table?
# `duplicated()` flags every occurrence after the first one.
is_repeat = all_df['smiles'].duplicated()
all_df.loc[is_repeat, 'source'].value_counts()




Series([], Name: count, dtype: int64)

In [7]:
# Re-derive the four per-class frames from the pooled table using the `source` label.
source_labels = all_df['source']

rna_binders_df = all_df[source_labels == 'robin_b']
rna_non_binders_df = all_df[source_labels == 'robin_n']
protein_binders_df = all_df[source_labels == 'probes_drugs']
protein_non_binders_df = all_df[source_labels == 'zinc_dark_m']

 

In [8]:
# Name -> frame mapping of the four per-class datasets, with repeated SMILES
# inside each dataset removed (the first occurrence is kept).
# NOTE(review): only the frames stored in `datasets` are de-duplicated here; the
# standalone *_df variables are not reassigned by this cell.
datasets = {
    dataset_name: frame.drop_duplicates(subset='smiles', keep='first')
    for dataset_name, frame in (
        ('rna_binders', rna_binders_df),
        ('rna_non_binders', rna_non_binders_df),
        ('protein_binders', protein_binders_df),
        ('protein_non_binders', protein_non_binders_df),
    )
}
    

# Symmetric table of cross-dataset SMILES overlaps, labelled by dataset name on both axes.
# It is created already filled with integer zeros, so:
#   * no empty object-dtype frame + fillna(0) round trip is needed, and
#   * the diagonal (a dataset is never paired with itself below) is 0 from the start.
# The former diagonal reset `df_dup_count.values[[np.arange(n)]*2] = 0` was removed on
# purpose: indexing with a list of arrays is treated by current NumPy as a single
# array index over rows, which would zero the whole table, not just the diagonal.
dataset_names = list(datasets.keys())
df_dup_count = pd.DataFrame(0, index=dataset_names, columns=dataset_names, dtype=int)

# Visit every unordered pair of datasets exactly once and count the SMILES they share.
for (name1, df1), (name2, df2) in itertools.combinations(datasets.items(), 2):
    # Rows of df1 whose SMILES also appears in df2
    duplicates = df1[df1['smiles'].isin(df2['smiles'])]
    n_shared = len(duplicates)

    # The overlap is symmetric, so fill both (name1, name2) and (name2, name1)
    df_dup_count.loc[name1, name2] = n_shared
    df_dup_count.loc[name2, name1] = n_shared

    # Print the count of duplicates
    print(f"Duplicates between {name1} and {name2}: {n_shared}")


    
# De-duplicate the pooled RNA and protein tables on SMILES, keeping the first row seen.
dedup_options = dict(subset='smiles', keep='first')
rna_df = rna_df.drop_duplicates(**dedup_options)
protein_df = protein_df.drop_duplicates(**dedup_options)



Duplicates between rna_binders and rna_non_binders: 0
Duplicates between rna_binders and protein_binders: 0
Duplicates between rna_binders and protein_non_binders: 0
Duplicates between rna_non_binders and protein_binders: 0
Duplicates between rna_non_binders and protein_non_binders: 0
Duplicates between protein_binders and protein_non_binders: 0


In [9]:
# Report how many rows each per-class frame holds.
n_rna_binders, n_rna_non_binders, n_protein_binders, n_protein_non_binders = map(
    len,
    (rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df),
)

print(f'RNA binders: {n_rna_binders}')
print(f'RNA non-binders: {n_rna_non_binders}')
print(f'Protein binders: {n_protein_binders}')
print(f'Protein non-binders: {n_protein_non_binders}')


RNA binders: 1961
RNA non-binders: 21883
Protein binders: 2276
Protein non-binders: 24945


In [10]:
# Make sure '../data/no_duplicates/v2_second' exists, creating missing parents too.
# exist_ok=True makes this a no-op when the directory is already there, so no
# separate existence check (and no check-then-create race) is needed.
os.makedirs('../data/no_duplicates/v2_second', exist_ok=True)

In [11]:
def compute_ECFP(df, smiles_column, radius, n_bits=2048):
    """Add one Morgan (ECFP-style) bit-vector fingerprint column to ``df``.

    The new column is named ``'ECFP' + str(radius * 2)``: radius 2 gives ``ECFP4``,
    radius 3 gives ``ECFP6``, radius 4 gives ``ECFP8`` and radius 5 gives ``ECFP10``.
    Only the single requested radius is computed per call; call the function once
    per radius to obtain several columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the SMILES strings. It is modified in place.
    smiles_column : str
        Name of the column in ``df`` that contains the SMILES strings.
    radius : int
        Radius passed to ``GetMorganFingerprintAsBitVect``.
    n_bits : int, optional
        Length of the bit vector. Defaults to 2048, the value that used to be
        hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the fingerprint column added. Entries whose
        value is not a string, or whose SMILES ``MolFromSmiles`` cannot parse,
        are set to ``None`` instead of raising.
    """
    def _fingerprint(smiles):
        # Missing values (None / NaN) are not SMILES; skip them rather than crash.
        if not isinstance(smiles, str):
            return None
        mol = MolFromSmiles(smiles)
        # MolFromSmiles yields None for SMILES it cannot parse; passing that on
        # to the fingerprint call would raise.
        if mol is None:
            return None
        return GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

    df['ECFP' + str(radius * 2)] = df[smiles_column].apply(_fingerprint)

    return df

In [12]:
# Stack the four per-class frames into one table with a fresh 0..n-1 index.
per_class_frames = [rna_binders_df, rna_non_binders_df, protein_binders_df, protein_non_binders_df]
combined_df = pd.concat(per_class_frames, ignore_index=True)

In [13]:
# Fingerprint every SMILES in the combined table; radius 3 produces the 'ECFP6' column.
combined_df = compute_ECFP(df=combined_df, smiles_column='smiles', radius=3)
# Preview the first two rows to confirm the new column is present.
combined_df.head(2)

Unnamed: 0,mol,source,smiles,ECFP6
0,<rdkit.Chem.rdchem.Mol object at 0x7f01bc696070>,robin_b,CC(=O)c1ccc(Br)c(N)c1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,<rdkit.Chem.rdchem.Mol object at 0x7f01686d7c40>,robin_b,O=C(NCCO)c1cc2ccccc2oc1=O,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# Split the fingerprinted table back into the four per-class frames by `source` label.
source_column = combined_df['source']

rna_binders_df = combined_df[source_column.eq('robin_b')]
rna_non_binders_df = combined_df[source_column.eq('robin_n')]
protein_binders_df = combined_df[source_column.eq('probes_drugs')]
protein_non_binders_df = combined_df[source_column.eq('zinc_dark_m')]

In [15]:
# Folder for the ECFP-annotated datasets. exist_ok=True makes the call a no-op when
# the folder already exists, so no separate existence check is needed.
ecfp_dir = 'data_dataset2/standardized_data/ECFP_datasets_set2'
os.makedirs(ecfp_dir, exist_ok=True)

# Save each per-class frame as '<ecfp_dir>/<name>_ECFP6_set2_dedup.pkl'.
# One loop replaces four copy-pasted to_pickle calls; the file names are unchanged.
ecfp_outputs = {
    'rna_binders': rna_binders_df,
    'rna_non_binders': rna_non_binders_df,
    'protein_binders': protein_binders_df,
    'protein_non_binders': protein_non_binders_df,
}
for dataset_name, frame in ecfp_outputs.items():
    frame.to_pickle(f'{ecfp_dir}/{dataset_name}_ECFP6_set2_dedup.pkl')

