In [1]:
import os
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.error')
from rdkit import DataStructs
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()


# create directory for saving similarity matrices
if not os.path.exists('../output/distance_matrices/v2'):
    os.makedirs('../output/distance_matrices/v2')

    
n_protein = 8000
n_rna = 2000





In [2]:
enamine_protein_div6 = pd.read_pickle('../data/diversity_picking/ECFP6_v2/enamine_protein_ECFP6_MaxMin_v3.pkl').head(n_protein)
chemdiv_div6 = pd.read_pickle('../data/diversity_picking/ECFP6_v2/chemdiv_rna_ECFP6_MaxMin_v3.pkl').head(n_rna)
enamine_div6 = pd.read_pickle('../data/diversity_picking/ECFP6_v2/enamine_rna_ECFP6_MaxMin_v3.pkl').head(n_rna)
life_chemicals_div6 = pd.read_pickle('../data/diversity_picking/ECFP6_v2/life_chemicals_rna_ECFP6_MaxMin_v3.pkl').head(n_rna)
robin_df_div6 = pd.read_pickle('../data/diversity_picking/ECFP6_v2/robin_rna_ECFP6_MaxMin_v3.pkl').head(n_rna)



In [3]:
# Length of all dataframes
print(len(enamine_protein_div6), len(chemdiv_div6), len(enamine_div6), len(life_chemicals_div6), len(robin_df_div6))

8000 2000 2000 2000 1992


In [4]:
# Combine datasets into one DataFrame
combined_df = pd.concat([enamine_protein_div6, chemdiv_div6, enamine_div6, life_chemicals_div6, robin_df_div6])

# Sort datasets by size (largest to smallest)
datasets = [enamine_protein_div6, chemdiv_div6, enamine_div6, life_chemicals_div6, robin_df_div6]
sorted_datasets = sorted(datasets, key=lambda x: x.shape[0], reverse=True)

# Finding all unique duplicates across datasets
all_duplicates = combined_df[combined_df.duplicated('smiles', keep=False)]

# Remove duplicates in order from largest to smallest, except the smallest
for i in range(len(sorted_datasets) - 1):  # Skip the smallest dataset
    dataset = sorted_datasets[i]
    duplicates_to_remove = all_duplicates[~all_duplicates['source'].isin([dataset['source'].iloc[0]])]['smiles']
    sorted_datasets[i] = dataset[~dataset['smiles'].isin(duplicates_to_remove)]

# Extracting the updated datasets
enamine_protein_div6, chemdiv_div6, enamine_div6, life_chemicals_div6, robin_df_div6 = sorted_datasets

# Now the datasets are deduplicated in the desired order

In [5]:
combined_df = pd.concat([
    enamine_protein_div6, chemdiv_div6, enamine_div6, 
    life_chemicals_div6, robin_df_div6
])

# Finding duplicates in the 'SMILES' column
duplicates = combined_df[combined_df.duplicated('smiles', keep=False)]

# Creating a summary of duplicates between each pair of datasets
summary = {}
for dataset1 in combined_df['source'].unique():
    for dataset2 in combined_df['source'].unique():
        if dataset1 != dataset2:
            pair = tuple(sorted([dataset1, dataset2]))
            if pair not in summary:
                duplicates_in_pair = duplicates[
                    (duplicates['source'] == dataset1) | 
                    (duplicates['source'] == dataset2)
                ]
                count = duplicates_in_pair['smiles'].nunique()
                summary[pair] = count

# Displaying the summary
for pair, count in summary.items():
    print(f"Duplicates between {pair[0]} and {pair[1]}: {count}")

Duplicates between chemdiv and enamine_protein: 0
Duplicates between enamine and enamine_protein: 0
Duplicates between enamine_protein and life_chemicals: 0
Duplicates between enamine_protein and robin: 0
Duplicates between chemdiv and enamine: 0
Duplicates between chemdiv and life_chemicals: 0
Duplicates between chemdiv and robin: 0
Duplicates between enamine and life_chemicals: 0
Duplicates between enamine and robin: 0
Duplicates between life_chemicals and robin: 0


In [6]:

# Combine the dataframes
combined_df6 = pd.concat([enamine_protein_div6, chemdiv_div6, enamine_div6, life_chemicals_div6, robin_df_div6], ignore_index=True)


n = combined_df6.shape[0]
ecfp_column = ['ECFP6']
df_list = []

for ecfp_col in tqdm(ecfp_column, desc="Calculating distance matrices"):
    if ecfp_col == 'ECFP6':
        combined_df = combined_df6

    distance_matrix = np.zeros((n, n))

    for i in range(n):
        if i % 100 == 0:
            print(i)
        for j in range(i, n):
            distance_matrix[i, j] = 1 - DataStructs.TanimotoSimilarity(combined_df[ecfp_col][i], combined_df[ecfp_col][j])
            distance_matrix[j, i] = distance_matrix[i, j]

    df = pd.DataFrame({'mol': combined_df['mol'], 'source': combined_df['source'], ecfp_col: distance_matrix.tolist()})
    df_list.append(df)

    file_name = f'../output/distance_matrices/v2/distance_matrix_{ecfp_col}.pkl'
    df.to_pickle(file_name)

# Concatenate the DataFrames for all ECFP columns
combined_df = pd.concat(df_list, axis=1)

Calculating distance matrices:   0%|          | 0/1 [00:00<?, ?it/s]

0


100


200


300


400


500


600


700


800


900


1000


1100


1200


1300


1400


1500


1600


1700


1800


1900


2000


2100


2200


2300


2400


2500


2600


2700


2800


2900


3000


3100


3200


3300


3400


3500


3600


3700


3800


3900


4000


4100


4200


4300


4400


4500


4600


4700


4800


4900


5000


5100


5200


5300


5400


5500


5600


5700


5800


5900


6000


6100


6200


6300


6400


6500


6600


6700


6800


6900


7000


7100


7200


7300


7400


7500


7600


7700


7800


7900


8000


8100


8200


8300


8400


8500


8600


8700


8800


8900


9000


9100


9200


9300


9400


9500


9600


9700


9800


9900


10000


10100


10200


10300


10400


10500


10600


10700


10800


10900


11000


11100


11200


11300


11400


11500


11600


11700


11800


11900


12000


12100


12200


12300


12400


12500


12600


12700


12800


12900


13000


13100


13200


13300


13400


13500


13600


13700


13800


13900


14000


14100


14200


14300


14400


14500


14600


14700


14800


14900


15000


15100


15200


15300


15400


15500


15600


15700


15800
15900


In [7]:
# load pickle
dist_mat_ = pd.read_pickle('../output/distance_matrices/v2/distance_matrix_ECFP6.pkl')
dist_mat_

Unnamed: 0,mol,source,ECFP6
0,<rdkit.Chem.rdchem.Mol object at 0x7f8e58ddfdd0>,enamine_protein,"[0.0, 0.9895833333333334, 0.9705882352941176, ..."
1,<rdkit.Chem.rdchem.Mol object at 0x7f8e58ddd210>,enamine_protein,"[0.9895833333333334, 0.0, 0.968421052631579, 0..."
2,<rdkit.Chem.rdchem.Mol object at 0x7f8e58ddd940>,enamine_protein,"[0.9705882352941176, 0.968421052631579, 0.0, 0..."
3,<rdkit.Chem.rdchem.Mol object at 0x7f8e58ddc900>,enamine_protein,"[0.9655172413793104, 0.9727272727272728, 0.965..."
4,<rdkit.Chem.rdchem.Mol object at 0x7f8e58ddc720>,enamine_protein,"[0.962962962962963, 0.9603960396039604, 0.9633..."
...,...,...,...
15987,<rdkit.Chem.rdchem.Mol object at 0x7f8be2f4ce50>,robin,"[0.9189189189189189, 0.9819819819819819, 0.947..."
15988,<rdkit.Chem.rdchem.Mol object at 0x7f8be2f4cea0>,robin,"[0.9285714285714286, 0.9421487603305785, 0.961..."
15989,<rdkit.Chem.rdchem.Mol object at 0x7f8be2f4cef0>,robin,"[0.937007874015748, 0.9508196721311475, 0.9774..."
15990,<rdkit.Chem.rdchem.Mol object at 0x7f8be2f4cf40>,robin,"[0.9316239316239316, 0.9464285714285714, 0.958..."
