In [1]:
import pandas as pd
pd.set_option('mode.chained_assignment', None)
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import os 
from rdkit.Chem import Lipinski
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors, MolToSmiles, QED, MolFromSmiles

import sys
sys.path.append("../sfi")  # Add the parent directory (project root) to the path
from logd_predictor_clean import LogDPredictor

In [2]:

# Load the four diversity-picked compound tables (ECFP6, set 2, per the file names).
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so these
# files must come from a trusted source.
(
    rna_binders,
    rna_non_binders,
    protein_binders,
    protein_non_binders,
) = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../data/diversity_picking/ECFP6_set2/rna_binders_ECFP6_set2_diversity_picked.pkl',
        '../data/diversity_picking/ECFP6_set2/rna_non_binders_ECFP6_set2_diversity_picked.pkl',
        '../data/diversity_picking/ECFP6_set2/protein_binders_ECFP6_set2_diversity_picked.pkl',
        '../data/diversity_picking/ECFP6_set2/protein_non_binders_ECFP6_set2_diversity_picked.pkl',
    )
)

In [3]:
# Create the output folder for the property analysis.
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of testing os.path.exists first.
os.makedirs('../output/set2/property_analysis', exist_ok=True)



In [4]:
# Concatenate the four tables into one frame. The pickled 'mol' column is
# dropped and rebuilt from 'smiles' below, so 'mol' ends up as the last column.
combined_df = pd.concat([
    rna_binders,
    rna_non_binders,
    protein_binders,
    protein_non_binders
]).drop(columns=['mol'])

# Recompute the molecule objects from the SMILES strings.
combined_df['mol'] = combined_df['smiles'].apply(MolFromSmiles)

# All rows whose 'smiles' value occurs more than once anywhere in the combined
# frame (kept as a convenience for inspection).
duplicates = combined_df[combined_df.duplicated('smiles', keep=False)]

# Distinct SMILES per dataset label, used to find SMILES shared by two datasets.
sources = combined_df['source'].unique()
smiles_by_source = {
    source: set(combined_df.loc[combined_df['source'] == source, 'smiles'].dropna())
    for source in sources
}

# Summary of duplicates between each pair of datasets: a SMILES only counts for
# a pair when it is present in BOTH datasets. Counting every duplicated SMILES
# that appears in either dataset would also include duplicates that live inside
# a single dataset or that are shared with a third one.
summary = {}
for position, dataset1 in enumerate(sources):
    for dataset2 in sources[position + 1:]:
        pair = tuple(sorted([dataset1, dataset2]))
        shared_smiles = smiles_by_source[dataset1] & smiles_by_source[dataset2]
        summary[pair] = len(shared_smiles)

# Displaying the summary
for pair, count in summary.items():
    print(f"Duplicates between {pair[0]} and {pair[1]}: {count}")

Duplicates between robin_b and robin_n: 0
Duplicates between probes_drugs and robin_b: 0
Duplicates between robin_b and zinc_dark_m: 0
Duplicates between probes_drugs and robin_n: 0
Duplicates between robin_n and zinc_dark_m: 0
Duplicates between probes_drugs and zinc_dark_m: 0


In [5]:
# Show the distinct dataset labels stored in the 'source' column of combined_df.
combined_df['source'].unique()  

array(['robin_b', 'robin_n', 'probes_drugs', 'zinc_dark_m'], dtype=object)

In [6]:
# Split the combined frame back into one table per dataset label.
# .copy() gives each table its own data, so adding columns to it later is a
# plain assignment rather than a chained assignment on a slice of combined_df.
rna_binders = combined_df[combined_df['source'] == 'robin_b'].copy()
rna_non_binders = combined_df[combined_df['source'] == 'robin_n'].copy()
protein_binders = combined_df[combined_df['source'] == 'probes_drugs'].copy()
protein_non_binders = combined_df[combined_df['source'] == 'zinc_dark_m'].copy()



In [7]:
# Build the logD predictor once at notebook level; calculate_properties calls
# predict_logd.predict(mol) for every molecule.
# NOTE(review): presumably this loads a trained model from model_plus.txt —
# confirm against logd_predictor_clean.
predict_logd = LogDPredictor(model_file_name='../sfi/model_plus.txt')


def calculate_properties(mol):
    """
    Compute a fixed panel of descriptors for a single molecule.

    Parameters
    ----------
    mol
        Molecule object accepted by the RDKit descriptor functions used below
        and by the notebook-level ``predict_logd`` predictor.

    Returns
    -------
    pandas.Series
        Seventeen positional values, in this order: canonical SMILES,
        molecular weight, heavy-atom count, number of O, N, C, Cl, F and S
        atoms, hydrogen-bond acceptors, hydrogen-bond donors, ring count,
        cLogP (``Descriptors.MolLogP``), rotatable bonds, TPSA, QED, and the
        logD returned by ``predict_logd.predict``.
    """
    # Collect the element symbol of every atom once, then tally the elements of interest.
    atom_symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    element_counts = [
        atom_symbols.count(element)
        for element in ('O', 'N', 'C', 'Cl', 'F', 'S')
    ]

    drug_likeness = QED.qed(mol)
    # model for this prediction can be found in /sfi/build_logd_model.ipynb
    predicted_logd = predict_logd.predict(mol)

    return pd.Series([
        MolToSmiles(mol, canonical=True),
        Descriptors.MolWt(mol),
        mol.GetNumHeavyAtoms(),
        *element_counts,
        Lipinski.NumHAcceptors(mol),
        Lipinski.NumHDonors(mol),
        rdMolDescriptors.CalcNumRings(mol),
        Descriptors.MolLogP(mol),
        rdMolDescriptors.CalcNumRotatableBonds(mol),
        rdMolDescriptors.CalcTPSA(mol),
        drug_likeness,
        predicted_logd,
    ])


# Column names for the values returned by calculate_properties, in the same
# order as the Series it builds. Kept in one place so the four tables cannot
# drift apart.
PROPERTY_COLUMNS = ['smiles', 'MW', '#HeavyAtoms', 'NumO', 'NumN', 'NumC', 'NumCl', 'NumF', 'NumS', 'HBA', 'HBD', 'Rings', 'ClogP', '#RotBonds', 'TPSA', 'QED', 'ClogD']

# Compute the descriptors row by row from each table's 'mol' column and store
# them on that table; 'smiles' is overwritten with the canonical SMILES.
for property_table in (rna_binders, rna_non_binders, protein_binders, protein_non_binders):
    property_table[PROPERTY_COLUMNS] = property_table.apply(
        lambda row: calculate_properties(row['mol']), axis=1
    )


In [8]:
# Write each property table to its CSV file, leaving out the DataFrame index.
# NOTE(review): object-valued columns such as 'mol' are written in their string
# form — confirm downstream readers do not rely on them.
for property_frame, csv_path in (
    (rna_binders, '../output/set2/property_analysis/rna_binders.csv'),
    (rna_non_binders, '../output/set2/property_analysis/rna_non_binders.csv'),
    (protein_binders, '../output/set2/property_analysis/protein_binders.csv'),
    (protein_non_binders, '../output/set2/property_analysis/protein_non_binders.csv'),
):
    property_frame.to_csv(csv_path, index=False)
