# Figure X. Diversity

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import zipfile
import pandas as pd
from tqdm.auto import tqdm
from io import TextIOWrapper
from collections import defaultdict

Preparing the records for analysis.

In [2]:
def get_records(path, measure):

    trial_indices = defaultdict(int)
    hit_records_ = []
    records_ = []
    with zipfile.ZipFile(path, 'r') as z:
        for filename in tqdm(z.namelist()):
            if filename.endswith('.json'):
                with z.open(filename) as file:
                    with TextIOWrapper(file, encoding='utf-8') as text_file:
                        try:
                            # Load results
                            result = json.load(text_file)
                            value = result[measure]
                            if value == 1.0: # temperature 0.16 limit 1
                                record_raw = pd.DataFrame(
                                    result['annotations']
                                ).reindex(
                                    columns=[
                                        'SMILES',
                                        'Step Designed',
                                        'Step Made',
                                        'Step Scored',
                                        'Step Tested',
                                        'ABL1 pIC50',
                                        'Log P',
                                        'Log S'
                                    ]
                                )

                                # Process record
                                hit_record = record_raw.iloc[:5]
                                record = record_raw.dropna()

                                # Get measure of interest
                                record[measure] = value
                                hit_record[measure] = value

                                # Increment trial
                                trial_indices[value] += 1
                                record['trial'] = trial_indices[value]
                                hit_record['trial'] = trial_indices[value]

                                # Record list
                                hit_records_.append(hit_record)
                                records_.append(record)
                                result = None
                        except:
                            continue
    return records_, hit_records_

paths = [
    '../../dgym-data/analysis/noise/selection_noise_2024-05-10_23-36-56.zip',
]

measure = 'sigma'
hit_records = []
records = []
for path in paths:
    records_, hit_records_ = get_records(path, measure)
    records.extend(records_)
    hit_records.extend(hit_records_)

df_raw = pd.concat(records)
df_raw_hits = pd.concat(hit_records)
records = None
records = None

  0%|          | 0/500 [00:00<?, ?it/s]

# Score data

### Scoring starting hits

Load oracles.

In [3]:
import os

def get_tcp_objectives():

    dockstring_dir = f'../../dgym-data/dockstring_targets/'
    files = os.listdir(dockstring_dir)
    configs = sorted([f for f in files if 'conf' in f])
    targets = sorted([f for f in files if 'target' in f])

    idx = 0
    with open(dockstring_dir + configs[idx], 'r') as f:
        config_ = f.readlines()
        config_ = [c.replace('\n', '') for c in config_]
        config_ = [c.split(' = ') for c in config_ if c]
        config_ = {c[0]: float(c[1]) for c in config_}

    target = targets[idx]
    name = target.split('_')[0]

    config_.update({
        'size_x': 22.5,
        'size_y': 22.5,
        'size_z': 22.5,
    })

    from dgym.envs.oracle import \
        DockingOracle, CatBoostOracle, RDKitOracle, NoisyOracle

    config = {
        'search_mode': 'detailed',
        'scoring': 'vina',
        'seed': 5,
        **config_
    }

    pIC50_oracle = DockingOracle(
        f'{name} pIC50',
        receptor_path=f'../../dgym-data//dockstring_targets/{name}_target.pdbqt',
        config=config
    )
    log_P_oracle = RDKitOracle('Log P', descriptor='MolLogP')
    log_S_oracle = CatBoostOracle(
        'Log S', path='../dgym/envs/models/aqsolcb.model')
    
    return pIC50_oracle, log_P_oracle, log_S_oracle

Score hits.

In [5]:
import io
import dgym as dg

hits = dg.collection.MoleculeCollection(
    [dg.molecule.Molecule(s) for s in df_raw_hits.SMILES]
)

pIC50_oracle, log_P_oracle, log_S_oracle = get_tcp_objectives()

from contextlib import redirect_stdout
f = io.StringIO()

with redirect_stdout(f):
    df_raw_hits['ABL1 pIC50'] = pIC50_oracle(hits, batch_size=40)
    df_raw_hits['Log P'] = log_P_oracle(hits, batch_size=40)
    df_raw_hits['Log S'] = log_S_oracle(hits, batch_size=40)
    df_raw_hits = df_raw_hits.fillna(0.0)

Write to disk.

In [None]:
pd.concat([df_raw_hits, df_raw]).to_csv('../../dgym-data/analysis/novartis/trajectories_2.csv')

# Load from disk

`Temperature = 0.16`

In [14]:
import pandas as pd
df_raw_complete = pd.read_csv('../../dgym-data/analysis/novartis/trajectories_2.csv', index_col=0)

Assign outcomes.

In [16]:
from dgym.envs.utility import Policy, ClassicUtilityFunction

# create evaluators
docking_utility_function = ClassicUtilityFunction(
    ideal=(9, 13),
    acceptable=(8, 13)
)

log_P_utility_function = ClassicUtilityFunction(
    ideal=(-0.5, 3.5),
    acceptable=(-0.5, 3.5)
)

log_S_utility_function = ClassicUtilityFunction(
    ideal=(-4, 1),
    acceptable=(-4, 1)
)

composite_utility_function = Policy([
    docking_utility_function, log_P_utility_function, log_S_utility_function])

In [6]:
from tqdm.auto import tqdm

records = []
for trial, data in tqdm(df_raw_complete.groupby('trial')):
    
    utility = composite_utility_function(
        data.reindex(columns=['ABL1 pIC50', 'Log P', 'Log S']).values,
        precompute=True,
        method='average'
    )
    try:
        cost = utility.index(1.0)
        outcome = 1
    except:
        cost = len(utility)
        outcome = 0

    data['utility'] = utility
    data['outcome'] = outcome
    records.append(data)

df = pd.concat(records).query('outcome == 1')

  0%|          | 0/100 [00:00<?, ?it/s]

Featurize but do not marginalize over sigmas.

In [27]:
from tqdm.auto import tqdm
from scikit_mol.conversions import SmilesToMolTransformer
from scikit_mol.descriptors import MolecularDescriptorTransformer

smiles_to_mol = SmilesToMolTransformer(parallel=True)
desc_list = [
    'HeavyAtomCount',
    'FractionCSP3',
    'NumRotatableBonds',
    'TPSA',
    'RingCount',
    'NumAliphaticRings',
    'NumAromaticRings',
    'NOCount',
    'NHOHCount',
    'HeavyAtomMolWt',
    'ExactMolWt',
    'NumHAcceptors',
    'NumHDonors',
    'qed'
]
featurizer = MolecularDescriptorTransformer(
    desc_list, parallel=True)

trial_data = []
for (trial, sigma), data in tqdm(df.groupby(['trial', 'sigma'])):
    
    rd_mols = smiles_to_mol.transform(data['SMILES'].values)
    X = featurizer.transform(rd_mols)
    
    for (label, features) in zip(desc_list, X.T):
        data[label] = features
    
    # Convert step made to Progress of Series as in Novartis paper.
    data['Progress Raw'] = data['Step Made'] / data['Step Made'].max() * 100

    trial_data.append(data)

mol_properties = pd.concat(trial_data)

  0%|          | 0/500 [00:00<?, ?it/s]

In [28]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from scipy.spatial.distance import pdist, squareform

def calculate_tanimoto_similarity(smiles_list):
    """
    Calculate the Tanimoto similarity matrix for a list of SMILES strings.

    Parameters
    ----------
    smiles_list : list of str
        List of SMILES strings representing the molecules.

    Returns
    -------
    np.ndarray
        A symmetric matrix containing Tanimoto similarities between all pairs of molecules.
    """
    fingerprints = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2) for smiles in smiles_list]
    num_molecules = len(fingerprints)
    similarity_matrix = np.zeros((num_molecules, num_molecules))

    for i in range(num_molecules):
        for j in range(i, num_molecules):
            similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity

    return similarity_matrix

def calculate_diversity(smiles_list):
    """
    Calculate the diversity of a batch of molecules based on their SMILES strings.

    Parameters
    ----------
    smiles_list : list of str
        List of SMILES strings representing the molecules.

    Returns
    -------
    float
        The diversity score of the batch, where a lower score indicates higher diversity.
    """
    similarity_matrix = calculate_tanimoto_similarity(smiles_list)
    upper_triangular_values = similarity_matrix[np.triu_indices(len(smiles_list), k=1)]
    diversity_score = 1 - np.mean(upper_triangular_values)
    return diversity_score

diversity_records = []
for sigma in tqdm([0.0, 0.5, 1.0, 1.5, 2.0]):
    for trial in range(100):
        smiles_in_batch = mol_properties.query(f'trial == {trial}').query(f'sigma == {sigma}')['SMILES']
        diversity = calculate_diversity(smiles_in_batch)
        diversity_records.append({'trial': trial, 'sigma': sigma, 'diversity': diversity})

  0%|          | 0/5 [00:00<?, ?it/s]

In [33]:
num_bins = 100
mol_properties['Progress (%)'] = pd.cut(
    mol_properties['Progress Raw'], num_bins, labels=range(num_bins)).astype(int)

In [35]:
mol_properties['Progress (%)']

0        0
1        0
2        0
3        0
4        0
        ..
3267    99
3273    99
3284    99
3286    99
3414    99
Name: Progress (%), Length: 129015, dtype: int64

In [39]:
pd.DataFrame(diversity_records).groupby('sigma').agg({'diversity': 'mean'})

Unnamed: 0_level_0,diversity
sigma,Unnamed: 1_level_1
0.0,0.779863
0.5,0.789517
1.0,0.794149
1.5,0.803968
2.0,0.804351
