# Debugging the design

In [1]:
%load_ext autoreload
%autoreload 2

import uuid
import argparse
import dgym as dg
import pandas as pd

def get_data(path):

    deck = dg.MoleculeCollection.load(
        f'{path}/DSi-Poised_Library_annotated.sdf',
        reactant_names=['reagsmi1', 'reagsmi2', 'reagsmi3']
    )

    reactions = dg.ReactionCollection.from_json(
        path = f'{path}/All_Rxns_rxn_library_sorted.json',
        smarts_col = 'reaction_string',
        classes_col = 'functional_groups'
    )

    building_blocks = dg.datasets.disk_loader(
        f'{path}/Enamine_Building_Blocks_Stock_262336cmpd_20230630.sdf')
    fingerprints = dg.datasets.fingerprints(
        f'{path}/Enamine_Building_Blocks_Stock_262336cmpd_20230630_atoms.fpb')

    import torch
    import pyarrow.parquet as pq
    table = pq.read_table(f'{path}/sizes.parquet')[0]
    sizes = torch.tensor(table.to_numpy())

    return deck, reactions, building_blocks, fingerprints, sizes

In [2]:
_, reactions, building_blocks, fingerprints, sizes = get_data('../../dgym-data/')

In [3]:
from dgym.envs.designer import Designer, Generator

generator = Generator(building_blocks, fingerprints, sizes)
designer = Designer(generator, reactions)

### Visualizing molecules

In [270]:
# start = designer.design(1)[0]

routines = [
    {'temperature': 0.0, 'limit': 1},
    {'temperature': 0.02, 'limit': 1},
    {'temperature': 0.04, 'limit': 1},
    {'temperature': 0.08, 'limit': 1},
    {'temperature': 0.16, 'limit': 1},
    {'temperature': 0.32, 'limit': 1},
    {'temperature': 0.64, 'limit': 1},
    {'temperature': 0.0, 'limit': 2},
    {'temperature': 0.02, 'limit': 2},
    {'temperature': 0.04, 'limit': 2},
    {'temperature': 0.08, 'limit': 2},
    {'temperature': 0.16, 'limit': 2},
    {'temperature': 0.32, 'limit': 2},
    {'temperature': 0.64, 'limit': 2},
    {'temperature': 0.0, 'limit': 10},
    {'temperature': 0.02, 'limit': 10},
    {'temperature': 0.04, 'limit': 10},
    {'temperature': 0.08, 'limit': 10},
    {'temperature': 0.16, 'limit': 10},
    {'temperature': 0.32, 'limit': 10},
    {'temperature': 0.64, 'limit': 10}
]

results = []
for r in routines:
    temperature, limit = r.values()
    # designer.reset()
    res = designer.design(
        start, 1, temperature=temperature,
        limit=limit
    )[0]
    results.append(res)

# gen = designer.generator([start.reactants[1]], search='similar', temperature=0.04)
# mols = [next(gen[0]).mol for _ in range(10)]

from rdkit.Chem.Draw import MolsToGridImage
print('temperature', temperature, 'limit', limit)
MolsToGridImage([r.mol for r in results], molsPerRow=3, subImgSize=[400, 400], useSVG=True)

## Testing average expected Tanimoto similarity for different routines

In [249]:
import numpy as np

def compute_tanimoto_similarity(fp1, fp2):
    """
    Computes the Tanimoto similarity between two binary fingerprint arrays.

    Parameters:
    fp1 (np.array): Numpy array representing the first binary fingerprint.
    fp2 (np.array): Numpy array representing the second binary fingerprint.

    Returns:
    float: Tanimoto similarity score between the two fingerprints.
    """
    # Convert RDKit ExplicitBitVect to numpy arrays
    arr1 = np.array(fp1)
    arr2 = np.array(fp2)

    # Compute the intersection and union of the fingerprints
    intersection = np.sum(arr1 & arr2)
    union = np.sum(arr1 | arr2)

    # Compute Tanimoto similarity
    tanimoto_similarity = intersection / union
    return tanimoto_similarity

In [310]:
routines = [
    {'temperature': 0.0, 'limit': 1},
    {'temperature': 0.02, 'limit': 1},
    {'temperature': 0.04, 'limit': 1},
    {'temperature': 0.08, 'limit': 1},
    {'temperature': 0.16, 'limit': 1},
    {'temperature': 0.32, 'limit': 1},
    {'temperature': 0.64, 'limit': 1},
    {'temperature': 0.0, 'limit': 2},
    {'temperature': 0.02, 'limit': 2},
    {'temperature': 0.04, 'limit': 2},
    {'temperature': 0.08, 'limit': 2},
    {'temperature': 0.16, 'limit': 2},
    {'temperature': 0.32, 'limit': 2},
    {'temperature': 0.64, 'limit': 2},
    {'temperature': 0.0, 'limit': 10},
    {'temperature': 0.02, 'limit': 10},
    {'temperature': 0.04, 'limit': 10},
    {'temperature': 0.08, 'limit': 10},
    {'temperature': 0.16, 'limit': 10},
    {'temperature': 0.32, 'limit': 10},
    {'temperature': 0.64, 'limit': 10}
]

def make_design(start, designer, creativity: int = 0):
        
    routine = routines[creativity]
    temperature, limit = routine.values()
    res = designer.design(start, 1, temperature=temperature, limit=limit)
    return res[0]

In [274]:
from tqdm.auto import tqdm
from scikit_mol.fingerprints import RDKitFingerprintTransformer
fingerprinter = RDKitFingerprintTransformer(parallel=True)

records = []
for _ in tqdm(range(100)):
    
    designer.reset()
    start = designer.design(1)[0]
    
    analogs = []
    for creativity in range(21):
        analog = make_design(start, designer, creativity)
        analogs.append(analog)

    progression = [start.mol, *[a.mol for a in analogs]]
    fps = fingerprinter.transform(progression)
    similarities = [compute_tanimoto_similarity(fps[0], f) for f in fps]
    
    records.append(similarities[1:])

  0%|          | 0/100 [00:00<?, ?it/s]

In [353]:
import pandas as pd
import seaborn as sns

df_raw = pd.DataFrame(records)
df = df_raw.melt(var_name=['creativity'], value_name='similarity')
df = df.query('creativity > 0')
df['creativity'] -= 1
df['temperature'] = \
    df['creativity'].apply([r['temperature'] for r in routines].__getitem__)
df['# reactants changed'] = \
    df['creativity'].apply([r['limit'] for r in routines].__getitem__)
df_summary = (
    df
    .groupby('creativity')
    .agg({
        'similarity': ['mean', 'std'],
        'temperature': 'mean',
        '# reactants changed': 'mean'
    })
)

top_1_similarity = (
    df_summary
    .sort_values([('similarity', 'mean')], ascending=False)
)

top_1_similarity

Unnamed: 0_level_0,similarity,similarity,temperature,# reactants changed
Unnamed: 0_level_1,mean,std,mean,mean
creativity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0.961456,0.106277,0.0,1.0
1,0.95783,0.115166,0.02,1.0
14,0.93299,0.126209,0.0,10.0
15,0.920125,0.149564,0.02,10.0
7,0.91932,0.149437,0.0,2.0
2,0.915386,0.155573,0.04,1.0
8,0.912076,0.166027,0.02,2.0
16,0.901296,0.192153,0.04,10.0
9,0.897109,0.181095,0.04,2.0
3,0.852827,0.239815,0.08,1.0
