# Testing the New Experiment API

Load the molecule deck, reaction library, building blocks, fingerprints, and size table.

In [1]:
%reload_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings('ignore') 

import rdkit
import dgym as dg

# Root directory that every dataset below is read from
path = '../../dgym-data'

# Molecule deck (SDF), with the three annotation fields used as reactant names
deck = dg.MoleculeCollection.load(
    f'{path}/DSi-Poised_Library_annotated.sdf',
    reactant_names=['reagsmi1', 'reagsmi2', 'reagsmi3'],
)

# Reaction library (JSON); SMARTS and class labels come from the named columns
reactions = dg.ReactionCollection.from_json(
    path=f'{path}/All_Rxns_rxn_library_sorted.json',
    smarts_col='reaction_string',
    classes_col='functional_groups',
)

# Building blocks (SDF) and their matching fingerprint file (FPB)
building_blocks = dg.datasets.disk_loader(
    f'{path}/Enamine_Building_Blocks_Stock_262336cmpd_20230630.sdf'
)
fingerprints = dg.datasets.fingerprints(
    f'{path}/Enamine_Building_Blocks_Stock_262336cmpd_20230630_atoms.fpb'
)

import torch
import pyarrow.parquet as pq
# Read the first column of the sizes table from the shared data directory.
# Uses `path` (set above) instead of repeating the directory literal, so the
# data location is configured in exactly one place.
table = pq.read_table(f'{path}/sizes.parquet')[0]
# Convert that column to a torch tensor for the Generator.
sizes = torch.tensor(table.to_numpy())

Load assays and utility functions.

In [2]:
import os

def get_tcp_objectives(data_dir=None, target_index=0):
    """Build the oracles used as objectives in this notebook.

    Reads one docking-box config file and its receptor from
    ``{data_dir}/dockstring_targets/`` and constructs a docking oracle for it,
    alongside Log P, Log S, constant, and random oracles.

    Parameters
    ----------
    data_dir : str, optional
        Root data directory. Defaults to the notebook-level ``path`` variable,
        which is looked up at call time.
    target_index : int, optional
        Position of the target to use within the sorted config / target file
        listings. Defaults to 0 (the first one).

    Returns
    -------
    tuple
        ``(pIC50_oracle, log_P_oracle, log_S_oracle, constant_oracle,
        random_oracle)``, in that order.

    Raises
    ------
    IndexError
        If ``target_index`` has no corresponding config or target file.
    """
    from dgym.envs.oracle import \
        ConstantOracle, RandomOracle, DockingOracle, CatBoostOracle, RDKitOracle

    # Fall back to the notebook's global data directory when none is passed.
    if data_dir is None:
        data_dir = path

    dockstring_dir = f'{data_dir}/dockstring_targets/'
    files = os.listdir(dockstring_dir)
    configs = sorted(f for f in files if 'conf' in f)
    targets = sorted(f for f in files if 'target' in f)

    # Config and target files are paired purely by their sorted position.
    if not (0 <= target_index < min(len(configs), len(targets))):
        raise IndexError(
            f'No config/target pair at index {target_index} in {dockstring_dir} '
            f'({len(configs)} config file(s), {len(targets)} target file(s)).'
        )

    # Parse "key = value" lines into floats, skipping blank lines.
    config_ = {}
    with open(dockstring_dir + configs[target_index], 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, _, value = line.partition('=')
            config_[key.strip()] = float(value)

    # Target name is the filename prefix before the first underscore.
    name = targets[target_index].split('_')[0]

    # Override whatever box size the config file specified with a fixed size.
    config_.update({
        'size_x': 22.5,
        'size_y': 22.5,
        'size_z': 22.5,
    })

    config = {
        'search_mode': 'detailed',
        'scoring': 'vina',
        'seed': 5,
        **config_
    }

    pIC50_oracle = DockingOracle(
        f'{name} pIC50',
        receptor_path=f'{data_dir}/dockstring_targets/{name}_target.pdbqt',
        config=config
    )
    constant_oracle = ConstantOracle('Constant', constant=1)
    random_oracle = RandomOracle('Random')
    log_P_oracle = RDKitOracle('Log P', descriptor='MolLogP')
    # NOTE(review): this model path is relative to the working directory,
    # not to `data_dir` -- confirm that is intended.
    log_S_oracle = CatBoostOracle(
        'Log S', path='../dgym/envs/models/aqsolcb.model')

    return pIC50_oracle, log_P_oracle, log_S_oracle, constant_oracle, random_oracle

In [3]:
from dgym.envs.utility import ClassicUtilityFunction, MultipleUtilityFunction

# Unpack the five oracles returned by get_tcp_objectives()
(
    pIC50_oracle,
    log_P_oracle,
    log_S_oracle,
    constant_oracle,
    random_oracle,
) = get_tcp_objectives()

# One utility function per oracle, each with an ideal and an acceptable range
pIC50_utility = ClassicUtilityFunction(
    pIC50_oracle,
    ideal=(9.5, 13),
    acceptable=(8, 13),
)
log_P_utility = ClassicUtilityFunction(
    log_P_oracle,
    ideal=(0.5, 1.85),
    acceptable=(-0.5, 3.5),
)
log_S_utility = ClassicUtilityFunction(
    log_S_oracle,
    ideal=(-3, 1),
    acceptable=(-4, 1),
)
constant_utility = ClassicUtilityFunction(
    constant_oracle,
    ideal=(3, 4),
    acceptable=(2, 5),
)
random_utility = ClassicUtilityFunction(
    random_oracle,
    ideal=(3, 4),
    acceptable=(2, 5),
)

# Assemble assays and surrogate models
from dgym.envs.oracle import GaussianOracle
# The three real assays come first, followed by one Gaussian "Noisy"
# counterpart per real assay (same order, fixed loc per assay, scale=100).
assays = [pIC50_oracle, log_P_oracle, log_S_oracle] + [
    GaussianOracle(f'Noisy {oracle.name}', loc=loc, scale=100)
    for oracle, loc in (
        (pIC50_oracle, 7.460298232446733),
        (log_P_oracle, 2.223214738326521),
        (log_S_oracle, -3.752548978069126),
    )
]

# Environment tolerates acceptable ADMET
from copy import deepcopy
# Agent-side utility: weighted combination of the pIC50, Log P and Log S utilities
utility_agent = MultipleUtilityFunction(
    utility_functions=[pIC50_utility, log_P_utility, log_S_utility],
    weights=[0.8, 0.1, 0.1],
)

# Environment-side utility: an independent copy in which the Log P and Log S
# entries (positions 1 and 2) use their acceptable range as the ideal range.
utility_env = deepcopy(utility_agent)
for admet_position in (1, 2):
    admet_utility = utility_env.utility_functions[admet_position]
    admet_utility.ideal = admet_utility.acceptable

Instantiate designer.

In [4]:
from dgym.envs.designer import Designer, Generator

# Generator over the building blocks, their fingerprints, and the size table
generator = Generator(building_blocks, fingerprints, sizes)

# Designer combining the generator with the reaction library, caching enabled
designer = Designer(generator, reactions, cache=True)

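Pick 5 random starting hits from the deck, keeping only molecules that have exactly two reactants and that the designer can match to a reaction.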

In [5]:
import warnings
warnings.filterwarnings("ignore")

# select first molecule
import random
def select_molecule(deck, max_attempts=10_000):
    """Randomly pick a molecule from ``deck`` that can seed the design loop.

    A molecule qualifies when it has exactly two reactants and
    ``designer.match_reactions`` returns a truthy value for it. Candidates are
    drawn uniformly at random with replacement until one qualifies.

    Parameters
    ----------
    deck : sequence
        Indexable collection of molecules supporting ``len``.
    max_attempts : int, optional
        Upper bound on the number of random draws. Bounding the search with a
        loop avoids the ``RecursionError`` an unbounded recursive retry hits
        when qualifying molecules are rare or absent.

    Returns
    -------
    The first randomly drawn molecule that qualifies.

    Raises
    ------
    ValueError
        If ``deck`` is empty.
    RuntimeError
        If no qualifying molecule is found within ``max_attempts`` draws.
    """
    deck_size = len(deck)
    if deck_size == 0:
        raise ValueError('Cannot select a molecule from an empty deck.')

    for _attempt in range(max_attempts):
        candidate = deck[random.randint(0, deck_size - 1)]
        # Cheap reactant-count check first; reaction matching only if it passes.
        if len(candidate.reactants) == 2 \
            and designer.match_reactions(candidate):
            return candidate

    raise RuntimeError(
        f'No molecule with two reactants and a matching reaction was found '
        f'in {max_attempts} random draws.'
    )

# Draw five starting molecules and wrap them in a collection
initial_molecules = [
    select_molecule(deck)
    for _ in range(5)
]
library = dg.MoleculeCollection(initial_molecules).update_annotations()

# Annotate every molecule with the result of each non-"Noisy" assay
for assay in assays:
    if 'Noisy' in assay.name:
        continue
    results = assay(library)
    for molecule, result in zip(library, results):
        molecule.update_annotations({assay.name: result})

# Mark the whole starting library as tested at step 0
library.set_status('Tested', step=0)

[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 4 
[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 3 
[18:17:40] mapped atoms in the reactants were not mapped in the products.
  unmapped numbers are: 4 
[18:17:40] product atom-mapping number 15 not found in reactants.
[18:17:40] product atom-mapping number 14 not found in reactants.
[18:17:40] product atom-mapping number 13 not found in reactants.
[18:17:40] reactant 1 has no mapped atoms.
[18:17:40] product atom-mapping number 2 not found in reactants.
[18:17:40] product atom-mapping number 3 not found in reactants.
[18:17:40] product atom

[]
unidock --receptor ../../dgym-data/dockstring_targets/ABL1_target.pdbqt --ligand_index /tmp/tmppywa7aip/ligands.txt --dir /tmp/tmppywa7aip --search_mode detailed --scoring vina --seed 5 --center_x 15.851 --center_y 14.647 --center_z 3.904 --size_x 22.5 --size_y 22.5 --size_z 22.5
CompletedProcess(args='unidock --receptor ../../dgym-data/dockstring_targets/ABL1_target.pdbqt --ligand_index /tmp/tmppywa7aip/ligands.txt --dir /tmp/tmppywa7aip --search_mode detailed --scoring vina --seed 5 --center_x 15.851 --center_y 14.647 --center_z 3.904 --size_x 22.5 --size_y 22.5 --size_z 22.5', returncode=0, stdout='Uni-Dock v1.1.0\n\nIf you used Uni-Dock in your work, please cite:               \n \nYu, Y., Cai, C., Wang, J., Bo, Z., Zhu, Z., & Zheng, H. (2023). \nUni-Dock: GPU-Accelerated Docking Enables Ultralarge Virtual Screening. \nJournal of Chemical Theory and Computation.                    \nhttps://doi.org/10.1021/acs.jctc.2c01145                       \n\nTang, S., Chen, R., Lin, M., L

Instantiate Environment and Agent.

In [6]:
from dgym.envs import DrugEnv
from dgym.agents import SequentialDrugAgent
from dgym.agents.exploration import EpsilonGreedy

# Environment built from the designer, starting library, assays, and the
# environment-side utility function
drug_env = DrugEnv(
    designer=designer,
    library=library,
    assays=assays,
    utility_function=utility_env,
)

# Individual steps available to the agent's action sequence
design_grow = {
    'name': 'design',
    'batch_size': 8,
    'parameters': {'strategy': 'grow', 'size': 5},
}
design_replace = {
    'name': 'design',
    'batch_size': 8,
    'parameters': {'strategy': 'replace', 'size': 5, 'temperature': 0.2},
}
score = {
    'name': ['Noisy ABL1 pIC50', 'Noisy Log S', 'Noisy Log P'],
    'batch_size': 8 * 5,
    'parameters': {'batch_size': 40},
}
make = {'name': 'make', 'batch_size': 8}
test = {'name': ['ABL1 pIC50', 'Log S', 'Log P'], 'batch_size': 8}

# Full sequence: replace + score, then grow + score, then make and test
design_and_score = [design_replace, score]
sequence = design_and_score + [design_grow, score, make, test]

# Agent that follows the sequence, using the agent-side utility function
drug_agent = SequentialDrugAgent(
    sequence=sequence,
    exploration_strategy=EpsilonGreedy(epsilon=0.25),
    utility_function=utility_agent,
)

In [7]:
def display_best(experiment, columns=('ABL1 pIC50', 'Log S', 'Log P'), n=5):
    """Display the highest-utility tested molecules of an experiment.

    Computes a utility for every molecule in ``experiment.drug_env.library``,
    stores it on each molecule under the ``'utility'`` annotation, and, if any
    molecules are tested, displays the top rows of their annotation table
    sorted by descending utility.

    Parameters
    ----------
    experiment
        Object exposing ``drug_env.library`` and ``drug_env.utility_function``.
    columns : sequence of str, optional
        Annotation columns shown alongside ``'utility'``.
    n : int, optional
        Number of rows to display.

    Returns
    -------
    None
        The table is shown with ``display``; nothing is displayed when there
        are no tested molecules.
    """
    observations = experiment.drug_env.library
    utilities = experiment.drug_env.utility_function(
        observations, use_precomputed=True, method='average')

    # Side effect: every molecule in the library gains a 'utility' annotation.
    for obs, utility in zip(observations, utilities):
        obs.update_annotations({'utility': utility})

    tested = observations.tested
    if tested:
        best_tested = (
            tested
            .annotations
            .sort_values('utility', ascending=False)
            [[*columns, 'utility']]
            .head(n)
        )
        display(best_tested)

## Test the experiment loading mechanism

In [8]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import zipfile
import pandas as pd
from tqdm.auto import tqdm
from io import TextIOWrapper
from collections import defaultdict

In [9]:
# Archive of saved experiment results. It gets its own name so that the data
# directory stored in `path` (read by get_tcp_objectives) is not overwritten,
# which would break re-running the earlier cells.
results_zip_path = '../../dgym-data/analysis/noise/selection_max_noise_2024-05-19_11-56-02.zip'

with zipfile.ZipFile(results_zip_path, 'r') as z:
    # Load the first JSON member wherever it sits in the archive, and fail
    # loudly if there is none, rather than leaving `result` unset or stale.
    json_members = [name for name in z.namelist() if name.endswith('.json')]
    if not json_members:
        raise FileNotFoundError(
            f'No .json result found in {results_zip_path}')
    filename = json_members[0]
    with z.open(filename) as file:
        with TextIOWrapper(file, encoding='utf-8') as text_file:
            result = json.load(text_file)

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:

# NOTE(review): `experiment` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel (Restart & Run All).
# Presumably it should be built from `drug_agent` and `drug_env` -- confirm
# and add the missing definition.
# Rebuild an experiment from the JSON `result` loaded above.
exp_loaded = experiment.load(result)
# Run the loaded experiment; in the saved output this run was stopped by a
# KeyboardInterrupt.
exp_loaded.run()

  0%|          | 0/1 [00:00<?, ?it/s]

MoleculeCollection with 4871 Molecules


  0%|          | 0/100000 [00:00<?, ?it/s]

Created action
{'name': 'design', 'parameters': {'strategy': 'replace', 'size': 5, 'temperature': 0.2}, 'molecules': [2287, 4798, 3942, 3868, 4335, 1007, 936, 3846]}
['Scored', 'Tested', 'Tested', 'Tested', 'Tested', 'Scored', 'Scored', 'Tested']


[18:17:50] Can't kekulize mol.  Unkekulized atoms: 8 10 13
[18:17:50] Can't kekulize mol.  Unkekulized atoms: 8 10 12
[18:17:50] Can't kekulize mol.  Unkekulized atoms: 9 11 14
[18:17:50] Can't kekulize mol.  Unkekulized atoms: 8 10 13
[18:17:50] Can't kekulize mol.  Unkekulized atoms: 11 13 15
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 11 12
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 10 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 18 19 22
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 8 9 12
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 12 13
[18:17:51] Can't kekulize mol.  Unkekulized atoms: 9 1

0.7588616636136815
Created action
{'name': ['Noisy ABL1 pIC50', 'Noisy Log S', 'Noisy Log P'], 'parameters': {'batch_size': 40}, 'molecules': [4892, 4871, 4872, 4873, 4875, 4874, 4876, 4877, 4878, 4879, 4902, 4880, 4881, 4882, 4883, 4884, 4885, 4886, 4887, 4897, 4888, 4889, 4890, 4893, 4891, 4894, 4896, 4895, 4898, 4899, 4900, 4901, 4903, 4904, 4905, 4907, 4906, 4908, 4909, 4910]}
['Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed', 'Designed']
{'batch_size': 40}
[6.384607999946125, 100.36985629446562, -3.9409014414031143, 159.58050731312403, -4.871004632250275, 81.995366166463

[18:17:52] Can't kekulize mol.  Unkekulized atoms: 8 10 13
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 9 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 8 9 12
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 9 10 13
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 9 10 13
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized atoms: 10 11 14
[18:17:52] Can't kekulize mol.  Unkekulized at

[-11.725029668715983, 86.78202479688444, 57.241174306086904, 99.34383503219179, 36.34847721064404, 144.39509837274935, -124.01581716760708, 44.304293015804674, 102.47088983583312, -137.04729500827057, 45.48284908281319, 48.523819407210844, 148.84445484849078, -135.49539455907762, -54.15846210048626, 29.058364409455233, 59.14176442894635, -22.058789843246156, -128.88209324465936, -59.358818557992706, 216.06014726849293, 29.76516794568084, 119.26361135034196, -19.74803913141332, -37.31701238975586, 62.47156765132884, -51.479383454771806, -86.94160338235103, 159.52563269922118, -76.29517816628919, 21.65754229958517, 44.61018773409938, -108.64230213768427, -29.466085346179245, -28.605383736894105, 85.16986417516507, -91.47994376295493, 12.676002529212123, 10.647398641408696, 129.71951870129038]
0.7588616636136815
Created action
{'name': 'make', 'parameters': {}, 'molecules': [4876, 4887, 4780, 4550, 4800, 4542, 4845, 2167]}
['Scored', 'Scored', 'Scored', 'Scored', 'Scored', 'Scored', 'Scor


KeyboardInterrupt



In [170]:
# # exp_loaded.drug_agent.reset()
# # observations, _ = exp_loaded.drug_env.reset()
# action = exp_loaded.drug_agent.act(observations)
# print(action)
# observations, _, _, _, _ = exp_loaded.drug_env.step(action)