In [1]:
# REINVENT4 workflow: SMILES sampling, transfer learning, and staged reinforcement learning with DockStream

In [2]:
import os
import json
import toml
# Root directory of this project; input data, output data, logs and configs
# are placed in sub-directories of it. The path is already absolute, so no
# "~" expansion is needed.
project_dir = "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie"

In [3]:
# Every location below sits under one home directory. The paths are absolute,
# so they are assembled directly rather than through "~" expansion.
_home_dir = "/Users/devanshjain"

# Project workspace and the prefix used in generated file names.
project_dir = f"{_home_dir}/laboratoire_d_intelligence_artificielle_en_chimie"
output_prefix = "p7xn1"

# DockStream checkout and the conda environment whose interpreter runs it.
dockstream_path = f"{_home_dir}/DockStream"
dockstream_env = f"{_home_dir}/miniconda3/envs/DockStream"

# Target structure files; change these to your own apo protein and
# reference ligand files.
apo_protein_filename = f"{_home_dir}/7xn1_apo.pdb"
reference_ligand_filename = f"{_home_dir}/7xn1_tacrine.pdb"

In [4]:
def _prefixed(folder, suffix):
    """Return the path ``folder/<output_prefix><suffix>``."""
    return os.path.join(folder, f"{output_prefix}{suffix}")


# DockStream entry-point scripts.
target_preparator = os.path.join(dockstream_path, "target_preparator.py")
docker = os.path.join(dockstream_path, "docker.py")

# Project directory layout: four top-level folders, plus two result folders
# nested under the output directory.
input_data_dir, output_data_dir, logs_dir, config_dir = (
    os.path.join(project_dir, folder_name)
    for folder_name in ("input_data", "output_data", "logs", "configs")
)
lig_docked_dir, scores_dir = (
    os.path.join(output_data_dir, folder_name)
    for folder_name in ("ligands_docked", "docking_scores")
)

# Make sure the whole layout exists; already-present directories are fine.
for required_dir in (
    input_data_dir,
    output_data_dir,
    logs_dir,
    config_dir,
    lig_docked_dir,
    scores_dir,
):
    os.makedirs(required_dir, exist_ok=True)

# Input structures. Note that os.path.join returns the filename unchanged
# when the filename itself is an absolute path.
apo_protein_path = os.path.join(input_data_dir, apo_protein_filename)
reference_ligand_path = os.path.join(input_data_dir, reference_ligand_filename)

# Target-preparation config and the receptor files kept with the input data.
target_prep_path = _prefixed(config_dir, "_target_prep.json")
fixed_pdb_path = _prefixed(input_data_dir, "_fixed_target.pdb")
receptor_path = _prefixed(input_data_dir, "_receptor.pdbqt")

# One log file per workflow step.
log_file_target_prep = _prefixed(logs_dir, "_target_prep.log")
log_file_docking = _prefixed(logs_dir, "_docking.log")
log_file_reinvent = _prefixed(logs_dir, "_reinvent.log")

# Docking config and the files holding docking results.
docking_path = _prefixed(config_dir, "_docking.json")
ligands_docked_path = _prefixed(lig_docked_dir, "_ligands_docked.sdf")
ligands_scores_path = _prefixed(scores_dir, "_scores.csv")
ligands_conformer_path = _prefixed(lig_docked_dir, "pydantic.sdf")

In [5]:
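# Sampling: draw SMILES from the transfer-learned model (TL_reinvent.model must already exist; it is written by the transfer-learning run further down)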

In [6]:
# REINVENT4 "sampling" run configuration, kept as TOML text.
# Paths inside the project are interpolated from `project_dir`, so the config
# follows that setting instead of carrying a hard-coded copy of it; for the
# current `project_dir` the rendered text is unchanged.
# NOTE: model_file is the same path that the transfer-learning config in this
# notebook uses as output_model_file, so that run has to have produced the
# model before this configuration can be used.
sampling_toml = f"""

run_type = "sampling"
device = "cpu"  # set torch device e.g. "cpu"
json_out_config = "_sampling.json"  # write this TOML to JSON

[parameters]

## Reinvent: de novo sampling
#model_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"
#transfer_model_file
model_file = "{project_dir}/transfer/TL_reinvent.model"


output_file = '{project_dir}/transfer/transfer_sampling.csv'  # sampled SMILES and NLL in CSV format

num_smiles = 108  # number of SMILES to be sampled, 1 per input SMILES
unique_molecules = true  # if true remove all duplicatesd canonicalize smiles
randomize_smiles = true # if true shuffle atoms in SMILES randomly

"""

In [7]:
# Destination of the sampling configuration that is handed to `reinvent`.
sampling_path = os.path.join(project_dir, "sampling_config.toml")

# Parse the template first: malformed TOML raises here, before any file
# is opened for writing.
sampling_dict = toml.loads(sampling_toml)

# Serialise the parsed configuration and store it at sampling_path.
with open(sampling_path, 'w') as toml_file:
    toml_file.write(toml.dumps(sampling_dict))

In [8]:
# Run the `reinvent` command-line tool on the sampling config written above.
# IPython substitutes the {…} placeholders with the Python variables' values;
# the value passed after -l is the REINVENT log file path.
# NOTE(review): the paths are interpolated unquoted, so a path containing
# spaces would be split by the shell.
!reinvent -l {log_file_reinvent} {sampling_path}

In [9]:
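# Transfer learning: fine-tune the REINVENT prior on the SMILES in smiles.smi and save the result as TL_reinvent.model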

In [10]:
# REINVENT4 "transfer_learning" run configuration, kept as TOML text.
# output_model_file is interpolated from `project_dir`, so it follows that
# setting instead of carrying a hard-coded copy of it; for the current
# `project_dir` the rendered text is unchanged.
# NOTE: smiles_file and validation_smiles_file point at the same file, so the
# validation loss is computed on the training molecules.
transfer_toml = f"""

run_type = "transfer_learning"
device = "cpu"  # set torch device e.g. "cpu"
tb_logdir = "tb_TL"  # name of the TensorBoard logging directory
json_out_config = "json_transfer_learning.json"  # write this TOML to JSON

[parameters]

num_epochs = 3  # number of steps to run
save_every_n_epochs = 3  # save checkpoint model file very N steps
batch_size = 50
num_refs = 100  # number of reference molecules randomly chosen for similarity
                # set this to zero for large datasets (>200 molecules)!
sample_batch_size = 100  # number of sampled molecules to compute sample loss


## Reinvent
input_model_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"
smiles_file = "/Users/devanshjain/smiles.smi"  # read 1st column
output_model_file = "{project_dir}/transfer/TL_reinvent.model"
validation_smiles_file = "/Users/devanshjain/smiles.smi"

# Define the type of similarity and its parameters
#pairs.type = "tanimoto"
#pairs.upper_threshold = 1.0
#pairs.lower_threshold = 0.7
#pairs.min_cardinality = 1
#pairs.max_cardinality = 199

"""

In [11]:
# Destination of the transfer-learning configuration handed to `reinvent`.
transfer_path = os.path.join(project_dir, "transfer_config.toml")

# Parse the template first: malformed TOML raises here, before any file
# is opened for writing.
transfer_dict = toml.loads(transfer_toml)

# Serialise the parsed configuration and store it at transfer_path.
with open(transfer_path, 'w') as toml_file:
    toml_file.write(toml.dumps(transfer_dict))

In [12]:
# Run the `reinvent` command-line tool on the transfer-learning config written
# above. IPython substitutes the {…} placeholders with the Python variables'
# values; the value passed after -l is the REINVENT log file path.
# NOTE(review): this uses the same log file path as the other reinvent runs in
# this notebook — confirm whether the log is appended to or overwritten.
!reinvent -l {log_file_reinvent} {transfer_path}

|[32m                                                                         [0m|00:00[0m
Epoch 3: |[32m################################################################[0m|00:01[0m
0it [00:01, ?it/s]


In [14]:
!tensorboard --logdir=tb_TL

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.19.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [15]:
# Staged (curriculum) reinforcement learning, scored by DockStream docking

In [16]:
# REINVENT4 "staged_learning" (reinforcement / curriculum learning) run
# configuration, kept as TOML text. The DockStream settings and the project
# paths are interpolated from variables defined earlier in the notebook
# (docking_path, docker, dockstream_env, project_dir).
#
# Fix: the reverse_sigmoid bounds were given as high = -7.5, low = -5.0.
# REINVENT's sigmoid transforms scale the score by 1 / (high - low), so a
# `high` below `low` flips the curve and rewards *less* negative (worse)
# docking scores. The bounds are now ordered low < high, so more negative
# docking scores map to a higher reward.
staged_toml = f"""
# REINVENT4 TOML input example for reinforcement/curriculum learning

run_type = "staged_learning"
device = "cpu"
tb_logdir = "tb_RL"  # Relative path to the TensorBoard logs directory  # Edit this path as needed
json_out_config = "_staged_learning.json"  # write this TOML to JSON

[parameters]

use_checkpoint = true  # if true read diversity filter from agent_file
purge_memories = false  # if true purge all diversity filter memories after each stage

## Reinvent
prior_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"
agent_file = "{project_dir}/transfer/TL_reinvent.model"

batch_size = 128          # network

unique_sequences = true  # if true remove all duplicates raw sequences in each step
                         # only here for backward compatibility
randomize_smiles = true  # if true shuffle atoms in SMILES randomly


[learning_strategy]

type = "dap"      # dap: only one supported
sigma = 128       # sigma of the RL reward function
rate = 0.0001     # for torch.optim


[diversity_filter]  # optional, comment section out or remove if unneeded
                    # NOTE: also memorizes all seen SMILES

type = "ScaffoldSimilarity"      # IdenticalTopologicalScaffold, (SET FOR MAXIMUM NOVELTY - 7xn1 + TACRINE)
                                 # ScaffoldSimilarity, PenalizeSameSmiles
bucket_size = 50                 # memory size in number of compounds
minscore = 0.5                   # only memorize if this threshold is exceeded
minsimilarity = 0.2              # minimum similarity for ScaffoldSimilarity
penalty_multiplier = 0.7         # penalty factor for PenalizeSameSmiles


#smiles_file = "sampled.smi"  # "good" SMILES for guidance
#memory_size = 100  # number of total SMILES held in memory
#sample_size = 10  # number of SMILES randomly chosen each epoch


### Stage 1
### Note that stages must always be a list i.e. double brackets
[[stage]]

chkpt_file = '{project_dir}/transfer/rl_run.chkpt'  # Edit this checkpoint file path
termination = "simple"  # termination criterion fot this stage
max_score = 0.6  # terminate if this total score is exceeded
min_steps = 25  # run for at least this number of steps
max_steps = 1000  # terminate entire run when exceeded

[stage.scoring]
type = "geometric_mean"  # aggregation function

[[stage.scoring.component]]
[[stage.scoring.component.DockStream.endpoint]]
name = "Docking"
weight = 1

params.configuration_path = "{docking_path}"
params.docker_script_path = "{docker}"
params.docker_python_path =  "{dockstream_env}/bin/python"
transform.type = "reverse_sigmoid"
transform.high = -5.0  # docking scores above this get a reward near 0
transform.low = -7.5   # docking scores below this get a reward near 1
transform.k = 0.25

"""

In [17]:
# Destination of the staged-learning configuration handed to `reinvent`.
staged_path = os.path.join(project_dir, "staged_config.toml")

# Parse the template first: malformed TOML raises here, before any file
# is opened for writing.
staged_dict = toml.loads(staged_toml)

# Serialise the parsed configuration and store it at staged_path.
with open(staged_path, 'w') as toml_file:
    toml_file.write(toml.dumps(staged_dict))

In [18]:
# Run the `reinvent` command-line tool on the staged-learning config written
# above. IPython substitutes the {…} placeholders with the Python variables'
# values; the value passed after -l is the REINVENT log file path.
# NOTE(review): the saved output below shows this run being interrupted (^C),
# so no completed staged-learning result is recorded in this notebook.
!reinvent -l {log_file_reinvent} {staged_path}

^C
Traceback (most recent call last):
  File [35m"/Users/devanshjain/miniconda3/envs/reinvent4/bin/reinvent"[0m, line [35m8[0m, in [35m<module>[0m
    sys.exit([31mmain_script[0m[1;31m()[0m)
             [31m~~~~~~~~~~~[0m[1;31m^^[0m
  File [35m"/Users/devanshjain/miniconda3/envs/reinvent4/lib/python3.13/site-packages/reinvent/Reinvent.py"[0m, line [35m195[0m, in [35mmain_script[0m
    [31mmain[0m[1;31m(args)[0m
    [31m~~~~[0m[1;31m^^^^^^[0m
  File [35m"/Users/devanshjain/miniconda3/envs/reinvent4/lib/python3.13/site-packages/reinvent/Reinvent.py"[0m, line [35m164[0m, in [35mmain[0m
    [31mrunner[0m[1;31m([0m
    [31m~~~~~~[0m[1;31m^[0m
        [1;31minput_config=extract_sections(input_config),[0m
        [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
    ...<3 lines>...
        [1;31mwrite_config=write_config,[0m
        [1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^[0m
    [1;31m)[0m
    [1;31m^[0m
  File [35m"/Users/devanshjain/minic