In [2]:
#DNA Gyrase Subunit B: 3TTZ
#Reference Ligand: 07N

In [3]:
import os
import json
import toml
# Root folder of the 3TTZ project; the input, output, log and config folders
# are created beneath it. The literal is already absolute, so expanduser()
# returns it unchanged.
project_dir = os.path.expanduser(
    "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz"
)

In [4]:
# Conda environment whose interpreter runs DockStream, and the DockStream
# checkout that holds its scripts. Both literals are absolute, so expanduser()
# leaves them as written.
dockstream_env = os.path.expanduser(
    "/Users/devanshjain/miniconda3/envs/DockStream"
)
dockstream_path = os.path.expanduser(
    "/Users/devanshjain/DockStream"
)

In [5]:
# Tag used as the prefix of the config, log and result file names built below.
output_prefix = "3ttz"

# Structure inputs. Despite the "_filename" suffix these are absolute paths, so
# joining them onto input_data_dir later returns them unchanged.
reference_ligand_filename = "/Users/devanshjain/3ttz_ligand_07N.pdb"  # reference ligand; change to your own file
apo_protein_filename = "/Users/devanshjain/3ttz_chainA_receptor.pdb"  # apo protein; change to your own file

In [6]:
# DockStream entry-point scripts: target preparation and docking.
target_preparator, docker = (
    os.path.join(dockstream_path, script)
    for script in ("target_preparator.py", "docker.py")
)

# Project sub-folders; docked ligands and scores live below output_data.
input_data_dir, output_data_dir, logs_dir, config_dir = (
    os.path.join(project_dir, folder)
    for folder in ("input_data", "output_data", "logs", "configs")
)
lig_docked_dir, scores_dir = (
    os.path.join(output_data_dir, folder)
    for folder in ("ligands_docked", "docking_scores")
)

# Make sure every folder exists (no error when it is already there).
for directory in (input_data_dir, output_data_dir, logs_dir, config_dir, lig_docked_dir, scores_dir):
    os.makedirs(directory, exist_ok=True)

# Structure inputs. As currently defined, both *_filename values are absolute
# paths, so os.path.join() discards input_data_dir and returns them unchanged.
apo_protein_path = os.path.join(input_data_dir, apo_protein_filename)
reference_ligand_path = os.path.join(input_data_dir, reference_ligand_filename)

# Target-preparation config and the receptor files it produces.
target_prep_path = os.path.join(config_dir, output_prefix + "_target_prep.json")
fixed_pdb_path = os.path.join(input_data_dir, output_prefix + "_fixed_target.pdb")
receptor_path = os.path.join(input_data_dir, output_prefix + "_receptor.pdbqt")

# One log file per tool.
log_file_target_prep = os.path.join(logs_dir, output_prefix + "_target_prep.log")
log_file_docking = os.path.join(logs_dir, output_prefix + "_docking.log")
log_file_reinvent = os.path.join(logs_dir, output_prefix + "_reinvent.log")

# Docking config and the result files.
docking_path = os.path.join(config_dir, output_prefix + "_docking.json")
ligands_docked_path = os.path.join(lig_docked_dir, output_prefix + "_ligands_docked.sdf")
ligands_scores_path = os.path.join(scores_dir, output_prefix + "_scores.csv")
# NOTE(review): no separator between the prefix and "pydantic.sdf" - confirm the name is intended.
ligands_conformer_path = os.path.join(lig_docked_dir, output_prefix + "pydantic.sdf")

In [11]:
#Transfer Learning: SMILES from BindingDB
#Focus a given model towards a set of input SMILES.

In [12]:
# REINVENT4 transfer-learning configuration as TOML text; a later cell parses
# it and writes it to disk. It is a plain string (not an f-string) because
# nothing is interpolated, so a literal brace added later cannot be mistaken
# for a placeholder.
transfer_toml = """

run_type = "transfer_learning"
device = "cpu"  # set torch device e.g. "cpu"
tb_logdir = "tb_3ttz"  # name of the TensorBoard logging directory
json_out_config = "json_transfer_learning.json"  # write this TOML to JSON


[parameters]

num_epochs = 6  # number of epochs to run
save_every_n_epochs = 3  # save a checkpoint model file every N epochs
batch_size = 50
num_refs = 10  # number of reference molecules randomly chosen for similarity
                # set this to zero for large datasets (>200 molecules)!
sample_batch_size = 100  # number of sampled molecules to compute sample loss


## Reinvent
input_model_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"
smiles_file = "/Users/devanshjain/3ttz_smiles.smi"  # read 1st column
output_model_file = "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/Transfer.model"
# NOTE: same file as smiles_file, so the validation loss is measured on the training data
validation_smiles_file = "/Users/devanshjain/3ttz_smiles.smi"


# Define the type of similarity and its parameters
pairs.type = "tanimoto"
pairs.upper_threshold = 1.0
pairs.lower_threshold = 0.6
pairs.min_cardinality = 1
pairs.max_cardinality = 100


"""

In [13]:
# File that REINVENT reads for the transfer-learning run.
transfer_path = os.path.join(project_dir, "transfer_config.toml")

# Parse before opening the file, so a TOML syntax error raises here and
# leaves any existing config untouched.
transfer_dict = toml.loads(transfer_toml)

# Serialise the parsed settings back to TOML on disk.
with open(transfer_path, 'w') as config_file:
    toml.dump(transfer_dict, config_file)

In [14]:
# Run REINVENT on the transfer-learning config; -l is given the REINVENT log file path.
!reinvent -l {log_file_reinvent} {transfer_path}

|[32m                                                                         [0m|00:00[0m
Epoch 6: |[32m################################################################[0m|00:07[0m
0it [00:07, ?it/s]


In [15]:
#Created Transfer Model

In [16]:
# Sampling SMILES randomly from two models: (1) the REINVENT prior and (2) the transfer-learned model

In [18]:
#1 Reinvent Model: Whole of ChEMBL space

In [25]:
# REINVENT4 sampling configuration for the unmodified prior, as TOML text.
# It is a plain string (not an f-string) because nothing is interpolated.
sampling_toml = """


run_type = "sampling"
device = "cpu"  # set torch device e.g. "cpu"
json_out_config = "sampling.json"  # write this TOML to JSON


[parameters]

## Reinvent: de novo sampling
model_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"

output_file = '/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/sampling_REINIVENT.csv'  # sampled SMILES and NLL in CSV format

num_smiles = 45  # number of SMILES to be sampled, 1 per input SMILES
unique_molecules = true  # if true, canonicalize SMILES and remove all duplicates
randomize_smiles = true  # if true shuffle atoms in SMILES randomly

"""

In [27]:
!reinvent -l {log_file_reinvent} {sampling_path}

In [26]:
sampling_path = os.path.join(project_dir, "3ttz_sampling_config.toml")

# Parse the TOML string
sampling_dict = toml.loads(sampling_toml)

# Write the TOML content to a file
with open(sampling_path, 'w') as f:
    toml.dump(sampling_dict, f)

In [21]:
#2 Transfer Model

In [28]:
# REINVENT4 sampling configuration for the transfer-learned model
# (Transfer.model), as TOML text. This rebinds sampling_toml, replacing the
# prior-sampling text. It is a plain string because nothing is interpolated.
sampling_toml = """


run_type = "sampling"
device = "cpu"  # set torch device e.g. "cpu"
json_out_config = "sampling.json"  # write this TOML to JSON


[parameters]

## Reinvent: de novo sampling
model_file = "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/Transfer.model"

output_file = '/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/sampling_TRANSFER.csv'  # sampled SMILES and NLL in CSV format

num_smiles = 45  # number of SMILES to be sampled, 1 per input SMILES
unique_molecules = true  # if true, canonicalize SMILES and remove all duplicates
randomize_smiles = true  # if true shuffle atoms in SMILES randomly

"""

In [29]:
# Config file for sampling from the transfer-learned model; rebinds sampling_path.
sampling_path = os.path.join(project_dir, "3ttz_sampling_config_TL.toml")

# Parse before opening the file, so a TOML syntax error raises here and
# leaves any existing config untouched.
sampling_dict = toml.loads(sampling_toml)

# Serialise the parsed settings back to TOML on disk.
with open(sampling_path, 'w') as config_file:
    toml.dump(sampling_dict, config_file)

In [30]:
# Run REINVENT on the sampling config currently held in sampling_path; -l is given the REINVENT log file path.
!reinvent -l {log_file_reinvent} {sampling_path}

In [7]:
#TARGET Preparation

In [6]:
# DockStream target-preparation config: a PDBFixer clean-up of the apo protein
# followed by one AutoDock Vina run that writes the receptor file.
tp_dict = {
    "target_preparation": {
        # General settings.
        "header": {
            "environment": {},
            # Logging settings (e.g. which file to write to).
            "logging": {"logfile": log_file_target_prep},
        },
        # This should be an absolute path.
        "input_path": apo_protein_path,
        # Based on "PDBFixer"; tries to fix common problems with PDB files.
        "fixer": {
            "enabled": True,
            "standardize": True,               # enables standardization of residues
            "remove_heterogens": True,         # remove hetero-entries
            "fix_missing_heavy_atoms": True,   # if possible, fix missing heavy atoms
            "fix_missing_hydrogens": True,     # add hydrogens, which are usually not present in PDB files
            "fix_missing_loops": False,        # add missing loops; CAUTION: the result is usually not sufficient
            "add_water_box": False,            # if you want to put the receptor into a box of water molecules
            "fixed_pdb_path": fixed_pdb_path,  # if specified and not "None", the fixed PDB file will be stored here
        },
        # "runs" holds a list of backend runs; at least one is required.
        "runs": [
            {
                # One of the supported backends ("AutoDockVina", "OpenEye", ...).
                "backend": "AutoDockVina",
                # The generated receptor file will be saved to this location.
                "output": {"receptor_path": receptor_path},
                "parameters": {
                    # Extract the coordinates of the pocket from the reference ligand.
                    "extract_box": {
                        "reference_ligand_path": reference_ligand_path,
                        "reference_ligand_format": "pdb",
                    }
                },
                # Cavity specified through a reference ligand.
                # NOTE(review): repeats the ligand already given under "extract_box";
                # confirm that the AutoDockVina backend reads this section.
                "cavity": {
                    "method": "reference_ligand",
                    "reference_ligand_path": reference_ligand_path,
                    "reference_ligand_format": "pdb",
                },
            }
        ],
    }
}

# Write the config as JSON for target_preparator.py.
with open(target_prep_path, 'w') as config_file:
    json.dump(tp_dict, config_file, indent=2)

In [7]:
# Run DockStream's target_preparator.py with the DockStream environment's interpreter on the target-preparation config.
!{dockstream_env}/bin/python {target_preparator} -conf {target_prep_path}



In [9]:
# Show the first 25 lines of the receptor file as a quick check that target preparation produced it.
!head -n 25 {receptor_path}

REMARK  Name = /var/folders/jv/219p6n9s5ll8vv2nm1whxw340000gn/T/tmpdxsytoo5.pdb
REMARK                            x       y       z     vdW  Elec       q    Type
REMARK                         _______ _______ _______ _____ _____    ______ ____
ATOM      1  N   TYR A   1      -4.310  -0.078  -2.180  0.00  0.00    -0.417 NA
ATOM      2  CA  TYR A   1      -4.916   1.182  -1.660  0.00  0.00    -0.012 C 
ATOM      3  C   TYR A   1      -6.330   0.931  -1.140  0.00  0.00    +0.186 C 
ATOM      4  O   TYR A   1      -7.318   1.272  -1.796  0.00  0.00    -0.278 OA
ATOM      5  CB  TYR A   1      -4.941   2.256  -2.753  0.00  0.00    -0.053 C 
ATOM      6  CG  TYR A   1      -3.579   2.783  -3.140  0.00  0.00    -0.082 C 
ATOM      7  CD1 TYR A   1      -2.827   2.165  -4.139  0.00  0.00    -0.120 C 
ATOM      8  CD2 TYR A   1      -3.045   3.908  -2.518  0.00  0.00    -0.120 C 
ATOM      9  CE1 TYR A   1      -1.576   2.651  -4.501  0.00  0.00    -0.094 C 
ATOM     10  CE2 TYR A   1      -1.7

In [7]:
# Docking with the available SMILES: the ligand file to dock and the folder
# that holds the AutoDock Vina binaries. Both literals are absolute, so
# expanduser() leaves them as written.
smiles_path = os.path.expanduser(
    "/Users/devanshjain/3ttz_smiles.smi"
)
vina_binary_location = os.path.expanduser(
    "/Users/devanshjain/Downloads/autodock_vina_1_1_2_mac_catalina_64bit/bin"
)

In [12]:
# DockStream docking config: an RDKit pool embeds the SMILES read from
# smiles_path, and one AutoDock Vina run docks them into the prepared receptor.
ed_dict = {
    "docking": {
        "header": {
            "environment": {},
            "logging": {"logfile": log_file_docking},
        },
        "ligand_preparation": {
            "embedding_pools": [
                {
                    "pool_id": "RDkit",
                    "type": "RDkit",
                    "parameters": {
                        "protonate": True,
                        "remove_hs": False,
                        "coordinate_generation": {
                            "method": "UFF",
                            "maximum_iterations": 600,
                        },
                    },
                    "input": {
                        "input_path": smiles_path,
                        "type": "smi",
                        "standardize_smiles": False,
                    },
                    "use_taut_enum": {
                        "prefix_execution": "module load taut_enum",
                        "enumerate_protonation_states": True,
                    },
                    # The conformers can be written to a file, but "output" is not
                    # required as the ligands are forwarded internally.
                    "output": {
                        "conformer_path": ligands_conformer_path,
                        "format": "sdf",
                    },
                }
            ]
        },
        "docking_runs": [
            {
                "backend": "AutoDockVina",
                "run_id": "AutoDockVina",
                "input_pools": ["RDkit"],
                "parameters": {
                    "binary_location": vina_binary_location,
                    "parallelization": {"number_cores": 4},
                    "seed": 42,
                    "receptor_pdbqt_path": [receptor_path],
                    "number_poses": 1,
                    # Search box: centre coordinates and a size of 20 along each axis.
                    "search_space": {
                        "--center_x": 0.467143,
                        "--center_y": 3.288929,
                        "--center_z": 24.798214,
                        "--size_x": 20,
                        "--size_y": 20,
                        "--size_z": 20,
                    },
                },
                "output": {
                    "poses": {"poses_path": ligands_docked_path, "format": "sdf"},
                    "scores": {"scores_path": ligands_scores_path, "format": "csv"},
                },
            }
        ],
    }
}

# Write the config as JSON for docker.py.
with open(docking_path, 'w') as config_file:
    json.dump(ed_dict, config_file, indent=2)

# print out path to generated JSON
print(docking_path)

/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/configs/3ttz_docking.json


In [12]:
# Run DockStream's docker.py on the docking config; -print_scores presumably prints the docking scores (output below is truncated).
!{dockstream_env}/bin/python {docker} -conf {docking_path} -print_scores

* 'underscore_attrs_are_private' has been removed
* 'allow_population_by_field_name' has been renamed to 'validate_by_name'
* 'underscore_attrs_are_private' has been removed
[10:17:51] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have explicit Hs. Consider calling AddHs()
[10:17:52] Molecule does not have 

In [13]:
# Interpretation: docking scores of about -8 to -9 are considered exceptional

In [16]:
# Check whether the transfer model generates better-docking molecules:
# point the docking input at 3ttz_smiles_rand_trans.smi (presumably SMILES
# sampled from the transfer model - confirm how this file was produced).
smiles_path='/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/3ttz_smiles_rand_trans.smi'

In [17]:
# DockStream docking config for the SMILES file currently held in smiles_path.
# It has the same layout as the earlier docking config, but the UFF coordinate
# generation is capped at 300 iterations. Writing it overwrites the JSON at
# docking_path.
ed_dict = {
    "docking": {
        "header": {
            "environment": {},
            "logging": {"logfile": log_file_docking},
        },
        "ligand_preparation": {
            "embedding_pools": [
                {
                    "pool_id": "RDkit",
                    "type": "RDkit",
                    "parameters": {
                        "protonate": True,
                        "remove_hs": False,
                        "coordinate_generation": {
                            "method": "UFF",
                            "maximum_iterations": 300,
                        },
                    },
                    "input": {
                        "input_path": smiles_path,
                        "type": "smi",
                        "standardize_smiles": False,
                    },
                    "use_taut_enum": {
                        "prefix_execution": "module load taut_enum",
                        "enumerate_protonation_states": True,
                    },
                    # The conformers can be written to a file, but "output" is not
                    # required as the ligands are forwarded internally.
                    "output": {
                        "conformer_path": ligands_conformer_path,
                        "format": "sdf",
                    },
                }
            ]
        },
        "docking_runs": [
            {
                "backend": "AutoDockVina",
                "run_id": "AutoDockVina",
                "input_pools": ["RDkit"],
                "parameters": {
                    "binary_location": vina_binary_location,
                    "parallelization": {"number_cores": 4},
                    "seed": 42,
                    "receptor_pdbqt_path": [receptor_path],
                    "number_poses": 1,
                    # Search box: centre coordinates and a size of 20 along each axis.
                    "search_space": {
                        "--center_x": 0.467143,
                        "--center_y": 3.288929,
                        "--center_z": 24.798214,
                        "--size_x": 20,
                        "--size_y": 20,
                        "--size_z": 20,
                    },
                },
                "output": {
                    "poses": {"poses_path": ligands_docked_path, "format": "sdf"},
                    "scores": {"scores_path": ligands_scores_path, "format": "csv"},
                },
            }
        ],
    }
}

# Write the config as JSON for docker.py.
with open(docking_path, 'w') as config_file:
    json.dump(ed_dict, config_file, indent=2)

# print out path to generated JSON
print(docking_path)

/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/configs/3ttz_docking.json


In [18]:
# Re-run DockStream's docker.py on the rewritten docking config; -print_scores presumably prints the docking scores (output below is truncated).
!{dockstream_env}/bin/python {docker} -conf {docking_path} -print_scores

* 'underscore_attrs_are_private' has been removed
* 'allow_population_by_field_name' has been renamed to 'validate_by_name'
* 'underscore_attrs_are_private' has been removed
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have explicit Hs. Consider calling AddHs()
[11:00:28] Molecule does not have 

In [9]:
ed_dict = {
  "docking": {
    "header": {
      "environment": {},
      "logging": {"logfile": log_file_docking}
    },
    "ligand_preparation": {
      "embedding_pools": [
        {
          "pool_id": "RDkit",
          "type": "RDkit",
          "parameters": {
            "protonate": True,
            "AddHs": True,
            "remove_hs": False,
            "coordinate_generation": {
              "method": "UFF",
              "maximum_iterations": 300
            }
          },
          "input": {
            "input_path": smiles_path,
            "type": "smi",
            "standardize_smiles": False
          },
          "use_taut_enum": {
            "prefix_execution": "module load taut_enum",
            "enumerate_protonation_states": True
          },
          "output": {                                   # the conformers can be written to a file, but "output" is
                                                        # not required as the ligands are forwarded internally
            "conformer_path": ligands_conformer_path, 
            "format": "sdf"
          }
        }
      ]
    },
    "docking_runs": [
      {
        "backend": "AutoDockVina",
        "run_id": "AutoDockVina",
        "input_pools": ["RDkit"],
        "parameters": {
          "binary_location": vina_binary_location,
          "parallelization": {"number_cores": 4},
          "seed": 42,
          "receptor_pdbqt_path": [receptor_path],
          "number_poses": 1,
          "search_space": {
              "--center_x": 0.467143,   
              "--center_y": 3.288929,
              "--center_z": 24.798214,
              "--size_x": 20,
              "--size_y": 20,
              "--size_z": 20
            }
        },
        "output": {
          "poses": {
            "poses_path": ligands_docked_path,
            "format": "sdf"
          },
          "scores": {
            "scores_path": ligands_scores_path,
            "format": "csv"
          }
        }
      }
    ]
  }
}

with open(docking_path, 'w') as f:
    json.dump(ed_dict, f, indent=2)

# print out path to generated JSON
print(docking_path)

/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/configs/3ttz_docking.json


In [8]:
#checking transfer model if generates better 
smiles_path='/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/decoy.smi'

In [10]:
# Run DockStream's docker.py on the docking config last written to docking_path; -print_scores presumably prints the docking scores (output below is truncated).
!{dockstream_env}/bin/python {docker} -conf {docking_path} -print_scores

* 'underscore_attrs_are_private' has been removed
* 'allow_population_by_field_name' has been renamed to 'validate_by_name'
* 'underscore_attrs_are_private' has been removed
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have explicit Hs. Consider calling AddHs()
[09:08:40] Molecule does not have 

In [11]:
#Staged Learning - Curriculum - Reinforcement

In [19]:
# REINVENT4 TOML input example for reinforcement/curriculum learning
#
#
# Curriculum learning in REINVENT4 is a multi-stage reinforcement learning
# run.  One or more stages (auto CL) can be defined.  But it is also
# possible to continue a run from any checkpoint file that is generated
# during the run (manual CL).  Currently checkpoints are written at the end
# of a run also when the run is forcefully terminated with Ctrl-C.
#
# The config is a plain string (not an f-string) because nothing is
# interpolated. It defines a single stage scored by DockStream docking, with
# the transfer-learned model as the agent.
staged_toml = """

run_type = "staged_learning"
device = "cpu"  # set torch device e.g. "cpu"
tb_logdir = "tb_logs"  # name of the TensorBoard logging directory
json_out_config = "_staged_learning.json"  # write this TOML to JSON

[parameters]

summary_csv_prefix = "staged_learning"  # prefix for the CSV file
use_checkpoint = false  # if true read diversity filter from agent_file
purge_memories = false  # if true purge all diversity filter memories after each stage

## Reinvent
prior_file = "/Users/devanshjain/REINVENT4/priors/reinvent.prior"
agent_file = "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/Transfer.model"

batch_size = 64          # network

unique_sequences = true  # if true remove all duplicates raw sequences in each step
                         # only here for backward compatibility
randomize_smiles = true  # if true shuffle atoms in SMILES randomly

tb_isim = false  # track iSIM similarity in TensorBoard


[learning_strategy]

type = "dap"      # dap: only one supported
sigma = 128       # sigma of the RL reward function
rate = 0.0001     # for torch.optim


[diversity_filter]  # optional, comment section out or remove if unneeded
                    # NOTE: also memorizes all seen SMILES

type = "IdenticalMurckoScaffold" # IdenticalTopologicalScaffold,
                                 # ScaffoldSimilarity, PenalizeSameSmiles
bucket_size = 25                 # memory size in number of compounds
minscore = 0.4                   # only memorize if this threshold is exceeded
minsimilarity = 0.4              # minimum similarity for ScaffoldSimilarity
penalty_multiplier = 0.5         # penalty factor for PenalizeSameSmiles


# Reinvent only: guide RL in the initial phase
#[inception]  # optional, comment section out or remove if unneeded

#smiles_file = "sampled.smi"  # "good" SMILES for guidance
#memory_size = 100  # number of total SMILES held in memory
#sample_size = 10  # number of SMILES randomly chosen each epoch


### Stage 1
### Note that stages must always be a list i.e. double brackets
[[stage]]

chkpt_file = 'test1.chkpt'  # name of the checkpoint file, can be reused as agent

termination = "simple"  # termination criterion for this stage
max_score = 0.6  # terminate if this total score is exceeded
min_steps = 2  # run for at least this number of steps #25
max_steps = 3  # terminate entire run when exceeded #100

# Optionally, a DF can be set for each stage but note that the global DF
# section above will always overwrite the stage section and you need to
# delete [diversity_filter] to avoid this
#
#[stage.diversity_filter]
#type = "IdenticalMurckoScaffold"
# etc.

[stage.scoring]
type = "geometric_mean"  # aggregation function

[[stage.scoring.component]]
[stage.scoring.component.DockStream]
[[stage.scoring.component.DockStream.endpoint]]
name = "Docking with Dockstream"
weight = 1
params.configuration_path = "/Users/devanshjain/laboratoire_d_intelligence_artificielle_en_chimie/3ttz/configs/3ttz_docking.json"
params.docker_script_path = "/Users/devanshjain/DockStream/docker.py"
params.docker_python_path =   "/Users/devanshjain/miniconda3/envs/DockStream/bin/python"
transform.type = "reverse_sigmoid"
transform.high = -6
transform.low = -13.5
transform.k = 0.2

"""

In [21]:
!reinvent -l {log_file_reinvent} {staged_path}

In [20]:
staged_path = os.path.join(project_dir, "staged_config.toml")

# Parse the TOML string
staged_dict = toml.loads(staged_toml)

# Write the TOML content to a file
with open(staged_path, 'w') as f:
    toml.dump(staged_dict, f)