In [3]:
import numpy as np
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import Draw
import shutil
import tqdm
import pybel
import tarfile

In [4]:
### CREATE MAPPING

In [11]:
# Give every REINVENT run directory a 1-based integer ID (alphabetical order),
# leaving out the folders that must not be processed, and store the
# "<run name>\t<ID>" table as a TSV file.
path = "/slgpfs/projects/irb35/agimeno/MurD/generative_models/REINVENT"
skipped = ("TESTS", "REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-4.5_PV-3_Inh-1.5")

run_names = [name for name in sorted(os.listdir(path)) if name not in skipped]
mapping = dict(zip(run_names, range(1, len(run_names) + 1)))

with open("/aloy/home/acomajuncosa/MurD/rest_docking_reinvent/mapping.tsv", "w") as f:
    f.writelines("{}\t{}\n".format(name, idx) for name, idx in mapping.items())

In [12]:
### CREATE FILE WITH ALL SMILES

In [26]:
def read_smiles(file):
    """Return the sorted first column of a headerless, tab-separated file.

    The file is parsed with pandas using two column names ('smiles' and '?');
    only the 'smiles' column is kept.

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    list
        Values of the 'smiles' column in ascending order.
    """
    table = pd.read_csv(file, sep='\t', header=None, names=['smiles', '?'])
    smiles_list = list(table['smiles'])
    smiles_list.sort()
    return smiles_list

In [55]:
# Gather the sampled SMILES of every mapped REINVENT run into one table whose
# rows are [smiles, "<run ID>-<position within that run's sorted SMILES>"],
# then write the table as a TSV file.
path = "/slgpfs/projects/irb35/agimeno/MurD/generative_models/REINVENT"
all_smiles = []

for file in mapping:
    print(file, mapping[file])

    try:
        # os.listdir() returns entries in arbitrary order; sort so that the
        # chosen subdirectory is reproducible if there is ever more than one.
        y = sorted(os.listdir(os.path.join(path, file, "1_target")))[0]
        path_to_smiles = os.path.join(path, file, "1_target", y, "sampling", "agent", "output", "sampled", "sampled.smi")
        smiles = read_smiles(path_to_smiles)
    except Exception as err:
        # Best effort: skip runs without readable output, but report the cause
        # (a bare `except:` would also swallow KeyboardInterrupt).
        print("Could not read smiles... ==> " + file + " (" + repr(err) + ")")
        continue

    all_smiles.extend([j, str(mapping[file]) + "-" + str(i)] for i, j in enumerate(smiles))

all_smiles = np.array(all_smiles)
write_all_smiles = ["\t".join(i) for i in all_smiles]

# Print smiles
with open("/aloy/home/acomajuncosa/MurD/rest_docking_reinvent/all_smiles.tsv", "w") as f:
    f.write("\n".join(write_all_smiles))

REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-0_PV-0_Inh-6_MW-1_SlogP-1_QED-1 1
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-0_PV-6_Inh-0_MW-1_SlogP-1_QED-1 2
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-1_PV-2_Inh-3_MW-1_SlogP-1_QED-1 3
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-1_PV-3_Inh-2_MW-1_SlogP-1_QED-1 4
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-2_PV-1_Inh-3_MW-1_SlogP-1_QED-1 5
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-2_PV-2_Inh-2_MW-1_SlogP-1_QED-1 6
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-2_PV-3_Inh-1_MW-1_SlogP-1_QED-1 7
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-3_PV-1_Inh-2_MW-1_SlogP-1_QED-1 8
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-3_PV-2_Inh-1_MW-1_SlogP-1_QED-1 9
REINVENT_MurD_IdMrkSc-10-0.4-0.2_DS-6_PV-0_Inh-0_MW-1_SlogP-1_QED-1 10
REINVENT_MurD_ScSim-10-0.4-0.4_DS-0_PV-0_Inh-6_MW-1_SlogP-1_QED-1 11
REINVENT_MurD_ScSim-10-0.4-0.4_DS-0_PV-6_Inh-0_MW-1_SlogP-1_QED-1 12
REINVENT_MurD_ScSim-10-0.4-0.4_DS-1_PV-2_Inh-3_MW-1_SlogP-1_QED-1 13
REINVENT_MurD_ScSim-10-0.4-0.4_DS-1_PV-3_Inh-2_MW-1_SlogP-1_QED-1 14
REINVENT_MurD_ScSim-10-

In [82]:
# Drop repeated SMILES, keeping the first [smiles, id] row seen for each one.
unique_smiles = set()
all_smiles_unique = []

for s in all_smiles:
    if s[0] not in unique_smiles:
        unique_smiles.add(s[0])
        all_smiles_unique.append(s)

all_smiles_unique = np.array(all_smiles_unique)
write_all_smiles_unique = ["\t".join(i) for i in all_smiles_unique]

# Print smiles
# Fix: this file used to be filled from write_all_smiles (the complete,
# non-deduplicated list), which made it a copy of all_smiles.tsv.
with open("/aloy/home/acomajuncosa/MurD/rest_docking_reinvent/all_smiles_unique.tsv", "w") as f:
    f.write("\n".join(write_all_smiles_unique))

In [83]:
# Sanity check: rows collected, rows kept after de-duplication, distinct SMILES strings.
len(all_smiles), len(all_smiles_unique), len(unique_smiles)

(190000, 179030, 179030)

In [86]:
# Partition the de-duplicated table into 2000 chunks of near-equal size.
smiles = np.array_split(all_smiles_unique, 2000)

In [110]:
### write files in alexandria1
# For each chunk, create an input directory ("smiles<N>") and an output
# directory ("out_smiles<N>"), then write the chunk as tab-separated lines
# into "smiles<N>/smiles<N>".
outpath = "/alexandria1/acomajuncosa/MurD/ligand_preparation_Aleix/splits"

# NOTE(review): only the first 10 of the chunks are written here -- confirm
# whether the [:10] slice is intentional or a leftover from a trial run.
# tqdm wraps the sequence (not the enumerate object) so it knows the total.
for c, smi in enumerate(tqdm.tqdm(smiles[:10])):

    in_dir = os.path.join(outpath, "smiles" + str(c))
    out_dir = os.path.join(outpath, "out_smiles" + str(c))

    # Create directories (exist_ok avoids the check-then-create race)
    os.makedirs(in_dir, exist_ok=True)
    os.makedirs(out_dir, exist_ok=True)

    # Write files
    with open(os.path.join(in_dir, "smiles" + str(c)), "w") as f:
        f.writelines("\t".join(s) + "\n" for s in smi)

10it [00:00, 32.21it/s]


In [24]:
### CHECK THAT MOST OF THEM ARE KEPT
# Collect, over splits 0..949, the raw input lines of "smiles<N>/smiles<N>"
# and the molecule titles found in "out_smiles<N>/out_smiles<N>.sdf".
path = '/alexandria1/acomajuncosa/MurD/ligand_preparation_Miquel/splits'
original_, final_ = [], []

for split in tqdm.tqdm(range(950)):
    in_name = "smiles" + str(split)
    out_name = "out_smiles" + str(split)

    # Context manager so the handle is closed on every iteration
    with open(os.path.join(path, in_name, in_name), "r") as f:
        original = f.readlines()
    final = [i.title for i in pybel.readfile("sdf", os.path.join(path, out_name, out_name + ".sdf"))]

    original_.extend(original)
    final_.extend(final)

100%|██████████| 950/950 [03:25<00:00,  4.62it/s]


In [25]:
# Distinct input lines, distinct output titles, and the second as a percentage of the first (1 decimal).
len(set(original_)), len(set(final_)), round(len(set(final_))/len(set(original_))*100, 1)

(33742, 30324, 89.9)

In [26]:
### COPY DATA TO SL

In [28]:
# Copy "<name>/<name>.sdf" of every entry whose name contains "out" from the
# preparation area to the docking structures area, creating the destination
# directory when it is missing.
path = '/alexandria1/acomajuncosa/MurD/ligand_preparation_Miquel/splits'
outpath = '/slgpfs/projects/irb35/acomajuncosa/MurD/rest_docking/Miquel/structures'

for entry in tqdm.tqdm(sorted(os.listdir(path))):
    if "out" not in entry:
        continue

    sdf_name = entry + ".sdf"
    target_dir = os.path.join(outpath, entry)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    shutil.copyfile(os.path.join(path, entry, sdf_name), os.path.join(target_dir, sdf_name))

100%|██████████| 1901/1901 [00:57<00:00, 32.82it/s]


In [168]:
### RUN DOCKINGS

In [32]:
# Entries of the structures directory, in sorted order.
path = '/slgpfs/projects/irb35/acomajuncosa/MurD/rest_docking/Miquel/structures'
elements = sorted(os.listdir(path))

In [33]:
# Number of entries found in the structures directory.
len(elements)

950

In [16]:
# Restrict the element list to the splits collected in to_repeat.
# NOTE(review): to_repeat is only assigned in the RESULTS cell further down
# the notebook, so this cell raises NameError on a fresh top-to-bottom run.
elements = to_repeat

In [18]:
# Submit the docking script as a multi-job through the in-house HPC wrapper,
# passing `elements` as the job elements.
import sys
sys.path.insert(0, '/aloy/home/acomajuncosa/programs/hpc') #CHANGE THIS PATH TO YOUR HPC PATH!
from hpc import HPC
from starlife_config import config as cluster_config

# Locations used by the submission
scratch_path = "/slgpfs/scratch/irb35/acomajuncosa/MurD/rest_docking/Miquel_rep"
script_path = "/slgpfs/projects/irb35/acomajuncosa/MurD/rest_docking/Miquel/gen_fps_rDock_center.py"
singularity_image = "/slgpfs/projects/irb35/acomajuncosa/rDock_image_2.simg"

ncpus = 4
cluster = HPC(**cluster_config)
njobs = len(elements)

# NOTE(review): "memory" receives the same number as "cpu"; the unit expected
# by the wrapper is not visible here -- confirm this is intended.
cluster_params = {
    'job_name': 'murd_miq',
    "jobdir": scratch_path,
    "memory": ncpus,
    'cpu': ncpus,
    "wait": False,
    "elements": elements,
    "num_jobs": njobs,
}

# <TASK_ID> and <FILE> are passed through literally in the command string
command = "singularity exec " + singularity_image + " python " + script_path + " <TASK_ID> <FILE>"

cluster.submitMultiJob(command, **cluster_params)

2022-06-28 10:51:25,901 chemicalchecker.util.config.config.Config [DEBUG   ] CC_CONFIG environment variable not set. Using default config file.
2022-06-28 10:51:25,901 chemicalchecker.util.config.config.Config [DEBUG   ] Loading config from: /home/acomajuncosa/programs/anaconda3/lib/python3.7/site-packages/chemicalchecker/util/config/cc_config.json
2022-06-28 10:51:26,976 hpc.HPC      [DEBUG   ] HPC system to use: slurm
2022-06-28 10:51:26,977 hpc.HPC      [DEBUG   ] initializing object slurm
2022-06-28 10:51:27,117 slurm.slurm  [DEBUG   ] Job nasdfdsfsdfsdfdme is: murd_miq
2022-06-28 10:51:27,131 slurm.slurm  [DEBUG   ] Num elements submitted 13
2022-06-28 10:51:27,131 slurm.slurm  [DEBUG   ] Num Job submitted 13
2022-06-28 10:51:27,138 slurm.slurm  [INFO    ] Writing file /slgpfs/scratch/irb35/acomajuncosa/MurD/rest_docking/Miquel_rep/job-murd_miq.sh...
2022-06-28 10:51:27,152 slurm.slurm  [DEBUG   ] HPC submission: sbatch --parsable /slgpfs/scratch/irb35/acomajuncosa/MurD/rest_docki

module 'matplotlib.font_manager' has no attribute '_rebuild'


2022-06-28 10:51:30,537 slurm.slurm  [DEBUG   ] load SINGULARITY/3


In [5]:
### LOOK FOR ERRORS/PROBLEMS
# Print the name of every ".out" file in the scratch directory that contains,
# on any line and in any letter case, one of the keywords below.

# SCRATCH
path = "/slgpfs/scratch/irb35/acomajuncosa/MurD/rest_docking/Miquel"
keywords = ("error", "warning", "segmentation")

for log_name in tqdm.tqdm(sorted(os.listdir(path))):
    if '.out' not in log_name:
        continue
    with open(os.path.join(path, log_name), "r") as handle:
        # any() stops reading at the first offending line
        flagged = any(
            any(word in line.lower() for word in keywords)
            for line in handle
        )
    if flagged:
        print(log_name)
                    


 11%|█         | 102/953 [00:00<00:01, 446.86it/s]

slurm-18106459_114.out
slurm-18106459_136.out


 61%|██████▏   | 584/953 [00:01<00:00, 439.72it/s]

slurm-18106459_557.out


 76%|███████▋  | 729/953 [00:01<00:00, 458.86it/s]

slurm-18106459_69.out


100%|██████████| 953/953 [00:02<00:00, 461.97it/s]

slurm-18106459_893.out
slurm-18106459_894.out
slurm-18106459_895.out





In [3]:
#RESULTS
# For every split directory, compare the molecule titles of the input SDF with
# the titles found in the docking output (results/results.sd):
#   original  - titles of input molecules (splits with a readable result only)
#   final     - titles present in the docking results
#   data      - split name -> set of result titles
#   to_repeat - splits whose results cover less than MIN_DOCKED_FRACTION of
#               their input molecules
path = "/slgpfs/projects/irb35/acomajuncosa/MurD/rest_docking/Miquel/structures"
MIN_DOCKED_FRACTION = 0.5

original, final = set(), set()
to_repeat = []
data = {}

for split in tqdm.tqdm(sorted(os.listdir(path))):

    try:

        mymolecules = {i.title for i in pybel.readfile("sdf", os.path.join(path, split, split + ".sdf"))}
        myfile = os.path.join(path, split, "rDock_results_" + split, "results.tar.gz")

        if not os.path.exists(myfile):
            print(split + " does not exist")
            continue

        # results.sd is read from <split>/results/ and is therefore expected to
        # have been extracted from the archive already. Extraction step kept
        # for reference:
        # if os.path.exists(os.path.join(path, split, "results", "results.sd")) is False:
        #     tar = tarfile.open(myfile, "r:gz")
        #     tar.extract("results/results.sd", os.path.join(path, split))
        myresults = {i.title for i in pybel.readfile("sdf", os.path.join(path, split, "results", "results.sd"))}
        data[split] = myresults

        original.update(mymolecules)
        final.update(myresults)

        if len(myresults) < len(mymolecules) * MIN_DOCKED_FRACTION:
            print(split)
            to_repeat.append(split)

    except Exception as err:
        # Keep going over the remaining splits, but report what went wrong
        # (a bare `except:` would also swallow KeyboardInterrupt).
        print(split + ": some error... (" + repr(err) + ")")

  7%|▋         | 69/950 [01:06<16:06,  1.10s/it]

out_smiles16


 12%|█▏        | 114/950 [01:55<10:30,  1.33it/s]

out_smiles20


 14%|█▍        | 136/950 [02:19<10:47,  1.26it/s]

out_smiles22


 15%|█▌        | 143/950 [02:25<10:06,  1.33it/s]

out_smiles226


 18%|█▊        | 168/950 [02:51<11:58,  1.09it/s]

out_smiles25


 19%|█▉        | 180/950 [03:00<06:06,  2.10it/s]

out_smiles26


 22%|██▏       | 213/950 [03:27<12:57,  1.06s/it]

out_smiles29


 39%|███▉      | 371/950 [06:24<06:51,  1.41it/s]

out_smiles432: some error...


 59%|█████▊    | 557/950 [09:49<04:49,  1.36it/s]

out_smiles6


 62%|██████▏   | 589/950 [10:24<05:30,  1.09it/s]

out_smiles629: some error...


 66%|██████▌   | 626/950 [11:01<03:56,  1.37it/s]

out_smiles661


 77%|███████▋  | 729/950 [12:40<02:26,  1.50it/s]

out_smiles754


 78%|███████▊  | 744/950 [12:49<02:04,  1.66it/s]

out_smiles769: some error...


 94%|█████████▍| 895/950 [15:18<00:24,  2.27it/s]

out_smiles901
out_smiles902: some error...
out_smiles903: some error...


100%|██████████| 950/950 [16:14<00:00,  1.03s/it]


In [4]:
# Distinct input molecule titles gathered in the RESULTS loop.
len(original)

30264

In [5]:
# Distinct molecule titles found in the docking results.
len(final)

28537

In [12]:
# Re-collect, for splits 0..949, the input titles (read again from the SDF)
# and the result titles (looked up in `data`). A split whose lookup in `data`
# fails still contributes its input titles to original_, because the extend
# happens before the lookup.
structures_path = "/slgpfs/projects/irb35/acomajuncosa/MurD/rest_docking/Miquel/structures"
original_ = []
final_ = []
for split in tqdm.tqdm(range(950)):
    split_name = "out_smiles" + str(split)
    try:
        # Loop-local names: reusing `original` / `final` here would overwrite
        # the sets of the same name built in the RESULTS cell.
        split_original = [i.title for i in pybel.readfile("sdf", os.path.join(structures_path, split_name, split_name + ".sdf"))]
        original_.extend(split_original)
        split_final = data[split_name]
        final_.extend(split_final)
    except Exception as err:
        # Report the split and the reason instead of silently swallowing everything
        print(split, repr(err))

 46%|████▌     | 434/950 [00:25<00:22, 23.45it/s]

432


 67%|██████▋   | 632/950 [00:37<00:17, 18.06it/s]

629


 81%|████████▏ | 773/950 [00:44<00:05, 32.36it/s]

769


 95%|█████████▌| 907/950 [00:52<00:02, 19.47it/s]

902
903


100%|██████████| 950/950 [00:54<00:00, 17.45it/s]


In [None]:
# Distinct result titles collected in final_.
len(set(final_))

In [None]:
# Distinct input titles collected in original_.
len(set(original_))