# 1. Install dependancies.

In [32]:
!pip install pandas pandarallel tqdm

Collecting jupyterlab-widgets<3,>=1.0.0 (from ipywidgets)
  Using cached jupyterlab_widgets-1.1.8-py3-none-any.whl.metadata (3.7 kB)
Using cached jupyterlab_widgets-1.1.8-py3-none-any.whl (237 kB)
Installing collected packages: jupyterlab-widgets
Successfully installed jupyterlab-widgets-1.1.8


In [2]:
!jupyter labextension list

JupyterLab v4.2.3
/home/lab-03/miniconda3/envs/atcgpu_Fey/share/jupyter/labextensions
        jupyterlab_pygments v0.3.0 [32menabled[0m [32mOK[0m (python, jupyterlab_pygments)
        @jupyter-widgets/jupyterlab-manager v5.0.11 [32menabled[0m [32mOK[0m (python, jupyterlab_widgets)
        @jupyter-notebook/lab-extension v7.2.1 [32menabled[0m [32mOK[0m



# 2. Import and working directory

In [13]:
import pandas as pd
import tqdm.notebook as tqdm
from tqdm import tqdm as tq
import glob
import pathlib
import os
import re
import numpy as np
import subprocess
tq.pandas()

from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=2) #6 process, each will take 8 cores, which leave me with 48 other cores (96 cores in total)
                                                        #2 process, Lab computer have 10 core
#Regex for best mode energy
regex_best_energy = re.compile(r"^1 +(-?[0-9]+\.[0-9]+) +0\.000 +0.000 +")
print("\n")
current_directory = os.getcwd()
os.chdir(current_directory)
print(current_directory)


INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


/home/lab-03/Desktop/GPUDock/SupperQuick_Smina (copy)


Go to your working directory

# 3. Searching for pdbqt files

This will search all `pdbqt` files in `ligands` folder and put them in a pandas dataframe (used for quick parallelisation)

In [14]:
ligand_list = glob.glob("ligands/*.pdbqt")
df = pd.DataFrame(ligand_list, columns=["LigandFile"])
display(df)

Unnamed: 0,LigandFile
0,ligands/118.pdbqt
1,ligands/692.pdbqt
2,ligands/494.pdbqt
3,ligands/518.pdbqt
4,ligands/42.pdbqt
...,...
846,ligands/718.pdbqt
847,ligands/347.pdbqt
848,ligands/265.pdbqt
849,ligands/20.pdbqt


# Run processess

In [15]:
def run_smina(row):
    file=row.iloc[0]
    basename = pathlib.Path(file).stem
    outfolder = f"all_poses/{basename}"
    if not os.path.exists(outfolder):
        os.makedirs(outfolder)

    outputposes = f"{outfolder}/{basename}.pdbqt"
    outputlog = f"{outfolder}/{basename}.log"

    #default success = False. Only change it when it worked.
    success = False
    best_energy = np.nan
    #Check if the logfile already exist
    #if os.path.exists(outputlog):
    if False:
        with open(outputlog, "r") as log:
            stdout = log.readlines()
        success = True
    else:
        results = subprocess.run([
            "./smina.static", 
            "--config","ligand.conf",
            "--ligand", file, 
            "--out", outputposes, 
            "--log",outputlog,
            "--cpu","8", 
            "--scoring","vinardo"],
            capture_output=False)
        if results.returncode == 0:
            success = True
        if success == True:
            stdout = results.stdout.decode("utf-8").split("\n")
        

    if success == True:
        #Get the best energy
        for line in stdout:
            match = regex_best_energy.findall(line)
            if match:
                best_energy=float(match[0])
                break
            else:
                best_energy = np.nan


    return pd.Series(
            {
            "name":basename,
            "filename":file,
            "success":success,
            "BestEnergy":best_energy
            }
        )
results = df.parallel_apply(lambda x: run_smina(x), axis=1)

results.to_csv("results.csv",sep=";")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=426), Label(value='0 / 426'))), HB…

Process ForkPoolWorker-48:
Process ForkPoolWorker-47:
Traceback (most recent call last):


KeyboardInterrupt: 

  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/site-packages/pandarallel/core.py", line 95, in __call__
    result = self.work_function(
             ^^^^^^^^^^^^^^^^^^^
  File "/home/lab-03/miniconda3/envs/atcgpu_Fey/lib/python3.12/multiprocessing/proce

In [15]:
results.query("BestEnergy < ")

Unnamed: 0,name,filename,success,BestEnergy
0,118,ligands/118.pdbqt,True,-6.1
1,494,ligands/494.pdbqt,True,-6.0
2,518,ligands/518.pdbqt,True,-7.4
3,42,ligands/42.pdbqt,True,-6.7
4,229,ligands/229.pdbqt,True,-5.2
...,...,...,...,...
666,576,ligands/576.pdbqt,True,-4.9
667,347,ligands/347.pdbqt,True,-4.6
668,265,ligands/265.pdbqt,True,-5.8
669,20,ligands/20.pdbqt,True,-6.4


# Reformating the pdbqt to be "VINA LIKE" and extract the best pose

Vina pdbqt have a line with the result in the format `REMARK VINA X X X`.  
This code will add it by parsing the logfiles.

In [7]:

def reformat_and_extract_best_pose(row):
    file=row.iloc[0]
    basename = pathlib.Path(file).stem
    outfolder = f"all_poses/{basename}"
    outputposes = f"{outfolder}/{basename}.pdbqt"
    outputlog = f"{outfolder}/{basename}.log"
    regex_results = re.compile(r"^([0-9]+) +(-?[0-9]+\.[0-9]) +(-?[0-9]+\.[0-9]+) +(-?[0-9]+\.[0-9]+)")

    #default success = False. Only change it when it worked.
    success = False
    best_energy = np.nan
    #Check if the logfile already exist
    if os.path.exists(outputlog):
        with open(outputlog, "r") as log:
            stdout = log.readlines()
        success = True
        

    if success == True:
        #Get the best energy4
        scores = {}
        for line in stdout:
            match = regex_results.findall(line)
            if len(match)>0:
                model=match[0][0]
                energy=float(match[0][1])
                rmsdLB=float(match[0][2])
                rmsdUB=float(match[0][3])
                scores[model]=(energy, rmsdLB, rmsdUB)

    if success:
        newpdbqt = []
        best_model = []
        with open(outputposes,"r") as pdbqt:
            lines = pdbqt.readlines()
            model=0
            vinaLineAdded = False
            for line in lines:
                newpdbqt.append(line)
                if line.startswith("MODEL"):
                    model = line.strip().split(" ")[-1]
                    energy = float(scores[model][0])
                    rmsdLB = float(scores[model][1])
                    rmsdUB = float(scores[model][2])
                    VINALINE = f"REMARK VINA RESULT: {energy:>10.1f} {rmsdLB:>10.3f} {rmsdUB:>10.3f}\n"
                    newpdbqt.append(VINALINE)
                
                if model == "1":
                    best_model.append(line)
                    if vinaLineAdded == False:
                        best_model.append(VINALINE)
                        vinaLineAdded = True
                    

        with open(f"{outfolder}/{basename}_vinaFormat.pdbqt",'w') as vinaout:
            for line in newpdbqt:
                vinaout.write(line)

        with open(f"{outfolder}/{basename}_bestpose.pdbqt",'w') as bestout:
            for line in best_model:
                bestout.write(line)

_ = df.progress_apply(lambda x: reformat_and_extract_best_pose(x), axis=1)

100%|█████████████████████████████████████| 578/578 [00:00<00:00, 118281.99it/s]
