In [1]:
import pandas
import tqdm
import os
import glob
from pathlib import Path
import biotite.structure.io.pdb as pdb
import biotite.database.rcsb as rcsb
import numpy as np
import nglview as nv
import matplotlib



In [3]:
# Load df_pockets_2021_09_21_charmmGUI_norA.csv and, in the same chain,
# keep only the rows whose drugScore is strictly greater than 0.5
df_pockets = (
    pandas.read_csv('df_pockets_2021_09_21_charmmGUI_norA.csv')
    .loc[lambda df: df['drugScore'] > 0.5]
)
# Show summary statistics of the filtered table as the cell output
df_pockets.describe()

Unnamed: 0,lig_cov,poc_cov,lig_name,4A_crit,volume,hull,surface,lid,depth,surf/vol,...,N,DA,DC,DG,DT,DN,UNK,simpleScore,drugScore,frame
count,60459.0,60459.0,0.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,...,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0,60459.0
mean,0.0,0.0,,0.0,417.109141,519.230823,451.387584,67.843239,15.263253,0.458509,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.519577,0.673461,5003.357664
std,0.0,0.0,,0.0,419.907015,523.673877,467.271253,64.366543,7.722377,0.099266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205755,0.149888,2910.0442
min,0.0,0.0,,0.0,0.96,0.48,0.48,0.0,0.565685,0.085714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505004,0.0
25%,0.0,0.0,,0.0,130.176,183.92,153.12,29.12,9.28655,0.393546,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395487,0.550407,2444.0
50%,0.0,0.0,,0.0,265.152,326.56,274.4,48.96,12.9368,0.434769,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.506663,0.621218,4941.0
75%,0.0,0.0,,0.0,538.816,613.6,525.12,87.04,19.1102,0.483375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.612454,0.769856,7629.0
max,0.0,0.0,,0.0,3516.48,4428.64,3889.6,634.72,54.4896,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,10000.0


In [4]:
def annot_pocket_res(df_pockets, pdb_dir='2021_09_21_charmmGUI_norA_splitPDB_dss'):
    """Annotate every pocket row with the residue numbers found in its pocket PDB file.

    For each row, the file ``frame_{frame}.pdb_res_{name}.pdb`` inside
    ``pdb_dir`` is read, the unique residue ids of its first model are
    collected, and they are stored as a space-separated string (sorted in
    ascending order by ``np.unique``) in a new ``pocket_res`` column.

    Parameters
    ----------
    df_pockets : pandas.DataFrame
        Table with at least the columns ``name`` and ``frame``.
    pdb_dir : str, optional
        Directory that holds the per-pocket PDB files. Defaults to the
        directory this notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        The same ``df_pockets`` object, with the ``pocket_res`` column added
        (the column is assigned on the frame that was passed in).
    """
    pocket_res = []

    # Only 'name' and 'frame' are needed, so iterate over those two columns
    # instead of materialising a full row object per pocket.
    name_frame_pairs = zip(df_pockets['name'], df_pockets['frame'])
    for name, frame in tqdm.tqdm(name_frame_pairs, total=df_pockets.shape[0]):
        pocket_pdb = f'frame_{frame}.pdb_res_{name}.pdb'

        # Load the pocket PDB file
        pocket = pdb.PDBFile.read(os.path.join(pdb_dir, pocket_pdb))

        # Parse the structure exactly once per file. The previous version
        # called get_structure() inside the per-atom comprehension, i.e. it
        # re-parsed the whole file for every single atom.
        first_model = pocket.get_structure()[0]

        # Sorted, de-duplicated residue numbers of the first model
        unique_res_nums = np.unique(np.asarray(first_model.res_id, dtype=int))

        pocket_res.append(' '.join(unique_res_nums.astype(str)))

    # Assign the whole column at once, by position: faster than one .loc write
    # per row and not affected by duplicate index labels.
    df_pockets['pocket_res'] = pocket_res

    return df_pockets

In [5]:
from multiprocessing import Pool

ncores = 20

# Split df_pockets into ncores chunks.
# The row positions are split (not the DataFrame itself): passing a DataFrame
# straight to np.array_split goes through DataFrame.swapaxes, which recent
# pandas versions deprecate. Splitting positions yields the same chunk sizes.
chunk_positions = np.array_split(np.arange(len(df_pockets)), ncores)
df_pockets_chunks = [df_pockets.iloc[positions] for positions in chunk_positions]

# Call annot_pocket_res() on each chunk in its own worker process and
# concatenate the annotated chunks back into a single table
with Pool(ncores) as p:
    df_pockets = pandas.concat(p.map(annot_pocket_res, df_pockets_chunks))


100%|██████████| 3023/3023 [25:44<00:00,  1.96it/s]  
100%|██████████| 3023/3023 [27:23<00:00,  1.84it/s]
100%|██████████| 3023/3023 [27:50<00:00,  1.81it/s]
100%|██████████| 3023/3023 [28:29<00:00,  1.77it/s]
100%|██████████| 3023/3023 [28:53<00:00,  1.74it/s]
100%|██████████| 3023/3023 [28:57<00:00,  1.74it/s]
100%|██████████| 3023/3023 [28:58<00:00,  1.74it/s]
100%|██████████| 3023/3023 [29:34<00:00,  1.70it/s]
100%|██████████| 3023/3023 [29:39<00:00,  1.70it/s]
100%|██████████| 3022/3022 [29:42<00:00,  1.70it/s]
100%|██████████| 3023/3023 [29:48<00:00,  1.69it/s]
100%|██████████| 3023/3023 [29:58<00:00,  1.68it/s]
100%|██████████| 3023/3023 [30:23<00:00,  1.66it/s]
100%|██████████| 3023/3023 [31:01<00:00,  1.62it/s]
100%|██████████| 3023/3023 [31:18<00:00,  1.61it/s]
100%|██████████| 3023/3023 [31:30<00:00,  1.60it/s]
100%|██████████| 3023/3023 [32:12<00:00,  1.56it/s]
100%|██████████| 3023/3023 [32:13<00:00,  1.56it/s]
100%|██████████| 3023/3023 [32:15<00:00,  1.56it/s]
100%|█████

In [6]:
# Write the annotated pocket table to disk, index included.
# File-name tags:
#   ds_05 -> rows were filtered to drugScore > 0.5
#   wpr   -> "with pocket residues" (the pocket_res column is present)
df_pockets.to_csv(path_or_buf='df_pockets_ds_05_wpr.csv', index=True)


In [None]:
# (Optional) Keep only rows whose name does NOT match the regex '_.*_',
# i.e. drop names that contain more than a single underscore
df_pockets = df_pockets.loc[lambda df: ~df['name'].str.contains('_.*_')]

In [None]:
# Pool the residue numbers of every pocket into one flat list of integers:
# each pocket_res string is split on single spaces and every token is
# converted to int, in row order
master_res_list = [
    int(res_num)
    for pocket_res in tqdm.tqdm(df_pockets['pocket_res'], total=df_pockets.shape[0])
    for res_num in pocket_res.split(' ')
]

In [None]:
# Count how often each residue number occurs in master_res_list and keep the
# counts as a one-column data frame indexed by residue number
df_res_freq = pandas.Series(master_res_list).value_counts().to_frame()

In [None]:
# Display the residue-frequency table built from master_res_list
df_res_freq

In [149]:
# Load frame_0.pdb of the split trajectory in nglview
frame0_pdb = '2021_09_21_charmmGUI_norA_splitPDB/frame_0.pdb'
view = nv.show_file(frame0_pdb, default_representation=True)

# Add a blue spacefill representation for the first 20 index entries of
# df_res_freq (it is built with value_counts(), so these should be the most
# frequently observed residue numbers). Each selection has the form
# '<residue number>:A' -- presumably residue number on chain A; confirm
# against the NGL selection language.
top_residues = df_res_freq.index[:20]
for res in top_residues:
    view.add_spacefill(selection=f'{res}:A', color='blue')

# Center the view and show the widget as the cell output
view.center()
view

NGLWidget()

In [136]:
# Fetch entry '1a0j' in 'pdb' format from the RCSB and parse it with biotite.
# NOTE(review): no target path is passed to fetch(), so `file_path` is
# presumably an in-memory file object rather than a path on disk -- confirm.
# NOTE(review): despite its name, `stack` holds the parsed PDBFile object,
# not a structure; the structure is extracted from it separately.
file_path = rcsb.fetch('1a0j', 'pdb')
stack = pdb.PDBFile.read(file_path)

In [141]:
# Extract the structure from the parsed PDB file, additionally requesting
# the optional "b_factor" annotation so that it is available on `mol`
mol = pdb.get_structure(stack,extra_fields=["b_factor"])

In [143]:
# Show the b_factor values of the first element of `mol`
# (presumably the first model of the structure -- TODO confirm)
mol[0].b_factor

array([ 7.85,  9.03, 10.98, ..., 23.79, 50.78, 51.51])