# Plinder Tutorial
## Data Processing Pipeline
Loads Plinder systems into dictionaries of NumPy arrays.


In [8]:
from omtra_pipelines.plinder_dataset.plinder_pipeline import *
import plinder.core.utils.config
from omtra_pipelines.plinder_dataset.store_plinder import NPNDE_MAP, LIGAND_MAP

The script will download the data from the remote data directory if it is not found in the local cache directory. 

Plinder data can be found on the cluster in `/net/galaxy/home/koes/tjkatz/.local/share/plinder/2024-06/v2`.

In [2]:
# Load the plinder configuration and report the local cache directory and
# the remote data directory it points to.
cfg = plinder.core.get_config()
print(
    f"local cache directory: {cfg.data.plinder_dir}",
    f"remote data directory: {cfg.data.plinder_remote}",
    sep="\n",
)

local cache directory: /net/galaxy/home/koes/tjkatz/.local/share/plinder/2024-06/v2
remote data directory: gs://plinder/2024-06/v2


The `SystemProcessor` class takes in atom maps (`ligand_atom_map` and `npnde_atom_map`) and optionally a `pocket_cutoff`, which by default is set to 5 Å. 

The `process_system` call will convert the receptor and any linked apo structures into `StructureData` objects, which include numpy arrays of the atom coordinates, atom names, residue ids, residue names, and chain ids. 

For each ligand, a pocket will be extracted and returned as a `StructureData` object (and optionally saved as a PDB file). 

The ligands will be returned as `LigandData` objects with coords, atom_types, atom_charges, bond_types, and bond_indices as numpy arrays. 

The entire system is returned as a dictionary with the following keys: 'receptor', 'ligands', 'pockets', 'apo_structures', 'entry_annotation', 'system_annotation'. Ligands and pockets are dictionaries indexed by the plinder ligand ID, and apo structures are indexed by the plinder linked structure ID. The entry annotation corresponds to the PDB entry-level annotation stored in plinder, and the system annotation is the annotation at the system level. 

In [9]:
# Build the processor with separate atom maps for ligands and NPNDEs.
# pocket_cutoff=8.0 replaces the 5 Å default described in the text.
# NOTE(review): get_links=True presumably enables the linked-structure lookup
# that shows up in the log output (query_links) — confirm.
system_processor = SystemProcessor(
    get_links=True,
    pocket_cutoff=8.0,
    npnde_atom_map=NPNDE_MAP,
    ligand_atom_map=LIGAND_MAP,
)

# Process a single system by its plinder system ID; pockets are not saved.
result = system_processor.process_system(
    save_pockets=False,
    system_id="3kfy__1__1.A__1.C",
)

2025-03-04 16:11:31,113 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-03-04 16:11:31,114 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-03-04 16:11:31,115 | plinder.core.index.utils:148 | INFO : loading entries from 1 zips
2025-03-04 16:11:31,123 | plinder.core.index.utils:163 | INFO : loaded 1 entries
2025-03-04 16:11:31,124 | plinder.core.index.utils.load_entries:24 | INFO : runtime succeeded: 0.12s
2025-03-04 16:11:31,222 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-03-04 16:11:31,223 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-03-04 16:11:31,512 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.10s
2025-03-04 16:11:32,271 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.96s
2025-03-04 16:11:37,992 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-03-04 16:11:37

In [10]:
# List the top-level keys of the dictionary returned by process_system.
print(result.keys())

dict_keys(['apo', 'pred', 'links', 'annotation'])


In [11]:
# Show the entry stored under the "apo" key. In the recorded output this is a
# dict mapping an ID ('3k74_A') to a list of SystemData objects.
print(result["apo"])

{'3k74_A': [SystemData(system_id='3kfy__1__1.A__1.C', ligand_id='1.C', receptor=StructureData(coords=array([[ 21.691,  -3.847,  -9.189],
       [ 21.271,  -3.693, -10.582],
       [ 19.871,  -3.034, -10.71 ],
       ...,
       [ 20.482, -15.398, -24.993],
       [ 18.295, -15.604, -25.64 ],
       [ 23.68 ,  -8.781, -23.01 ]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'NH1', 'NH2', 'OXT'], dtype='<U6'), res_ids=array([  1,   1,   1, ..., 159, 159, 159]), res_names=array(['MET', 'MET', 'MET', ..., 'ARG', 'ARG', 'ARG'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'A', 'A', 'A'], dtype='<U4'), cif='systems/3kfy__1__1.A__1.C/receptor.cif'), ligand=LigandData(coords=array([[  1.431,  -4.443, -10.24 ],
       [  2.244,  -3.259, -10.466],
       [  3.377,  -3.481, -11.37 ],
       [  3.562,  -4.713, -11.863],
       [  2.744,  -5.773, -11.581],
       [  1.686,  -5.604, -10.788],
       [  4.327,  -2.577, -11.757],
       [  0.346,  -4.432,  -9.434],
       [ -0.02 ,  -3.

In [5]:
# Look up the entry keyed by a 7ueu system ID.
# NOTE(review): the `result` assigned earlier in this notebook was produced
# for system 3kfy__1__1.A__1.C and its printed keys are
# ['apo', 'pred', 'links', 'annotation']; this lookup (and its out-of-order
# execution count) appears to come from an earlier session. Confirm that it
# still works after Restart Kernel -> Run All.
print(result["7ueu__1__1.A_2.A__2.C_2.D_2.E"])

[SystemData(system_id='7ueu__1__1.A_2.A__2.C_2.D_2.E', ligand_id='2.C', receptor=StructureData(coords=array([[-31.115, -33.887,  19.519],
       [-30.972, -33.173,  18.258],
       [-31.051, -31.662,  18.47 ],
       ...,
       [-39.928,  31.565, -28.984],
       [-39.109,  29.682, -30.212],
       [-39.605,  30.221, -29.039]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'CE1', 'CE2', 'CZ'], dtype='<U6'), res_ids=array([ 19,  19,  19, ..., 377, 377, 377]), res_names=array(['SER', 'SER', 'SER', ..., 'PHE', 'PHE', 'PHE'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'B', 'B', 'B'], dtype='<U4'), cif='systems/7ueu__1__1.A_2.A__2.C_2.D_2.E/receptor.cif'), ligand=LigandData(coords=array([[-27.306,  13.629, -21.888],
       [-27.091,  13.638, -23.552],
       [-28.361,  14.366, -24.192],
       [-25.82 ,  14.244, -23.951],
       [-27.06 ,  12.1  , -23.978],
       [-27.123,  11.432, -25.438],
       [-28.545,  11.036, -25.787],
       [-26.538,  12.313, -26.525],
       [-

In [24]:
# Show the receptor entry; the recorded output is a StructureData object with
# coords, atom_names, res_ids, res_names, chain_ids and a cif path.
# NOTE(review): "receptor" is not among the keys printed for the `result`
# assigned earlier in this notebook, and the recorded output refers to the
# 7ueu system — confirm which process_system call this cell depends on.
print(result["receptor"])

StructureData(coords=array([[-31.115, -33.887,  19.519],
       [-30.972, -33.173,  18.258],
       [-31.051, -31.662,  18.47 ],
       ...,
       [-39.928,  31.565, -28.984],
       [-39.109,  29.682, -30.212],
       [-39.605,  30.221, -29.039]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'CE1', 'CE2', 'CZ'], dtype='<U6'), res_ids=array([ 19,  19,  19, ..., 377, 377, 377]), res_names=array(['SER', 'SER', 'SER', ..., 'PHE', 'PHE', 'PHE'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'B', 'B', 'B'], dtype='<U4'), cif='systems/7ueu__1__1.A_2.A__2.C_2.D_2.E/receptor.cif')


In [25]:
# Display the ligands entry; in the recorded output this is a dict of
# LigandData objects keyed by ligand ID (e.g. '2.C').
result["ligands"]

{'2.C': LigandData(coords=array([[-27.306,  13.629, -21.888],
        [-27.091,  13.638, -23.552],
        [-28.361,  14.366, -24.192],
        [-25.82 ,  14.244, -23.951],
        [-27.06 ,  12.1  , -23.978],
        [-27.123,  11.432, -25.438],
        [-28.545,  11.036, -25.787],
        [-26.538,  12.313, -26.525],
        [-26.257,  10.089, -25.294],
        [-25.006,  10.171, -24.597],
        [-24.43 ,   8.756, -24.475],
        [-24.156,   8.227, -25.789],
        [-23.078,   8.716, -23.729],
        [-23.312,   8.529, -22.336],
        [-22.423,   7.443, -24.317],
        [-22.882,   6.282, -23.624],
        [-23.032,   7.337, -25.729],
        [-22.164,   7.763, -26.811],
        [-22.302,   7.345, -28.104],
        [-21.403,   7.913, -28.852],
        [-20.663,   8.724, -28.078],
        [-21.13 ,   8.656, -26.761],
        [-20.541,   9.382, -25.819],
        [-19.524,  10.168, -26.127],
        [-19.053,  10.244, -27.368],
        [-19.59 ,   9.55 , -28.37 ],
        [-18.

In [26]:
# Display the "npndes" entry; in the recorded output this is a dict of
# LigandData objects keyed by ID ('2.D', '2.E').
result["npndes"]

{'2.D': LigandData(coords=array([[-30.3033,  14.3827, -24.0104]], dtype=float32), atom_types=array([43]), atom_charges=array([2], dtype=int32), bond_types=array([], dtype=int32), bond_indices=array([], shape=(0, 2), dtype=int64), sdf='systems/7ueu__1__1.A_2.A__2.C_2.D_2.E/ligand_files/2.D.sdf'),
 '2.E': LigandData(coords=array([[-18.115,  12.792, -31.79 ],
        [-17.439,  11.838, -30.959],
        [-19.394,  13.258, -31.103],
        [-20.277,  12.143, -30.931]], dtype=float32), atom_types=array([0, 2, 0, 2]), atom_charges=array([0, 0, 0, 0], dtype=int32), bond_types=array([1, 1, 1], dtype=int32), bond_indices=array([[0, 1],
        [0, 2],
        [2, 3]]), sdf='systems/7ueu__1__1.A_2.A__2.C_2.D_2.E/ligand_files/2.E.sdf')}

In [27]:
# Display the pockets entry; in the recorded output this is a dict of
# StructureData objects keyed by ligand ID (e.g. '2.C').
result["pockets"]

{'2.C': StructureData(coords=array([[-23.757,   7.527, -19.474],
        [-22.557,   8.032, -18.823],
        [-21.985,   7.054, -17.811],
        ...,
        [-29.782,   2.17 , -27.18 ],
        [-31.293,   2.101, -27.27 ],
        [-29.204,   0.819, -26.751]], dtype=float32), atom_names=array(['N', 'CA', 'C', 'O', 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2',
        'CD1', 'N', 'CA', 'C', 'O', 'CB', 'CG1', 'CG2', 'N', 'CA', 'C',
        'O', 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'CE1', 'CE2',
        'CZ', 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'NE', 'CZ', 'NH1',
        'NH2', 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2', 'N', 'CA',
        'C', 'O', 'CB', 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD1', 'CD2',
        'CE1', 'CE2', 'CZ', 'N', 'CA', 'C', 'O', 'N', 'CA', 'C', 'O', 'CB',
        'CG', 'OD1', 'ND2', 'N', 'CA', 'C', 'O', 'CB', 'CG', 'CD', 'N',
        'CA', 'C', 'O', 'CB', 'CG', 'CD', 'CE', 'NZ', 'N', 'CA', 'C', 'O',
        'N', 'CA', 'C', 'O', 'N', 'CA', 'C', 'O', 'CB',

In [31]:
# Display the system-level annotation; the recorded output is a nested dict
# with fields such as 'pdb_id', 'biounit_id' and a list of per-ligand records.
result["system_annotation"]

{'pdb_id': '7ueu',
 'biounit_id': '1',
 'ligands': [{'pdb_id': '7ueu',
   'biounit_id': '1',
   'asym_id': 'C',
   'instance': 2,
   'ccd_code': 'AN2',
   'plip_type': 'SMALLMOLECULE',
   'bird_id': '',
   'centroid': [-24.080585479736328, 10.235146522521973, -25.439708709716797],
   'smiles': 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@](=O)(O)O[P@@](N)(=O)O)[C@@H](O)[C@H]1O',
   'resolved_smiles': 'O[C@@H]1[C@@H](CO[P@@](=O)(O[P@@](=O)([O])N)O)O[C@H]([C@@H]1O)n1cnc2c1ncnc2N',
   'residue_numbers': [1],
   'rdkit_canonical_smiles': 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](CO[P@](=O)(O)O[P@@](N)(=O)O)[C@@H](O)[C@H]1O',
   'molecular_weight': 426.0453993519999,
   'crippen_clogp': -1.7796000000000007,
   'num_rot_bonds': 6,
   'num_hbd': 6,
   'num_hba': 12,
   'num_rings': 3,
   'num_heavy_atoms': 27,
   'is_covalent': False,
   'covalent_linkages': [],
   'neighboring_residues': {'1.A': [261, 284],
    '2.A': [25,
     26,
     27,
     28,
     29,
     30,
     31,
     32,
     94,
     96,
     97,
   

In [30]:
# Print the "is_artifact" field of the first ligand record in the
# system-level annotation.
print(result["system_annotation"]["ligands"][0]["is_artifact"])

False


In [32]:
# Process a second system with the same processor. This rebinds `result`, so
# the cells that follow inspect the 6rgu system rather than the previous one.
result = system_processor.process_system(
    save_pockets=False,
    system_id="6rgu__1__2.A__2.H",
)

2025-02-12 17:21:30,260 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-02-12 17:21:30,262 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-02-12 17:21:30,263 | plinder.core.index.utils:148 | INFO : loading entries from 1 zips
2025-02-12 17:21:30,309 | plinder.core.index.utils:163 | INFO : loaded 1 entries
2025-02-12 17:21:30,311 | plinder.core.index.utils.load_entries:24 | INFO : runtime succeeded: 0.16s
2025-02-12 17:21:30,448 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-02-12 17:21:30,449 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-02-12 17:21:30,854 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.09s
2025-02-12 17:21:31,500 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.87s
2025-02-12 17:21:35,810 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-02-12 17:21:35

In [33]:
# Display the apo structures entry; in the recorded output each ID
# (e.g. '5mxe_A') maps to a dict with 'holo' and 'link' StructureData objects.
result["apo_structures"]

{'5mxe_A': {'holo': StructureData(coords=array([[-41.435,  18.111,  42.813],
         [-41.035,  19.107,  43.846],
         [-39.591,  18.868,  44.318],
         ...,
         [-29.7  ,  22.965,  47.95 ],
         [-29.282,  23.292,  49.331],
         [-34.194,  18.75 ,  45.047]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'CE', 'NZ', 'OXT'], dtype='<U6'), res_ids=array([ 25,  25,  25, ..., 369, 369, 369]), res_names=array(['SER', 'SER', 'SER', ..., 'LYS', 'LYS', 'LYS'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'A', 'A', 'A'], dtype='<U4'), cif='systems/6rgu__1__2.A__2.H/receptor.cif'),
  'link': StructureData(coords=array([[-41.60343 ,  18.07293 ,  42.77941 ],
         [-41.132004,  18.910458,  43.918167],
         [-39.660923,  18.673107,  44.359802],
         ...,
         [-29.808643,  23.145782,  47.97537 ],
         [-29.235401,  23.28697 ,  49.33817 ],
         [-34.255478,  18.8527  ,  45.05255 ]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'CE

In [34]:
# Display the predicted structures entry; in the recorded output each ID
# (e.g. 'C7BLE4_A') maps to a dict with 'holo' and 'link' StructureData objects.
result["pred_structures"]

{'C7BLE4_A': {'holo': StructureData(coords=array([[-41.435,  18.111,  42.813],
         [-41.035,  19.107,  43.846],
         [-39.591,  18.868,  44.318],
         ...,
         [-29.7  ,  22.965,  47.95 ],
         [-29.282,  23.292,  49.331],
         [-34.194,  18.75 ,  45.047]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'CE', 'NZ', 'OXT'], dtype='<U6'), res_ids=array([ 25,  25,  25, ..., 369, 369, 369]), res_names=array(['SER', 'SER', 'SER', ..., 'LYS', 'LYS', 'LYS'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'A', 'A', 'A'], dtype='<U4'), cif='systems/6rgu__1__2.A__2.H/receptor.cif'),
  'link': StructureData(coords=array([[-41.706627,  14.399178,  42.530357],
         [-41.287693,  15.80853 ,  42.596523],
         [-39.787506,  16.022655,  42.83176 ],
         ...,
         [-29.947872,  23.047075,  47.827538],
         [-29.023716,  22.799688,  48.941223],
         [-34.029892,  18.561142,  45.028866]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., '