# Plinder Tutorial
## Data Processing Pipeline
Loads Plinder systems into a dictionary of NumPy arrays.


In [1]:
from pipelines.plinder_dataset.plinder_pipeline import *
import plinder.core.utils.config

  from .autonotebook import tqdm as notebook_tqdm


The script will download the data from the remote data directory if it is not found in the local cache directory. 

Plinder data can be found on the cluster in `/net/galaxy/home/koes/tjkatz/.local/share/plinder/2024-06/v2`.

In [2]:
# Fetch the active plinder configuration and report where the data lives:
# `plinder_dir` is the local cache, `plinder_remote` is the remote location
# the data is downloaded from when it is missing from the cache.
cfg = plinder.core.get_config()
data_cfg = cfg.data
print(
    f"local cache directory: {data_cfg.plinder_dir}",
    f"remote data directory: {data_cfg.plinder_remote}",
    sep="\n",
)

local cache directory: /net/galaxy/home/koes/tjkatz/.local/share/plinder/2024-06/v2
remote data directory: gs://plinder/2024-06/v2


The `SystemProcessor` class takes in an `atom_map` and optionally a `pocket_cutoff`, which by default is set to 5 Å.

The `process_system` call will convert the receptor and any linked apo structures into `StructureData` objects, which include numpy arrays of the atom coordinates, atom names, residue ids, residue names, and chain ids. 

For each ligand, a pocket will be extracted and returned as a `StructureData` object (and optionally saved as a PDB file). 

The ligands will be returned as `LigandData` objects with coords, atom_types, atom_charges, bond_types, and bond_indices as numpy arrays. 

The entire system is returned as a dictionary with the following keys: 'receptor', 'ligands', 'pockets', 'apo_structures', 'pred_structures', 'entry_annotation', 'system_annotation'. Ligands and pockets are dictionaries indexed by the plinder ligand ID, and apo and predicted structures are indexed by the plinder linked structure ID. Entry annotation corresponds to the PDB entry-level annotation stored in plinder, and the system annotation is the annotation at the system level.

In [3]:
# Element symbols handed to the processor as its `atom_map`.
atom_map = ["C", "H", "N", "O", "F", "P", "S", "Cl", "Br", "I", "Se"]

# `pocket_cutoff` is set explicitly here instead of relying on the default.
system_processor = SystemProcessor(
    atom_map=atom_map,
    pocket_cutoff=8.0,
)

# Process one plinder system; with save_pockets=False the pockets are only
# returned in the result, not written out as PDB files.
result = system_processor.process_system(
    system_id="4agi__1__1.C__1.W",
    save_pockets=False,
)

2025-01-30 16:47:14,162 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-01-30 16:47:14,163 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-01-30 16:47:14,465 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.13s
2025-01-30 16:47:15,024 | plinder.core.scores.links.query_links:24 | INFO : runtime succeeded: 0.86s
2025-01-30 16:47:18,070 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-01-30 16:47:18,071 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
100%|██████████| 1060/1060 [00:00<00:00, 35001.24it/s]
2025-01-30 16:47:19,201 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-01-30 16:47:19,202 | plinder.core.utils.cpl.download_paths:24 | INFO : runtime succeeded: 0.00s
2025-01-30 16:47:19,203 | plinder.core.index.utils:148 | INFO : loading entries from 1 zips
2025-01-30 16:47:19,215 | plinder.core.index.

In [4]:
# List the top-level sections of the processed system dictionary.
print(result.keys())

dict_keys(['receptor', 'ligands', 'pockets', 'apo_structures', 'pred_structures', 'entry_annotation', 'system_annotation'])


In [None]:
# Receptor as a single StructureData (coords, atom names, residue ids/names,
# chain ids, and the path of the source CIF file).
result["receptor"]

StructureData(coords=array([[48.761, 52.015, 58.719],
       [48.178, 50.968, 57.827],
       [49.186, 50.593, 56.712],
       ...,
       [50.066, 42.485, 54.198],
       [47.368, 43.433, 55.544],
       [50.022, 44.674, 54.248]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'O', 'CB', 'OXT'], dtype='<U6'), res_ids=array([  2,   2,   2, ..., 315, 315, 315]), res_names=array(['SER', 'SER', 'SER', ..., 'ALA', 'ALA', 'ALA'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'A', 'A', 'A'], dtype='<U4'), res_idx=None, cif='systems/4agi__1__1.C__1.W/receptor.cif')

In [6]:
# Ligands as a dict of LigandData keyed by plinder ligand ID (e.g. '1.W').
result["ligands"]

{'1.W': LigandData(sdf='systems/4agi__1__1.C__1.W/ligand_files/1.W.sdf', coords=array([[48.954, 28.186, 62.068],
        [48.681, 29.643, 61.777],
        [48.556, 29.933, 60.282],
        [47.499, 28.988, 59.699],
        [47.948, 27.56 , 59.939],
        [46.992, 26.503, 59.374],
        [49.72 , 30.41 , 62.383],
        [48.255, 31.352, 60.148],
        [46.262, 29.193, 60.366],
        [48.031, 27.383, 61.339],
        [50.647, 25.895, 62.426],
        [50.853, 27.716, 61.696]]), atom_types=array([ 0,  0,  0,  0,  0,  0,  3,  3,  3,  3,  0, 10]), atom_charges=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), bond_types=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32), bond_indices=(array([ 0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4, 10]), array([ 1,  9, 11,  2,  6,  3,  7,  4,  8,  5,  9, 11])))}

In [None]:
# Pockets as a dict of StructureData, keyed by the same ligand IDs as the
# ligands dictionary.
result["pockets"]

{'1.W': StructureData(coords=array([[44.456, 21.838, 56.99 ],
        [45.184, 23.078, 57.089],
        [45.47 , 23.828, 55.805],
        [46.357, 24.707, 55.763],
        [46.485, 22.852, 57.881],
        [47.472, 21.956, 57.163],
        [47.141, 21.267, 56.191],
        [48.692, 21.909, 57.687],
        [50.638, 26.241, 53.267],
        [51.918, 26.3  , 53.961],
        [52.388, 27.719, 54.269],
        [53.271, 27.896, 55.086],
        [38.372, 34.33 , 61.835],
        [38.778, 35.493, 61.058],
        [37.833, 36.652, 61.382],
        [36.632, 36.472, 61.501],
        [38.839, 35.223, 59.514],
        [39.8  , 34.085, 59.249],
        [41.153, 34.303, 59.264],
        [39.354, 32.785, 59.147],
        [42.075, 33.245, 59.146],
        [40.24 , 31.748, 59.031],
        [41.587, 31.979, 59.021],
        [42.448, 30.88 , 58.94 ],
        [50.84 , 32.2  , 53.855],
        [49.823, 33.125, 53.341],
        [50.168, 33.821, 52.088],
        [51.284, 34.305, 51.923],
        [49.498, 34.

In [8]:
# Linked apo structures as a dict of StructureData keyed by linked
# structure id (e.g. '4uou_B').
result["apo_structures"]

{'4uou_B': StructureData(coords=array([[48.429, 51.569, 59.041],
        [47.897, 50.834, 57.841],
        [48.919, 50.677, 56.677],
        ...,
        [50.148, 42.882, 53.814],
        [47.726, 43.263, 55.61 ],
        [49.977, 45.05 , 54.02 ]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'O', 'CB', 'OXT'], dtype='<U6'), res_ids=array([  1,   1,   1, ..., 314, 314, 314]), res_names=array(['SER', 'SER', 'SER', ..., 'ALA', 'ALA', 'ALA'], dtype='<U5'), chain_ids=array(['B', 'B', 'B', ..., 'B', 'B', 'B'], dtype='<U4'), res_idx=None, cif='linked_structures/apo/4agi__1__1.C__1.W/4uou_B/superposed.cif'),
 '4uou_C': StructureData(coords=array([[48.891, 51.823, 58.85 ],
        [48.219, 50.988, 57.793],
        [49.153, 50.808, 56.609],
        ...,
        [49.93 , 42.166, 53.822],
        [47.663, 43.299, 55.58 ],
        [50.202, 44.363, 53.803]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'O', 'CB', 'OXT'], dtype='<U6'), res_ids=array([  1,   1,   1, ..., 314, 314,

In [9]:
# Linked predicted structures as a dict of StructureData keyed by linked
# structure id (e.g. 'Q4WW81_A').
result["pred_structures"]

{'Q4WW81_A': StructureData(coords=array([[49.542, 52.802, 62.086],
        [49.671, 53.148, 60.651],
        [49.148, 51.981, 59.823],
        ...,
        [49.97 , 42.45 , 54.315],
        [47.126, 43.54 , 55.386],
        [49.955, 44.676, 54.549]], dtype=float32), atom_names=array(['N', 'CA', 'C', ..., 'O', 'CB', 'OXT'], dtype='<U6'), res_ids=array([  1,   1,   1, ..., 315, 315, 315]), res_names=array(['MET', 'MET', 'MET', ..., 'ALA', 'ALA', 'ALA'], dtype='<U5'), chain_ids=array(['A', 'A', 'A', ..., 'A', 'A', 'A'], dtype='<U4'), res_idx=None, cif='linked_structures/pred/4agi__1__1.C__1.W/Q4WW81_A/superposed.cif')}

In [10]:
# PDB entry-level annotation stored in plinder (nested dict; output is long).
result["entry_annotation"]

{'pdb_id': '4agi',
 'release_date': '2012-01-30',
 'oligomeric_state': 'dimeric',
 'determination_method': 'X-RAY DIFFRACTION',
 'keywords': 'SUGAR BINDING PROTEIN',
 'pH': '',
 'resolution': 1.6,
 'chains': {'A': {'asym_id': 'A',
   'auth_id': 'A',
   'entity_id': '1',
   'chain_type_str': 'polypeptide(L)',
   'residues': {'22': {'chain': 'A',
     'index': 20,
     'number': 22,
     'auth_number': '22',
     'one_letter_code': 'N',
     'name': 'ASN',
     'chem_type': 'A',
     'validation': {'altcode': '.',
      'inscode': '?',
      'rsr': 0.06,
      'rsrz': -0.73,
      'rscc': 0.973,
      'average_occupancy': 1.0,
      'average_b_factor': 11.36875,
      'unknown_residue': False,
      'atom_count': 8,
      'unknown_atom_count': 0,
      'heavy_atom_count': 8,
      'num_unresolved_heavy_atoms': 0,
      'is_outlier': {'geometry': False,
       'density': False,
       'chirality': False,
       'clashes': False},
      'is_atom_count_consistent': True,
      'has_clashing

In [11]:
# System-level annotation, including a 'ligands' list with one dict per ligand.
result["system_annotation"]

{'pdb_id': '4agi',
 'biounit_id': '1',
 'ligands': [{'pdb_id': '4agi',
   'biounit_id': '1',
   'asym_id': 'W',
   'instance': 1,
   'ccd_code': 'SFU',
   'plip_type': 'SMALLMOLECULE',
   'bird_id': '',
   'centroid': [49.18450164794922, 28.385324478149414, 61.18277359008789],
   'smiles': 'C[Se][C@@H]1O[C@@H](C)[C@@H](O)[C@@H](O)[C@@H]1O',
   'resolved_smiles': 'C[Se][C@@H]1O[C@@H](C)[C@H]([C@H]([C@@H]1O)O)O',
   'residue_numbers': [1],
   'rdkit_canonical_smiles': 'C[Se][C@@H]1O[C@@H](C)[C@@H](O)[C@@H](O)[C@@H]1O',
   'molecular_weight': 242.005730228,
   'crippen_clogp': -1.4338000000000004,
   'num_rot_bonds': 1,
   'num_hbd': 3,
   'num_hba': 4,
   'num_rings': 1,
   'num_heavy_atoms': 12,
   'is_covalent': False,
   'covalent_linkages': [],
   'neighboring_residues': {'1.C': [22,
     23,
     24,
     40,
     44,
     271,
     280,
     282,
     284,
     296,
     298,
     300,
     308,
     309]},
   'neighboring_ligands': [],
   'interacting_residues': {'1.C': [282, 44, 

In [12]:
# Pull out the first ligand's annotation record and show its artifact flag.
first_ligand_annotation = result["system_annotation"]["ligands"][0]
print(first_ligand_annotation['is_artifact'])

False
