# Task 1: conformational relationships within a single ensemble

Relationships within an ensemble will be identified by considering the structural features of
single conformations.

### 0. Set-up

In [17]:
import numpy as np
from Bio.PDB import PDBList, Superimposer, is_aa
from Bio.PDB.PDBParser import PDBParser
import requests

### 1. Load PDB structures of one single PED ensemble

In [3]:
import os

# Query the PED API for the entry's metadata and collect its ensemble IDs.
ped_id = "PED00153"
url = "https://proteinensemble.org/api/" + ped_id
resp = requests.get(url, timeout=60)
resp.raise_for_status()  # stop on an HTTP error instead of failing later on JSON parsing
resp_json = resp.json()
print(resp_json["title"])
ensembles_ids = [curr_ensemble["ensemble_id"] for curr_ensemble in resp_json["ensembles"]]

# get direct link to the downloadable file (url as string)
url = "https://proteinensemble.org/api/download"
parameters = {
    "ensemble_id": ensembles_ids
}
link_resp = requests.get(url, params=parameters, timeout=60)
link_resp.raise_for_status()
download_link = link_resp.text
# Drop any double quotes from the returned link text before requesting it.
resp_file = requests.get(download_link.replace('"', ''), timeout=60)
resp_file.raise_for_status()  # never write an error page to disk as if it were the archive

# download ensembles
os.makedirs("data", exist_ok=True)  # open() does not create missing directories
with open("data/" + ped_id + ".tar", "wb") as f:
    f.write(resp_file.content)
  

Structure and dynamics of the MKK7


In [6]:
# Paths used by the extraction cells: the target directory and the downloaded archive.
data_folder = "./data/"
tar_to_extract = data_folder + ped_id + ".tar"

In [10]:
# IPython shell escape: unpack the downloaded archive (tar_to_extract) into data_folder.
!tar -xzvf $tar_to_extract -C $data_folder

x PED00153e007.pdb.tar.gz
x PED00153e008.pdb.tar.gz
x PED00153e009.pdb.tar.gz
x PED00153e010.pdb.tar.gz
x PED00153e011.pdb.tar.gz


In [12]:
# Unpack the .pdb.tar.gz archive of every ensemble into data_folder.
for ensemble in ensembles_ids:
    # NOTE: this rebinds tar_to_extract, which until now held the path of the outer .tar file.
    tar_to_extract = "./data/" + ensemble + ".pdb.tar.gz"
    !tar -xzvf $tar_to_extract -C $data_folder

x PED00153e007.pdb
x PED00153e008.pdb
x PED00153e009.pdb
x PED00153e010.pdb
x PED00153e011.pdb


In [14]:
# Parse the PDB file of the first ensemble; the ensemble ID doubles as the structure ID.
pdb_parser = PDBParser(QUIET=True)
first_ensemble_id = ensembles_ids[0]
first_ensemble_path = "data/{}.pdb".format(first_ensemble_id)
structure = pdb_parser.get_structure(first_ensemble_id, first_ensemble_path)

In [29]:
# Dump every atom (name, B-factor, coordinates) of the amino-acid residues in the first
# model; for any other residue print only its id.
for chain in structure[0]:
    for residue in chain:
        if not is_aa(residue):
            # residue.id tuple contains hetero_flag, position, insertion_code
            print(residue.id)
            continue
        for atom in residue:
            print("atom {} {} {}".format(atom.id, atom.get_bfactor(), atom.get_coord()))

atom N 0.0 [-35.049 -34.886  70.519]
atom CA 0.0 [-34.236 -35.921  69.886]
atom C 0.0 [-33.103 -35.265  69.134]
atom O 0.0 [-32.371 -35.901  68.361]
atom 1H 0.0 [-35.769 -35.116  71.008]
atom 2H 0.0 [-35.451 -34.259  70.015]
atom 3H 0.0 [-34.678 -34.327  71.119]
atom CB 0.0 [-35.093 -36.781  68.955]
atom SD 0.0 [-37.113 -38.632  68.543]
atom CE 0.0 [-38.238 -39.428  69.688]
atom CG 0.0 [-36.121 -37.639  69.675]
atom N 0.0 [-32.938 -33.973  69.349]
atom CA 0.0 [-31.882 -33.208  68.691]
atom C 0.0 [-32.201 -33.096  67.219]
atom O 0.0 [-31.403 -32.6    66.409]
atom H 0.0 [-33.5   -33.564  69.921]
atom CB 0.0 [-30.531 -33.869  68.914]
atom N 0.0 [-33.38  -33.556  66.846]
atom CA 0.0 [-33.823 -33.513  65.455]
atom C 0.0 [-33.9   -32.073  65.008]
atom O 0.0 [-34.3   -31.171  65.76 ]
atom H 0.0 [-33.917 -33.904  67.479]
atom CB 0.0 [-35.167 -34.21   65.305]
atom N 0.0 [-33.516 -31.831  63.769]
atom CA 0.0 [-33.537 -30.485  63.201]
atom C 0.0 [-34.735 -30.354  62.29 ]
atom O 0.0 [-35.129 -31.2

atom CA 0.0 [-67.027   3.105  10.717]
atom C 0.0 [-67.582   2.576   9.415]
atom O 0.0 [-68.618   1.896   9.37 ]
atom H 0.0 [-68.077   2.639  12.431]
atom CB 0.0 [-65.883   2.228  11.228]
atom HG 0.0 [-66.499   0.603  10.552]
atom OG 0.0 [-66.277   0.869  11.306]
atom N 0.0 [-66.898   2.882   8.329]
atom CA 0.0 [-67.316   2.439   7.001]
atom C 0.0 [-67.057   0.957   6.873]
atom O 0.0 [-67.609   0.266   6.004]
atom H 0.0 [-66.154   3.379   8.424]
atom CB 0.0 [-66.577   3.226   5.917]
atom CD 0.0 [-66.204   5.443   4.766]
atom CG 0.0 [-66.957   4.696   5.849]
atom OE1 0.0 [-65.345   4.823   4.104]
atom OE2 0.0 [-66.473   6.648   4.578]
atom N 0.0 [-66.208   0.441   7.741]
atom CA 0.0 [-65.862  -0.978   7.737]
atom C 0.0 [-67.114  -1.79    7.969]
atom O 0.0 [-67.905  -1.531   8.888]
atom H 0.0 [-65.838   0.993   8.349]
atom CB 0.0 [-64.804  -1.275   8.802]
atom HG 0.0 [-65.216  -3.075   9.06 ]
atom OG 0.0 [-64.522  -2.662   8.869]
atom N 0.0 [-67.316  -2.792   7.134]
atom CA 0.0 [-68.484  

atom O 0.0 [-19.987   7.678  -7.081]
atom H 0.0 [-20.129   8.547  -9.23 ]
atom CB 0.0 [-22.304   6.422  -9.221]
atom CD 0.0 [-21.575   4.221 -10.222]
atom CG 0.0 [-21.143   5.481  -9.498]
atom OE1 0.0 [-22.006   4.272 -11.374]
atom 1HE2 0.0 [-21.702   2.305  -9.93 ]
atom 2HE2 0.0 [-21.139   3.089  -8.706]
atom NE2 0.0 [-21.459   3.083  -9.547]
atom N 0.0 [-21.964   7.032  -6.251]
atom CA 0.0 [-21.427   6.742  -4.924]
atom C 0.0 [-20.689   5.425  -4.969]
atom O 0.0 [-21.279   4.345  -5.12 ]
atom H 0.0 [-22.839   6.907  -6.42 ]
atom CB 0.0 [-22.551   6.712  -3.886]
atom CD 0.0 [-23.21    6.489  -1.458]
atom CG 0.0 [-22.073   6.492  -2.46 ]
atom OE1 0.0 [-24.352   6.807  -1.854]
atom OE2 0.0 [-22.961   6.169  -0.277]
atom N 0.0 [-19.377   5.494  -4.839]
atom CA 0.0 [-18.534   4.301  -4.864]
atom C 0.0 [-18.515   3.684  -3.486]
atom O 0.0 [-18.378   4.369  -2.461]
atom H 0.0 [-19.004   6.306  -4.733]
atom CB 0.0 [-17.107   4.628  -5.342]
atom CD1 0.0 [-15.815   5.71   -7.255]
atom CG1 0.0 

### 2. Feature extraction (single conformation)
The table below summarizes the single-conformation features that we are going to compute:

| Feature       | Dimension     |
| :------------- |:-------------|
| Radius of gyration of the structure | Scalar |
| Relative accessible surface area (ASA) for each residue | Vector of size $N$      |
| Secondary structure (SS) for each residue and class | Vector of size $N$       |
|Residue distance matrix considering Cα atoms | Matrix of shape $N \times N$ (symmetric) |

where $N$ is the number of residues in one conformation.

* Claudia --> 1, 4
* Matteo --> 2
* Leonardo --> 3

TODO: read some of Lazar's work.

### 3. Graph extraction
Nodes are a subset of representative conformations and edges represent their similarity (or distance). The similarity is calculated by combining all feature values. Representative conformations are found by unsupervised clustering, and the number of clusters is identified automatically.

### 4. PyMOL image
A PyMOL image including the PDB structures corresponding to the graph nodes.
The selected structures are centered (translated) on the position(s) with the lowest feature
variance. Residues in each structure are displayed (color, size, ...) based on their
feature variability within the ensemble.