**CMP585 - Structural Bioinformatics**

Cristian Lopes

*Assignment 2*

In [1]:
import pandas as pd
import numpy as np

# Load Data

In [2]:
def read_molecule(path):
    """Parse the ATOM records of a PDB file into a DataFrame.

    Only lines that start with ``ATOM`` are read; every other line is
    skipped. Fields are extracted by fixed character positions of the line.

    Args:
        path: Path of the PDB text file to read.

    Returns:
        A DataFrame with one row per ATOM line and the columns
        ``atom_name``, ``residue_name``, ``residue_seq_number`` (kept as the
        stripped string from the file), ``x``, ``y`` and ``z`` (floats).
        The columns are present even when the file has no ATOM line.

    Raises:
        ValueError: If a coordinate field cannot be converted to ``float``.
    """
    columns = ['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            atoms.append({'atom_name': line[12:16].strip(),
                          'residue_name': line[17:20].strip(),
                          'residue_seq_number': line[22:26].strip(),
                          # float() ignores surrounding whitespace itself.
                          'x': float(line[30:38]),
                          'y': float(line[38:46]),
                          'z': float(line[46:54])})
    # Pinning the columns keeps the schema stable for an empty atom list.
    return pd.DataFrame(atoms, columns=columns)

In [3]:
# Parse the ATOM lines of the PDB file into a DataFrame with one row per atom.
molecule = read_molecule('./data/1crn.pdb')

In [4]:
# Preview the first 20 parsed atoms.
molecule.head(20)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,THR,1,17.047,14.099,3.625
1,CA,THR,1,16.967,12.784,4.338
2,C,THR,1,15.685,12.755,5.133
3,O,THR,1,15.268,13.825,5.594
4,CB,THR,1,18.17,12.703,5.337
5,OG1,THR,1,19.334,12.829,4.463
6,CG2,THR,1,18.15,11.546,6.304
7,N,THR,2,15.115,11.555,5.265
8,CA,THR,2,13.856,11.469,6.066
9,C,THR,2,14.164,10.785,7.379


# Q1) Secondary to Tertiary Structure

## Symbol of each residue

First, we map each one-letter residue symbol to its three-letter residue name.

In [5]:
# One-letter amino-acid symbol -> three-letter residue name, covering the
# 20 standard amino acids so that any standard sequence can be translated.
symbol_to_residue_name = {
    'T': 'THR', 'C': 'CYS', 'P': 'PRO', 'S': 'SER', 'I': 'ILE',
    'V': 'VAL', 'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'F': 'PHE',
    'L': 'LEU', 'G': 'GLY', 'E': 'GLU', 'D': 'ASP', 'Y': 'TYR',
    'Q': 'GLN', 'H': 'HIS', 'K': 'LYS', 'M': 'MET', 'W': 'TRP',
}

## Position of each atom inside the residue 

To estimate the position of each atom inside a residue, we average the coordinates of each atom over all residues of the same type.

In [6]:
# Mean coordinate of every atom name within every residue type.
# The coordinate columns are selected explicitly: `residue_seq_number` holds
# strings, and averaging it raises a TypeError on pandas >= 2.0 instead of
# being dropped silently.
residues = molecule.groupby(by=['residue_name', 'atom_name'])[['x', 'y', 'z']].mean()

In [7]:
# Preview the mean coordinates per (residue_name, atom_name) pair.
residues.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,z
residue_name,atom_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALA,C,10.6928,10.9064,8.3516
ALA,CA,11.111,11.299,8.2574
ALA,CB,11.6866,11.8166,8.3848
ALA,N,10.9246,11.2466,8.011
ALA,O,10.6648,10.5226,8.5476
ARG,C,6.768,4.5845,2.7335
ARG,CA,7.1955,5.549,3.812
ARG,CB,8.6085,5.2635,4.287
ARG,CD,9.5875,6.823,3.3145
ARG,CG,9.2345,5.647,4.1775


Then we translate the atoms of each residue type so that its alpha carbon (CA) lies at the origin; that is, the alpha carbon is the reference point of each residue type.

In [8]:
# Re-centre every residue type on its alpha carbon: pivot the atom names into
# columns once, then subtract the CA coordinate of each residue type from all
# of its atoms, one axis at a time (aligned on residue_name).
ca_by_residue = residues.unstack(level=1)
for axis in ('x', 'y', 'z'):
    residues[axis] = residues[axis] - ca_by_residue[axis]['CA']

In [9]:
# Preview the coordinates after the CA subtraction (each CA row is now 0, 0, 0).
residues.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,z
residue_name,atom_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ALA,C,-0.4182,-0.3926,0.0942
ALA,CA,0.0,0.0,0.0
ALA,CB,0.5756,0.5176,0.1274
ALA,N,-0.1864,-0.0524,-0.2464
ALA,O,-0.4462,-0.7764,0.2902
ARG,C,-0.4275,-0.9645,-1.0785
ARG,CA,0.0,0.0,0.0
ARG,CB,1.413,-0.2855,0.475
ARG,CD,2.392,1.274,-0.4975
ARG,CG,2.039,0.098,0.3655


We now have a dictionary of residues containing the position of each atom inside the residue.

**Note: I calculated the position of each atom only to give the atom positions some meaning; of course it is not correct: it does not take into account the rotation of each amino acid, and it uses the amino acids of only one protein to estimate the mean position of each atom inside the amino acid.**

## Secondary structure to tertiary structure

The next step is to replace each symbol of the secondary structure (e.g. TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN) with its corresponding residue. We also calculate the translation to apply to each residue so that the atom positions are meaningful along the chain (placing the Nitrogen of each residue at a distance of 1 ångström from the Alpha Carbon of the previous residue).

In [10]:
# Offset to apply equally along x, y and z so that the resulting Euclidean
# distance is exactly 1: sqrt(3 * (1 / sqrt(3)) ** 2) == 1.
desired_axis_distance = 1 / np.sqrt(3)


def _euclidean_distance(ca_position, n_position):
    """Return the Euclidean distance between two coordinate vectors."""
    return np.sqrt(np.sum((ca_position - n_position) ** 2))


def _calculate_next_residue_translation(previous_residue, next_residue,
                                        bond_length=1.0):
    """Return the translation that chains `next_residue` to `previous_residue`.

    After adding the returned translation to `next_residue`, its ``N`` row
    lies `bond_length` away from the ``CA`` row of `previous_residue`, with
    the same offset along every axis.

    Args:
        previous_residue: Coordinates of the already placed residue; must
            support ``.loc['CA']``.
        next_residue: Coordinates of the residue to place; must support
            ``.loc['N']``.
        bond_length: Desired distance between the two atoms. Defaults to 1.0,
            the distance that was previously hard-coded.

    Returns:
        The per-axis translation to add to every atom of `next_residue`.

    Raises:
        AssertionError: If the resulting distance is not within 1e-3 of
            `bond_length` (for example when a coordinate is NaN).
    """
    # Scale the unit per-axis offset to the requested distance.
    axis_offset = bond_length * desired_axis_distance
    axis_distance = next_residue.loc['N'] - previous_residue.loc['CA']
    translation = axis_offset - axis_distance
    peptide_bond_distance = _euclidean_distance(previous_residue.loc['CA'],
                                                next_residue.loc['N'] + translation)
    # Sanity check of the construction above; NaN coordinates fail it too.
    assert abs(peptide_bond_distance - bond_length) < 1e-3, (
        'unexpected CA-N distance: %r' % (peptide_bond_distance,))
    return translation

Once we have a way to calculate the translation of the next residue in the chain given the previous one, we can implement the routine that generates the tertiary structure of a protein from its secondary structure.

**Note: I used this translation calculation only to give the atom positions some meaning; of course it is not correct: it does not take into account the rotation of each amino acid, it uses an arbitrary distance for the peptide bond, and it does not take into account the Phi and Psi angles.**

In [11]:
# One symbol per residue, in chain order; each character is looked up in
# symbol_to_residue_name when the structure is generated.
# NOTE(review): despite the variable name, this looks like a one-letter
# amino-acid sequence rather than secondary-structure labels — confirm.
secondary_structure = 'TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN'

In [12]:
def _infer_tertiary_structure(secondary_structure, residues_df):
    """Build an atom table for a chain of residues from per-residue templates.

    Every symbol is mapped to its residue name via `symbol_to_residue_name`
    and the matching atoms are taken from `residues_df`. The first residue is
    used as is; each following residue is shifted by the translation returned
    by `_calculate_next_residue_translation` for the previously placed one.

    Args:
        secondary_structure: Iterable of one-letter residue symbols.
        residues_df: DataFrame indexed by (residue_name, atom_name) with the
            coordinate columns ``x``, ``y`` and ``z``.

    Returns:
        A DataFrame with the columns ``atom_name``, ``residue_name``,
        ``residue_seq_number`` (0-based position in the input), ``x``, ``y``
        and ``z``; it has no rows when `secondary_structure` is empty.

    Raises:
        KeyError: If a symbol is unknown or its residue is missing from
            `residues_df`.
    """
    columns = ['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']

    # Collect one frame per residue and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole table
    # on every iteration.
    placed_residues = []
    previous_residue = None

    for i, symbol in enumerate(secondary_structure):
        residue_name = symbol_to_residue_name[symbol]
        residue_3D = residues_df.loc[residue_name]
        if previous_residue is None:
            # The first residue stays where the template puts it.
            residue_3D = residue_3D.copy()
        else:
            residue_3D = residue_3D + _calculate_next_residue_translation(
                previous_residue, residue_3D)
        previous_residue = residue_3D

        atoms = residue_3D.reset_index()
        atoms['residue_name'] = residue_name
        atoms['residue_seq_number'] = i
        placed_residues.append(atoms[columns])

    if not placed_residues:
        # pd.concat rejects an empty list; keep the documented schema.
        return pd.DataFrame(columns=columns)
    return pd.concat(placed_residues, ignore_index=True)

In [13]:
# Generate the atom table for the whole sequence from the per-residue coordinates.
protein = _infer_tertiary_structure(secondary_structure, residues)

In [14]:
# Preview the first 20 atoms of the generated structure.
protein.head(20)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,C,THR,0,-0.240167,-0.037167,0.801833
1,CA,THR,0,0.0,0.0,0.0
2,CB,THR,0,0.147333,-0.027,0.160167
3,CG2,THR,0,0.073,0.057833,0.250167
4,N,THR,0,0.3885,0.220833,-0.9045
5,O,THR,0,-0.154167,-0.066667,1.378667
6,OG1,THR,0,0.593833,-0.321167,-0.5525
7,C,THR,1,-0.051316,0.31935,2.283684
8,CA,THR,1,0.18885,0.356517,1.48185
9,CB,THR,1,0.336184,0.329517,1.642017


Done!