**CMP585 - Structural Bioinformatics**

Cristian Lopes

*Assignment 2*

In [1]:
import pandas as pd
import numpy as np
import glob

# Load Data

In [2]:
def read_molecule(path):
    """Parse the ATOM records of a PDB file into a DataFrame.

    Only lines starting with ``ATOM`` are read; every other record is
    ignored. Fields are taken from the fixed-width positions of the line
    (0-based Python slices): atom name ``[12:16]``, residue name
    ``[17:20]``, residue sequence number ``[22:26]`` and the x/y/z
    coordinates ``[30:38]``, ``[38:46]``, ``[46:54]``.

    Parameters
    ----------
    path : str
        Path of the PDB file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ATOM record with the columns ``atom_name``,
        ``residue_name``, ``residue_seq_number`` (kept as a string) and the
        float coordinates ``x``, ``y``, ``z``. The columns are present even
        when the file contains no ATOM record.

    Raises
    ------
    ValueError
        If a coordinate field of an ATOM record cannot be parsed as a float.
    """
    columns = ['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
                atoms.append({'atom_name': line[12:16].strip(),
                              'residue_name': line[17:20].strip(),
                              'residue_seq_number': line[22:26].strip(),
                              'x': float(line[30:38].strip()),
                              'y': float(line[38:46].strip()),
                              'z': float(line[46:54].strip())})
    # Passing the columns explicitly keeps the schema stable for files without
    # ATOM records, so downstream column access does not fail on an empty frame.
    return pd.DataFrame(atoms, columns=columns)

In [3]:
# Parse the ATOM records of the 1crn PDB file into a DataFrame.
molecule = read_molecule('./data/1crn.pdb')

In [4]:
# Preview the first 20 parsed atoms.
molecule.head(20)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,THR,1,17.047,14.099,3.625
1,CA,THR,1,16.967,12.784,4.338
2,C,THR,1,15.685,12.755,5.133
3,O,THR,1,15.268,13.825,5.594
4,CB,THR,1,18.17,12.703,5.337
5,OG1,THR,1,19.334,12.829,4.463
6,CG2,THR,1,18.15,11.546,6.304
7,N,THR,2,15.115,11.555,5.265
8,CA,THR,2,13.856,11.469,6.066
9,C,THR,2,14.164,10.785,7.379


# Q1) Primary to Tertiary Structure

## Symbol of each amino acid

First of all, we define the symbol of each amino acid.

In [5]:
# One-letter amino-acid symbol -> three-letter residue name, as used in the
# residue-name field of PDB ATOM records. All 20 standard amino acids are
# listed so that any standard sequence can be looked up without a KeyError.
symbol_to_residue_name = {
    'T': 'THR', 'C': 'CYS', 'P': 'PRO', 'S': 'SER', 'I': 'ILE',
    'V': 'VAL', 'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'F': 'PHE',
    'L': 'LEU', 'G': 'GLY', 'E': 'GLU', 'D': 'ASP', 'Y': 'TYR',
    'Q': 'GLN', 'H': 'HIS', 'K': 'LYS', 'M': 'MET', 'W': 'TRP'
}

## Load Amino Acids 

Then, we load the amino acids from the Protein Data Bank.

In [6]:
AMINOACID_PATH = './data/amino_acids/'

# Parse every amino-acid PDB file once and concatenate the results in a single
# call: growing a DataFrame inside a loop is quadratic, and DataFrame.append
# no longer exists in pandas >= 2.0. Paths are sorted so the row order does
# not depend on the order in which the file system lists the files.
aminoacid_frames = [read_molecule(pdb_file)
                    for pdb_file in sorted(glob.glob(AMINOACID_PATH + "*.pdb"))]

# pd.concat raises on an empty list, so keep an empty DataFrame when no file
# matches the pattern. Each file's own row index is preserved (no re-indexing).
aminoacids = pd.concat(aminoacid_frames) if aminoacid_frames else pd.DataFrame()

In [7]:
# Preview the first 20 atoms of the loaded amino-acid table.
aminoacids.head(20)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,ALA,1,0.039,-0.028,0.0
1,CA,ALA,1,1.499,-0.043,0.0
2,C,ALA,1,2.055,1.361,0.0
3,O,ALA,1,1.321,2.356,0.011
4,CB,ALA,1,1.956,-0.866,-1.217
5,1H,ALA,1,-0.524,0.894,0.0
6,2H,ALA,1,-0.543,-0.938,0.0
7,HA,ALA,1,1.847,-0.534,0.928
8,1HB,ALA,1,3.058,-0.939,-1.274
9,2HB,ALA,1,1.571,-1.903,-1.181


At this point, we have a dictionary of amino acids (stored as a DataFrame) containing the position of each atom inside each amino acid.

## Primary structure to tertiary structure

The next step is to replace each symbol of the primary structure by its corresponding residue.

We also calculate the translation needed for each residue so that the atom positions are meaningful (assuming a distance of 1.33 ångström between the carbon and the nitrogen of the peptide bond between two residues).

In [8]:
# Per-axis offset d such that a displacement of (d, d, d) has a Euclidean
# length of 1.33, the C-N distance used here: 3 * d**2 == 1.33**2.
desired_axis_distance = np.sqrt((1.33 * 1.33) / 3)
    
def _euclidean_distance(c_position, n_position):
    return np.sqrt(np.sum(((c_position - n_position) ** 2).values.reshape(-1)))

def _calculate_next_residue_translation(previous_aminoacid, next_aminoacid):
    """Compute the x/y/z shift to apply to ``next_aminoacid``.

    After the shift, the atom named 'N' of ``next_aminoacid`` sits at an
    offset of ``desired_axis_distance`` on every axis from the atom named
    'C' of ``previous_aminoacid``.

    Parameters
    ----------
    previous_aminoacid, next_aminoacid : pandas.DataFrame
        Atom tables with at least the columns ``atom_name``, ``x``, ``y``
        and ``z``.

    Returns
    -------
    pandas.Series
        The translation, indexed by ``x``, ``y`` and ``z``.

    Raises
    ------
    AssertionError
        If the resulting C-N distance is not strictly between 1.329 and 1.331.
    """
    coords = ['x', 'y', 'z']
    is_c_atom = previous_aminoacid.atom_name == 'C'
    is_n_atom = next_aminoacid.atom_name == 'N'

    # Reset both indexes so the two single-row frames align on label 0.
    c_xyz = previous_aminoacid[is_c_atom][coords].reset_index(drop=True)
    n_xyz = next_aminoacid[is_n_atom][coords].reset_index(drop=True)

    translation = desired_axis_distance - (n_xyz - c_xyz)
    bond_length = _euclidean_distance(c_xyz, n_xyz + translation)

    # Sanity check on the distance obtained once the shift is applied.
    assert 1.329 < bond_length < 1.331
    return translation.loc[0]

Once we have a way to calculate the translation of the next amino acid in the chain given the previous one, we can implement the routine to generate the tertiary structure of a protein given the primary structure.

**Note: I only used this translation calculation to give meaning to the atoms' positions, but of course it is not right: it uses a mean distance for the C-N bond, it does not take into account the rotation of each amino acid, and it does not take into account the Phi and Psi angles.**

In [9]:
# One-letter amino-acid sequence to build (46 residues); presumably the
# sequence of the 1crn molecule loaded earlier — TODO confirm.
primary_structure = 'TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN'

In [10]:
def _infer_tertiary_structure(primary_structure, aminoacids_df):
    protein = pd.DataFrame(data=None, columns=['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z'])
    
    previous_aminoacid = None
    zero_translation = pd.Series(data=[0.0, 0.0, 0.0], index=['x', 'y', 'z'])
    
    for i, r in enumerate(primary_structure):
        aminoacid = aminoacids_df[aminoacids_df.residue_name == symbol_to_residue_name[r]].copy()
        translation = _calculate_next_residue_translation(
            previous_aminoacid, aminoacid) if previous_aminoacid is not None else zero_translation
        
        aminoacid.loc[:, ['x', 'y', 'z']] = aminoacid.loc[:, ['x', 'y', 'z']] + translation
        aminoacid['residue_seq_number'] = i
        
        previous_aminoacid = aminoacid
        protein = protein.append(aminoacid, sort=True, ignore_index=True)
    return protein

In [11]:
# Build the chained atom table for the sequence from the loaded amino-acid templates.
protein = _infer_tertiary_structure(primary_structure, aminoacids)

In [12]:
# Preview the first 20 atoms of the generated structure.
protein.head(20)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,THR,0,0.08,-0.083,0.0
1,CA,THR,0,1.539,-0.033,0.0
2,C,THR,0,2.032,1.394,0.0
3,O,THR,0,1.268,2.339,0.0
4,CB,THR,0,2.116,-0.818,-1.226
5,OG1,THR,0,1.743,-2.188,-1.162
6,CG2,THR,0,3.654,-0.842,-1.361
7,1H,THR,0,-0.523,0.813,0.0
8,2H,THR,0,-0.461,-1.018,0.0
9,HA,THR,0,1.904,-0.508,0.929


Done!