**CMP585 - Structural Bioinformatics**

Cristian Lopes

*Assignment 4*

In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
import time

# Load Data

In [2]:
def read_molecule(path):
    """Parse the ATOM records of a PDB file into a DataFrame.

    Only lines that start with ``ATOM`` are read; every other record is
    skipped. Fields are taken from the fixed-width PDB columns.

    Parameters
    ----------
    path : str
        Path of the PDB file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ATOM record with the columns ``atom_name``, ``element``,
        ``residue_name``, ``residue_seq_number`` (kept as a string), ``x``,
        ``y`` and ``z`` (floats). A file without ATOM records yields an empty
        frame that still carries these columns.

    Raises
    ------
    ValueError
        If a coordinate field of an ATOM record cannot be parsed as a float.
    """
    columns = ['atom_name', 'element', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
                atoms.append({'atom_name': line[12:16].strip(),
                              # The element symbol is right-justified in PDB
                              # columns 77-78, i.e. line[76:78]; reading only
                              # line[77:78] truncates two-letter elements.
                              'element': line[76:78].strip(),
                              'residue_name': line[17:20].strip(),
                              'residue_seq_number': line[22:26].strip(),
                              'x': float(line[30:38].strip()),
                              'y': float(line[38:46].strip()),
                              'z': float(line[46:54].strip())})
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(atoms, columns=columns)

In [3]:
# Parse the ATOM records of ./data/1rop.pdb into a DataFrame (one row per atom).
molecule = read_molecule('./data/1rop.pdb')

In [4]:
# Preview the first rows (at most 20) of the parsed structure.
molecule.head(20)

Unnamed: 0,atom_name,element,residue_name,residue_seq_number,x,y,z
0,N,N,MET,1,31.007,2.29,18.034
1,CA,C,MET,1,32.39,2.582,17.546
2,C,C,MET,1,32.808,1.432,16.618
3,O,O,MET,1,32.375,0.28,16.846
4,CB,C,MET,1,33.271,2.428,18.81
5,CG,C,MET,1,34.257,3.54,18.961
6,SD,S,MET,1,35.104,3.15,20.545
7,CE,C,MET,1,33.734,3.285,21.683
8,N,N,THR,2,33.714,1.715,15.687
9,CA,C,THR,2,34.312,0.62,14.917


# Q1 - a) Peptide Bond

First of all, we map the one-letter symbol of each amino acid to its three-letter residue name.

In [5]:
# One-letter amino-acid symbol -> three-letter residue name, as used in the
# residue-name column of PDB ATOM records.
symbol_to_residue_name = {
    'T': 'THR',  # threonine
    'C': 'CYS',  # cysteine
    'P': 'PRO',  # proline
    'S': 'SER',  # serine
    'I': 'ILE',  # isoleucine
    'V': 'VAL',  # valine
    'A': 'ALA',  # alanine
    'R': 'ARG',  # arginine
    'N': 'ASN',  # asparagine
    'F': 'PHE',  # phenylalanine
    'L': 'LEU',  # leucine
    'G': 'GLY',  # glycine
    'E': 'GLU',  # glutamic acid
    'D': 'ASP',  # aspartic acid
    'Y': 'TYR',  # tyrosine
    'M': 'MET',  # methionine
    'K': 'LYS',  # lysine
    'Q': 'GLN',  # glutamine
    'H': 'HIS',  # histidine
    'W': 'TRP',  # tryptophan
}

## Load Amino Acids

Then, we load the amino acids from the Protein Data Bank.

In [6]:
AMINOACID_PATH = './data/amino_acids/'

# Read every template file once and concatenate in a single call:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# each iteration. Sorting the paths makes the row order independent of the
# order in which the filesystem happens to list the files.
aminoacid_frames = [
    read_molecule(pdb_path)
    for pdb_path in sorted(glob.glob(AMINOACID_PATH + "*.pdb"))
]

# Each template keeps its own 0-based row index, as before. pd.concat rejects
# an empty list, so fall back to an empty frame when no file matched.
aminoacids = pd.concat(aminoacid_frames) if aminoacid_frames else pd.DataFrame()

At this point, we have a DataFrame of amino acids containing the position of each atom inside each amino acid.

## Translation and rotation

The next step is to replace each symbol of the primary structure by its corresponding residue. First, we define a translation and a rotation function to move and rotate the next amino acid given the position of the previous amino acid.

In [7]:
def translate(aminoacid, translation):
    """Shift every atom of ``aminoacid`` by ``translation``.

    The ``x``, ``y`` and ``z`` columns are overwritten on the frame that was
    passed in, and that same frame is returned.

    Parameters
    ----------
    aminoacid : pandas.DataFrame
        Atom table with ``x``, ``y`` and ``z`` columns.
    translation : array-like of length 3
        Offset added to the (x, y, z) coordinates of every row.

    Returns
    -------
    pandas.DataFrame
        ``aminoacid`` itself, with shifted coordinates.
    """
    coordinate_columns = ['x', 'y', 'z']
    shifted = aminoacid[coordinate_columns] + translation
    aminoacid[coordinate_columns] = shifted
    return aminoacid

In [8]:
def translation_to_origin(aminoacid, ref_atom='N'):
    """Move ``aminoacid`` so that its reference atom sits at the origin.

    Parameters
    ----------
    aminoacid : pandas.DataFrame
        Atom table with ``atom_name``, ``x``, ``y`` and ``z`` columns.
    ref_atom : str, optional
        Name of the atom whose position is subtracted from every row.

    Returns
    -------
    tuple
        ``(ref_translation, aminoacid)``: the flattened coordinates of the
        rows named ``ref_atom`` as they were before the move, and the frame
        returned by ``translate`` after shifting by their negation.
    """
    is_reference = aminoacid.atom_name == ref_atom
    # Boolean selection copies the rows, so this offset is not affected by
    # the shift applied below.
    ref_translation = aminoacid[is_reference][['x', 'y', 'z']].values.reshape(-1)
    centred = translate(aminoacid, -ref_translation)
    return ref_translation, centred

def rotate(aminoacid, angle_x, angle_y, angle_z):
    """Rotate an amino acid about its reference atom.

    The frame is first moved with ``translation_to_origin`` (using its
    default reference atom), rotated with ``R = Rx . Ry . Rz``, and then
    moved back by the same offset, so the reference atom keeps its position.

    Parameters
    ----------
    aminoacid : pandas.DataFrame
        Atom table with ``atom_name``, ``x``, ``y`` and ``z`` columns.
    angle_x, angle_y, angle_z : float
        Rotation angles about the x, y and z axes, in radians (they are fed
        directly to ``np.cos`` / ``np.sin``).

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``translate`` with rotated coordinates.
    """
    cos_x, sin_x = np.cos(angle_x), np.sin(angle_x)
    cos_y, sin_y = np.cos(angle_y), np.sin(angle_y)
    cos_z, sin_z = np.cos(angle_z), np.sin(angle_z)

    Rx = [[1, 0, 0], [0, cos_x, -sin_x], [0, sin_x, cos_x]]
    Ry = [[cos_y, 0, sin_y], [0, 1, 0], [-sin_y, 0, cos_y]]
    Rz = [[cos_z, -sin_z, 0], [sin_z, cos_z, 0], [0, 0, 1]]
    # Applied to column vectors, this rotates about z first, then y, then x.
    R = np.dot(np.dot(Rx, Ry), Rz)

    # The debugging display() calls were removed: display is not imported in
    # this notebook, so they raised NameError outside IPython and flooded the
    # output on every call.
    ref_translation, aminoacid = translation_to_origin(aminoacid)
    # Coordinates are stored one atom per row, hence the transposes.
    aminoacid[['x', 'y', 'z']] = R.dot(aminoacid[['x', 'y', 'z']].values.T).T
    aminoacid = translate(aminoacid, ref_translation)

    return aminoacid

## Peptide Bond  Formation

A peptide bond is a chemical bond formed between two molecules when the carboxyl group of one molecule reacts with the amino group of the other molecule, releasing a molecule of water (H2O). So, given the position of the Oxygen (`OC`) of the carboxyl group of the previous amino acid and the position of the Nitrogen (`N`) of the next amino acid, we translate the next amino acid so that its Nitrogen lands on the Oxygen position of the previous amino acid. The atoms that leave as water (`OC`, `HC` and `1H`) are then removed.

In [9]:
# Primary structure to assemble: one one-letter amino-acid symbol per residue
# (each symbol is looked up in symbol_to_residue_name).
sequence = 'MTKQEKTALNMARFIRSQTLTLLEKLNELDADEQADICESLHDHADELYRSCLARFGDDGENL'

In [10]:
def _calculate_translation(previous_aminoacid, next_aminoacid):
    o_pos = previous_aminoacid[previous_aminoacid.atom_name == 'OC'][['x', 'y', 'z']].values.reshape(-1)
    n_pos = next_aminoacid[next_aminoacid.atom_name == 'N'][['x', 'y', 'z']].values.reshape(-1)
    return o_pos - n_pos       

def remove_H2O(aminoacid):
    """Drop the atoms named ``OC``, ``HC`` and ``1H`` from an amino acid.

    Parameters
    ----------
    aminoacid : pandas.DataFrame
        Atom table with an ``atom_name`` column.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels; the input
        frame is left untouched.
    """
    leaving_atoms = ['OC', 'HC', '1H']
    keep = ~aminoacid.atom_name.isin(leaving_atoms)
    return aminoacid[keep]

def _assemble_3D_structure(sequence, aminoacids_df):
    """Chain amino-acid templates into a single atom table.

    For every symbol of ``sequence`` the matching template rows are copied
    from ``aminoacids_df``. Each residue after the first is translated so
    that its ``N`` atom coincides with the ``OC`` atom of the previous
    residue (as positioned before its own leaving atoms were dropped), and
    ``remove_H2O`` is applied to every residue before it is stored. No
    rotation is applied.

    Parameters
    ----------
    sequence : str
        One-letter amino-acid symbols, all keys of ``symbol_to_residue_name``.
    aminoacids_df : pandas.DataFrame
        Template atoms with the columns ``atom_name``, ``element``,
        ``residue_name``, ``residue_seq_number``, ``x``, ``y`` and ``z``.

    Returns
    -------
    pandas.DataFrame
        All kept atoms with a fresh 0-based index. ``residue_seq_number`` is
        the 0-based position in ``sequence``, stored as a string. An empty
        ``sequence`` yields an empty frame with the expected columns.

    Raises
    ------
    KeyError
        If ``sequence`` contains a symbol missing from
        ``symbol_to_residue_name``.
    ValueError
        If ``aminoacids_df`` has no template rows for a residue.
    """
    columns = ['atom_name', 'element', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']
    residues = []
    previous_aminoacid = None

    for i, r in enumerate(sequence):
        residue_name = symbol_to_residue_name[r]
        aminoacid = aminoacids_df[aminoacids_df.residue_name == residue_name].copy()
        if aminoacid.empty:
            # Fail with a clear message instead of an obscure broadcasting
            # error at the next translation step.
            raise ValueError('no template atoms found for residue %s' % residue_name)

        if previous_aminoacid is not None:
            translation = _calculate_translation(previous_aminoacid, aminoacid)
            aminoacid = translate(aminoacid, translation)

        aminoacid['residue_seq_number'] = str(i)

        # Keep the untrimmed residue: the next one is anchored on its OC atom,
        # which remove_H2O drops from the stored copy.
        previous_aminoacid = aminoacid
        residues.append(remove_H2O(aminoacid))

    if not residues:
        return pd.DataFrame(data=None, columns=columns)

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # re-copied the whole protein on every residue.
    return pd.concat(residues, sort=True, ignore_index=True)

In [11]:
# Chain the residue templates along `sequence`; each residue after the first
# is shifted so its N atom lands on the previous residue's OC atom.
protein = _assemble_3D_structure(sequence, aminoacids)

In [12]:
# Preview the first atoms of the assembled structure.
protein.head()

Unnamed: 0,atom_name,element,residue_name,residue_seq_number,x,y,z
0,N,N,MET,0,0.071,-0.215,0.0
1,CA,C,MET,0,1.519,-0.024,0.0
2,C,C,MET,0,1.871,1.444,0.0
3,O,O,MET,0,1.002,2.328,0.0
4,CB,C,MET,0,2.146,-0.745,-1.225


## Save to .pdb

In [13]:
def write_atom_to_file(atom_row, aminoacid_seq_number, fd):
    """Write one atom as a fixed-width PDB ATOM record, skipping hydrogens.

    The module-level counter ``atom_seq_number`` is incremented on every
    call, including calls for hydrogens that are not written, and its new
    value is used as the atom serial number.

    Parameters
    ----------
    atom_row : pandas.Series
        Row with ``atom_name``, ``element``, ``residue_name``, ``x``, ``y``
        and ``z``.
    aminoacid_seq_number : int
        Residue sequence number written to the record.
    fd : file-like
        Open text stream; the record is written with ``fd.write``.

    Returns
    -------
    None
    """
    global atom_seq_number
    atom_seq_number += 1

    atom_name = '%s' % atom_row.atom_name
    element = '%s' % atom_row.element

    # Decide on the element rather than on "'H' in atom_name": the latter also
    # dropped heavy atoms such as TYR OH, ARG NH1/NH2 and TRP CH2. When the
    # element is blank, fall back to the name with any leading digit stripped
    # (hydrogen names look like '1H' or 'HB2').
    if element:
        is_hydrogen = element.upper() == 'H'
    else:
        is_hydrogen = atom_name.lstrip('0123456789').upper().startswith('H')
    if is_hydrogen:
        return

    # Field widths follow the PDB ATOM layout. The serial and element fields
    # are padded to their full width (5 and 2) so that serials >= 1000 or
    # two-letter elements do not shift the columns; for shorter values the
    # output is the same as with the narrower padding used previously.
    template_row = (
        'ATOM  '
        + ('%d' % atom_seq_number).rjust(5)
        + ' '
        + atom_name.rjust(4)
        + ' '
        + ('%s' % atom_row.residue_name).rjust(3)
        + ' A'
        + ('%d' % aminoacid_seq_number).rjust(4)
        + '    '
        + ('%.3f' % atom_row.x).rjust(8)
        + ('%.3f' % atom_row.y).rjust(8)
        + ('%.3f' % atom_row.z).rjust(8)
        + '  1.00  5.99'
        + ' ' * 10
        + element.rjust(2)
        + '  \n'
    )
    # Write to the stream that was passed in, not to a global file handle.
    fd.write(template_row)

In [14]:
# Module-level counter that write_atom_to_file increments through `global`;
# resetting it here restarts the atom numbering every time this cell is run.
atom_seq_number = 0

# Write the assembled protein residue by residue, in sequence order.
with open('inferred_molecule.pdb', 'w') as f:
    for idx in range(len(sequence)):
        # residue_seq_number was stored as a string, hence str(idx).
        for _, atom_row in protein[protein.residue_seq_number == str(idx)].iterrows():
            write_atom_to_file(atom_row, idx, f)

Done!