# CMP585 - Structural Bioinformatics

Cristian Lopes

# Assignment 1: Q2

In [1]:
import pandas as pd
import numpy as np

# Load Data

## Read Molecules

In [2]:
# Input PDB files: one reference structure and six numbered model structures (01-06).
reference_path = 'data/reference.pdb'
model_paths = ['data/1ACW-0%s.pdb' % model_number for model_number in range(1, 7)]

In [3]:
def read_molecule(path):
    """Read the ATOM records of a PDB-style text file into a DataFrame.

    Every line that starts with ``ATOM`` contributes one row, built from
    fixed character slices of that line.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ATOM line with the columns ``atom_name``,
        ``residue_name``, ``residue_seq_number`` (kept as strings) and
        ``x``, ``y``, ``z`` (floats). The columns are present even when the
        file contains no ATOM lines, so downstream column access keeps
        working on an empty result.

    Raises
    ------
    ValueError
        If a coordinate field of an ATOM line cannot be parsed as a float.
    """
    columns = ['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
                atoms.append({'atom_name': line[12:16].strip(),
                              'residue_name': line[17:20].strip(),
                              'residue_seq_number': line[22:26].strip(),
                              # float() ignores surrounding whitespace itself
                              'x': float(line[30:38]),
                              'y': float(line[38:46]),
                              'z': float(line[46:54])})
    # Explicit columns keep the schema stable when `atoms` is empty.
    return pd.DataFrame(atoms, columns=columns)

### Reference Molecule

Given that all model molecules have 29 residues, we consider only the first 29 residues from the reference molecule.

In [4]:
# Keep only the first 410 atom rows, which this notebook takes to be the first 29 residues.
reference_molecule = read_molecule(reference_path).head(410)

In [5]:
# Preview the first five atoms of the reference molecule.
reference_molecule.head(n=5)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,VAL,1,0.965,0.298,-0.467
1,CA,VAL,1,1.811,0.25,-1.701
2,C,VAL,1,3.29,0.4,-1.32
3,O,VAL,1,3.628,1.053,-0.346
4,CB,VAL,1,1.417,1.384,-2.664


### Model Molecules

In [6]:
# Parse every model file, preserving the order of `model_paths`.
model_molecules = list(map(read_molecule, model_paths))

In [7]:
# Preview the first five atoms of the first model molecule.
model_molecules[0].head(n=5)

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,VAL,1,-1.352,-2.044,0.0
1,H1,VAL,1,-1.99,-1.261,0.0
2,H2,VAL,1,-1.446,-2.54,0.875
3,H3,VAL,1,-1.446,-2.54,-0.875
4,CA,VAL,1,0.0,-1.523,0.0


# a) Calculate RMSD

In [8]:
def euclidean_distance(p1, p2):
    """Return the Euclidean distance between two points.

    `p1` and `p2` are sequences of coordinates; they are walked in parallel
    and pairing stops at the end of the shorter one.
    """
    per_axis_squares = []
    for coord_a, coord_b in zip(p1, p2):
        delta = coord_a - coord_b
        per_axis_squares.append(delta ** 2)
    total = np.sum(per_axis_squares)
    return np.sqrt(total)

def rmsd(distances):
    """Return the root of the mean of the squared values in `distances`."""
    squares = []
    for distance in distances:
        squares.append(distance ** 2)
    mean_square = np.mean(squares)
    return np.sqrt(mean_square)

def molecules_rmsd(ref_molecule, model_molecule):
    """Compute the RMSD between the matching atoms of two molecules.

    Atoms are paired with an inner merge on ``residue_seq_number`` and, when
    both frames carry it, also on ``atom_name``. Pairing on the residue number
    alone would match every atom of a residue with every atom of the same
    residue in the other molecule, which is only correct when each frame holds
    a single atom per residue.

    Parameters
    ----------
    ref_molecule, model_molecule : pandas.DataFrame
        Frames with ``residue_seq_number``, ``x``, ``y`` and ``z`` columns
        (and optionally ``atom_name``).

    Returns
    -------
    float
        Square root of the mean squared distance over the paired atoms, or
        NaN when no atoms could be paired.
    """
    merge_keys = ['residue_seq_number']
    if 'atom_name' in ref_molecule.columns and 'atom_name' in model_molecule.columns:
        merge_keys.append('atom_name')

    molecules_df = pd.merge(ref_molecule, model_molecule, on=merge_keys)
    if molecules_df.empty:
        # Nothing to compare; NaN matches what the mean of an empty list yields.
        return np.nan

    # Suffix _x holds the reference coordinates, _y the model coordinates.
    ref_xyz = molecules_df[['x_x', 'y_x', 'z_x']].values.astype(float)
    model_xyz = molecules_df[['x_y', 'y_y', 'z_y']].values.astype(float)

    squared_distances = np.sum((ref_xyz - model_xyz) ** 2, axis=1)
    return np.sqrt(np.mean(squared_distances))

### Calculate RMSD for each model molecule

Let's calculate the RMSD between the reference molecule and each model molecule, considering only the alpha carbons (`CA` atoms).

In [9]:
# The reference alpha carbons are the same for every comparison, so select them once.
reference_molecule_ca = reference_molecule[reference_molecule.atom_name == 'CA']

for i, model_molecule in enumerate(model_molecules):
    # Selecting only the Alpha Carbons of the current model
    model_molecule_ca = model_molecule[model_molecule.atom_name == 'CA']

    print('RMSD between reference molecule and model molecule %d: %.4f' % 
          ((i + 1), molecules_rmsd(reference_molecule_ca, model_molecule_ca)))

RMSD between reference molecule and model molecule 1: 23.5192
RMSD between reference molecule and model molecule 2: 35.7089
RMSD between reference molecule and model molecule 3: 23.6157
RMSD between reference molecule and model molecule 4: 20.7876
RMSD between reference molecule and model molecule 5: 22.1190
RMSD between reference molecule and model molecule 6: 25.3267


# b) Translate 3D Structure to Reference and Calculate RMSD

In [10]:
def calculate_translations(ref_atom, model_atom):
    """Return the per-axis offsets (t_x, t_y, t_z) from `model_atom` to `ref_atom`.

    Both arguments must expose `x`, `y` and `z` attributes; each offset is the
    reference coordinate minus the model coordinate.
    """
    return tuple(getattr(ref_atom, axis) - getattr(model_atom, axis)
                 for axis in ('x', 'y', 'z'))

def translate_molecule_to_reference(ref_molecule, model_molecule):
    """Return a copy of `model_molecule` shifted onto the reference frame.

    The first alpha carbon (`atom_name == 'CA'`) of each molecule is used as
    the anchor: every atom of the model is moved by the offset that brings
    the model's anchor onto the reference's anchor. The input frame is left
    untouched.
    """
    ref_anchor = ref_molecule.loc[ref_molecule.atom_name == 'CA'].iloc[0]
    model_anchor = model_molecule.loc[model_molecule.atom_name == 'CA'].iloc[0]

    offsets = calculate_translations(ref_anchor, model_anchor)

    shifted = model_molecule.copy()
    for axis, offset in zip(('x', 'y', 'z'), offsets):
        shifted[axis] = model_molecule[axis] + offset

    return shifted

### Translate each model molecule

In [11]:
# Shift every model onto the reference, keeping the order of `model_molecules`.
translated_model_molecules = [
    translate_molecule_to_reference(reference_molecule, model_molecule)
    for model_molecule in model_molecules
]

### Calculate RMSD for each translated model molecule

In [12]:
# The reference alpha carbons are the same for every comparison, so select them once.
reference_molecule_ca = reference_molecule[reference_molecule.atom_name == 'CA']

for i, translated_model_molecule in enumerate(translated_model_molecules):
    # Selecting only the Alpha Carbons of the current translated model
    translated_model_molecule_ca = translated_model_molecule[translated_model_molecule.atom_name == 'CA']

    print('RMSD between reference molecule and translated model molecule %d: %.4f' % 
          ((i + 1), molecules_rmsd(reference_molecule_ca, translated_model_molecule_ca)))

RMSD between reference molecule and translated model molecule 1: 23.2142
RMSD between reference molecule and translated model molecule 2: 33.8033
RMSD between reference molecule and translated model molecule 3: 24.4166
RMSD between reference molecule and translated model molecule 4: 19.7216
RMSD between reference molecule and translated model molecule 5: 23.5102
RMSD between reference molecule and translated model molecule 6: 25.9930


Done!