In [None]:
import pandas as pd
import numpy as np

# Load Data

## Data Paths

In [None]:
# Location of the reference structure and of the five model structures
# (files 1ACW-01.pdb through 1ACW-05.pdb), all relative to the notebook.
reference_path = 'data/reference.pdb'
model_paths = ['data/1ACW-0%s.pdb' % model_number for model_number in range(1, 6)]

## Read Molecules

In [None]:
def read_molecule(path):
    """Parse the ``ATOM`` lines of a fixed-width PDB-style file into a DataFrame.

    Only lines that start with ``'ATOM'`` are read; every other line
    (including ``HETATM`` lines) is skipped.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per ``ATOM`` line, with columns ``atom_name``,
        ``residue_name``, ``residue_seq_number`` (kept as a stripped string),
        and the float coordinates ``x``, ``y``, ``z``. When the file has no
        ``ATOM`` lines the frame is empty but still carries these columns, so
        downstream column access keeps working.

    Raises
    ------
    ValueError
        If a coordinate field of an ``ATOM`` line cannot be parsed as a float.
    """
    # Declared up front so that an empty result still has the expected schema.
    columns = ['atom_name', 'residue_name', 'residue_seq_number', 'x', 'y', 'z']

    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if not line.startswith('ATOM'):
                continue
            # Fields are taken from fixed character positions of the line.
            atoms.append({'atom_name': line[12:16].strip(),
                          'residue_name': line[17:20].strip(),
                          'residue_seq_number': line[22:26].strip(),
                          'x': float(line[30:38].strip()),
                          'y': float(line[38:46].strip()),
                          'z': float(line[46:54].strip())})
    return pd.DataFrame(atoms, columns=columns)

## Reference Molecule

Given that all model molecules have 29 residues, we consider only the first 29 residues from the reference molecule.

In [None]:
# Rows 0-409 of the parsed reference file; per the note above, these are the
# atoms of its first 29 residues.
reference_molecule = read_molecule(path=reference_path).iloc[:410]

In [None]:
# Preview the first five atom rows of the reference molecule.
reference_molecule.head(n=5)

## Model Molecule

In [None]:
# Load the first of the model files listed in model_paths.
model_molecule = read_molecule(path=model_paths[0])

In [None]:
# Preview the first five atom rows of the model molecule.
model_molecule.head(n=5)

# Calculate RMSD

In [None]:
def euclidean_distance(p1, p2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    p1, p2 : iterable of numbers
        Coordinates of the two points, axis by axis. The axes are paired with
        ``zip``, so any extra trailing coordinates of the longer point are
        ignored.

    Returns
    -------
    float
        Square root of the sum of the squared per-axis differences.
    """
    axis_squared_distances = [(a - b) ** 2 for a, b in zip(p1, p2)]
    # A plain list has no .sum() method, so the total is taken with np.sum.
    return np.sqrt(np.sum(axis_squared_distances))

def rmsd(distances):
    """Return the root-mean-square of an iterable of distances.

    Each distance is squared, the squares are averaged with ``np.mean`` and
    the square root of that average is returned.
    """
    squares = [distance ** 2 for distance in distances]
    return np.sqrt(np.mean(squares))

def molecules_rmsd(ref_molecule, model_molecule):
    """Return the RMSD between matching atoms of two molecules.

    Atoms are paired with an inner join on ``residue_seq_number`` and, when
    both frames carry it, on ``atom_name`` as well. Joining on the residue
    number alone would pair every atom of a residue with every atom of the
    same residue in the other molecule whenever a residue contributes more
    than one row. For frames holding a single atom per residue (for example
    CA only) the pairing is the same either way.

    Parameters
    ----------
    ref_molecule, model_molecule : pandas.DataFrame
        Atom tables with at least the columns ``residue_seq_number``, ``x``,
        ``y`` and ``z``.

    Returns
    -------
    float
        Square root of the mean squared distance over the paired atoms, or
        ``nan`` when the two molecules have no atom in common.
    """
    keys = ['residue_seq_number']
    if 'atom_name' in ref_molecule.columns and 'atom_name' in model_molecule.columns:
        keys.append('atom_name')

    # Overlapping non-key columns get pandas' default suffixes:
    # ``_x`` for the reference frame and ``_y`` for the model frame.
    molecules_df = pd.merge(ref_molecule, model_molecule, on=keys)
    if molecules_df.empty:
        return np.nan

    ref_xyz = molecules_df[['x_x', 'y_x', 'z_x']].to_numpy(dtype=float)
    model_xyz = molecules_df[['x_y', 'y_y', 'z_y']].to_numpy(dtype=float)

    # Squared distance per atom pair, then root of the mean.
    squared_distances = ((ref_xyz - model_xyz) ** 2).sum(axis=1)
    return np.sqrt(squared_distances.mean())

## Select only the alpha carbon (CA) atoms

In [None]:
# Keep only the rows whose atom_name is exactly 'CA' in each molecule.
reference_molecule_ca = reference_molecule.loc[reference_molecule['atom_name'] == 'CA']
model_molecule_ca = model_molecule.loc[model_molecule['atom_name'] == 'CA']

In [None]:
# rmsd() takes a single iterable of distances; the two-molecule comparison is
# done by molecules_rmsd(), which pairs the atoms first.
molecules_rmsd(reference_molecule_ca, model_molecule_ca)

Done!