**CMP585 - Structural Bioinformatics**

Cristian Lopes

*Assignment 1: Q3*

In [1]:
import pandas as pd
import numpy as np

# Load Data

## Read Molecules

In [2]:
reference_path = 'data/reference.pdb'
model_paths = ['data/1ACW-0%s.pdb' % (1 + n) for n in range(6)]

In [3]:
def read_molecule(path):
    atoms = []
    with open(path, 'r') as f:
        for line in f:
            if line.startswith('ATOM'):
                atoms.append({'atom_name': line[12:16].strip(), 
                             'residue_name': line[17:20].strip(),
                             'residue_seq_number': line[22:26].strip(),
                             'x': float(line[30:38].strip()), 
                             'y': float(line[38:46].strip()),
                             'z': float(line[46:54].strip())})
    return pd.DataFrame(atoms)

## Reference Molecule

In [4]:
reference_molecule = read_molecule(reference_path).iloc[:410]

In [5]:
reference_molecule.head()

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,VAL,1,0.965,0.298,-0.467
1,CA,VAL,1,1.811,0.25,-1.701
2,C,VAL,1,3.29,0.4,-1.32
3,O,VAL,1,3.628,1.053,-0.346
4,CB,VAL,1,1.417,1.384,-2.664


## Model Molecules

In [6]:
model_molecules = [read_molecule(path) for path in model_paths]

In [7]:
model_molecules[0].head()

Unnamed: 0,atom_name,residue_name,residue_seq_number,x,y,z
0,N,VAL,1,-1.352,-2.044,0.0
1,H1,VAL,1,-1.99,-1.261,0.0
2,H2,VAL,1,-1.446,-2.54,0.875
3,H3,VAL,1,-1.446,-2.54,-0.875
4,CA,VAL,1,0.0,-1.523,0.0


# Calculate RMSD for translated molecules

## Define translation

In [8]:
def calculate_translation(ref_atom, model_atom):
    return ref_atom[['x', 'y', 'z']] - model_atom[['x', 'y', 'z']]

def translate_molecule_to_new_reference(ref_molecule, model_molecule, translation): 
    translated_model_molecule = model_molecule.copy()
    translated_model_molecule['x'] = model_molecule['x'] + translation['x']
    translated_model_molecule['y'] = model_molecule['y'] + translation['y']
    translated_model_molecule['z'] = model_molecule['z'] + translation['z']
    
    return translated_model_molecule

## Define RMSD

In [9]:
def molecules_rmsd(ref_molecule, model_molecule):
    molecules_df = pd.merge(ref_molecule, model_molecule, on=['residue_seq_number', 'atom_name'])
    
    molecules_df['x_distance_squared'] = (molecules_df['x_x'] - molecules_df['x_y']) ** 2
    molecules_df['y_distance_squared'] = (molecules_df['y_x'] - molecules_df['y_y']) ** 2
    molecules_df['z_distance_squared'] = (molecules_df['z_x'] - molecules_df['z_y']) ** 2
    
    molecules_df['euclidean_distance_squared'] = (molecules_df['x_distance_squared'] + 
                                                  molecules_df['y_distance_squared'] +
                                                  molecules_df['z_distance_squared'])
    
    return np.sqrt(np.mean(molecules_df['euclidean_distance_squared']))

# Genetic Algorithm

In [10]:
model_molecule = model_molecules[0]

## Genetic Algorithm Step by Step

### Initialization

In [11]:
DIM_POPULATION = 3
N_POPULATION = 10

In [12]:
def initialize_population(reference_molecule, model_molecule):
    # This method considers the first Alpha Carbon as system reference
    ref_system_ref = reference_molecule[reference_molecule.atom_name == 'CA'].iloc[0]
    molecule_system_ref = model_molecule[model_molecule.atom_name == 'CA'].iloc[0]

    initial_magnitude = calculate_translation(ref_system_ref, molecule_system_ref)
    initial_population = [initial_magnitude * np.random.uniform(-1, 1, DIM_POPULATION)
                          for _ in range(N_POPULATION)]

    return initial_population

In [13]:
initial_population = initialize_population(reference_molecule, model_molecule)

In [14]:
pd.DataFrame(initial_population)

Unnamed: 0,x,y,z
0,-0.939931,1.348071,1.132427
1,0.05396,1.082299,-1.271295
2,-1.720649,0.839037,-0.948324
3,-1.678878,0.768482,-0.913947
4,-0.169963,-1.086532,-1.063963
5,1.525452,-0.211527,-0.549096
6,0.880129,1.508413,-1.599898
7,-0.758998,-1.761677,0.44397
8,0.24257,-0.258912,1.038927
9,0.166745,-0.174647,1.487495


### Fitness Function

In [15]:
def individual_fitness(reference_molecule, model_molecule, translation, atoms=['C', 'N', 'O']):
    translated_model_molecule = translate_molecule_to_new_reference(reference_molecule, model_molecule, translation)
    
    # Selecting atoms of interest
    reference_molecule_filtered = reference_molecule[reference_molecule.atom_name.isin(atoms)]
    translated_model_molecule_filtered = translated_model_molecule[translated_model_molecule.atom_name.isin(atoms)]

    return molecules_rmsd(reference_molecule_filtered, translated_model_molecule_filtered)

In [16]:
population = initial_population

fitness = [individual_fitness(reference_molecule, model_molecule, individual) 
                      for individual in population]

In [17]:
pd.DataFrame(fitness, columns=['fitness'])

Unnamed: 0,fitness
0,22.498588
1,23.543758
2,23.808348
3,23.821679
4,24.684095
5,23.782393
6,23.394216
7,24.541927
8,23.266211
9,23.056183


### Selection

In [18]:
def fitness_proportionate_selection(population, fitness):
    probabilities = (np.max(fitness) - fitness) / np.sum(np.max(fitness) - fitness)
    idx = np.random.choice(N_POPULATION, replace=True, p=probabilities)
    
    return population[idx]

In [19]:
parent = fitness_proportionate_selection(population, fitness)

In [20]:
pd.DataFrame(parent)

Unnamed: 0,0
x,1.52545
y,-0.211527
z,-0.549096


### Crossover

In [21]:
def intermediate_recombination(population, fitness):
    p = 0.1
    alpha = np.random.uniform(-p, 1+p)
    beta = np.random.uniform(-p, 1+p)
    
    parent_1 = fitness_proportionate_selection(population, fitness)
    parent_2 = fitness_proportionate_selection(population, fitness)
    
    children_1 = alpha * parent_1 + (1 - alpha) * parent_2
    children_2 = beta * parent_1 + (1 - beta) * parent_2   
    
    return [children_1, children_2]

In [22]:
childrens = intermediate_recombination(population, fitness)

In [23]:
pd.DataFrame(childrens)

Unnamed: 0,x,y,z
0,0.044032,1.192938,-1.514128
1,0.169284,-0.202943,1.5496


### Mutation

In [24]:
def gaussian_convolution(children):
    p = np.random.rand(DIM_POPULATION)
    sigma = 1
    noise = np.random.randn(DIM_POPULATION)
    
    return children + noise * (p > 0.5)

In [25]:
mutated_children = gaussian_convolution(childrens[0])

In [26]:
pd.DataFrame(population)

Unnamed: 0,x,y,z
0,-0.939931,1.348071,1.132427
1,0.05396,1.082299,-1.271295
2,-1.720649,0.839037,-0.948324
3,-1.678878,0.768482,-0.913947
4,-0.169963,-1.086532,-1.063963
5,1.525452,-0.211527,-0.549096
6,0.880129,1.508413,-1.599898
7,-0.758998,-1.761677,0.44397
8,0.24257,-0.258912,1.038927
9,0.166745,-0.174647,1.487495


## Genetic Algorithm Implementation

In [27]:
class GeneticAlgorithm:
    def __init__(self, dim_populaton, n_population, ref_molecule, molecule, atoms):
        self.DIM_POPULATION = 3
        self.N_POPULATION = 30

        self.ref_molecule = ref_molecule
        self.molecule = molecule
        self.atoms = atoms

        self.population = self.initialize_population()
        self.fitness = self.compute_population_fitness()

    def compute_translation(self, ref_atom, atom):
        return ref_atom[['x', 'y', 'z']] - atom[['x', 'y', 'z']]

    def translate_molecule_to_new_reference(self, ref_molecule, molecule, translation):
        translated_molecule = molecule.copy()
        translated_molecule['x'] = molecule['x'] + translation['x']
        translated_molecule['y'] = molecule['y'] + translation['y']
        translated_molecule['z'] = molecule['z'] + translation['z']

        return translated_molecule

    def compute_rmsd(self, ref_molecule, molecule):
        molecules_df = pd.merge(ref_molecule, molecule, on=['residue_seq_number', 'atom_name'])

        molecules_df['x_distance_squared'] = (molecules_df['x_x'] - molecules_df['x_y']) ** 2
        molecules_df['y_distance_squared'] = (molecules_df['y_x'] - molecules_df['y_y']) ** 2
        molecules_df['z_distance_squared'] = (molecules_df['z_x'] - molecules_df['z_y']) ** 2

        molecules_df['euclidean_distance_squared'] = (molecules_df['x_distance_squared'] +
                                                      molecules_df['y_distance_squared'] +
                                                      molecules_df['z_distance_squared'])

        return np.sqrt(np.mean(molecules_df['euclidean_distance_squared']))
    
    # Initialization
    def initialize_population(self):
        # This method considers the first Alpha Carbon as system reference
        ref_system_ref = self.ref_molecule[self.ref_molecule.atom_name == 'CA'].iloc[0]
        molecule_system_ref = self.molecule[self.molecule.atom_name == 'CA'].iloc[0]

        initial_magnitude = self.compute_translation(ref_system_ref, molecule_system_ref)
        initial_population = [initial_magnitude * np.random.uniform(-1, 1, self.DIM_POPULATION)
                              for _ in range(self.N_POPULATION)]

        return initial_population
    
    def generate_new_population(self):
        new_population = []
        for _ in range(self.N_POPULATION // 2):
            childrens = self.intermediate_recombination()
            new_population.extend([self.gaussian_convolution(children) for children in childrens])

        return new_population

    # Fitness
    def individual_fitness(self, translation):
        translated_molecule = self.translate_molecule_to_new_reference(self.ref_molecule, self.molecule, translation)

        # Selecting atoms of interest
        ref_molecule_filtered = self.ref_molecule[self.ref_molecule.atom_name.isin(self.atoms)]
        translated_molecule_filtered = translated_molecule[translated_molecule.atom_name.isin(self.atoms)]

        return self.compute_rmsd(ref_molecule_filtered, translated_molecule_filtered)

    def compute_population_fitness(self):
        return [self.individual_fitness(individual)
                for individual in self.population]


    # Selection
    def fitness_proportionate_selection(self):
        probabilities = (np.max(self.fitness) - self.fitness) / np.sum(np.max(self.fitness) - self.fitness)
        idx = np.random.choice(self.N_POPULATION, replace=True, p=probabilities)

        return self.population[idx]

    # Crossover
    def intermediate_recombination(self):
        p = 0.1
        alpha = np.random.uniform(-p, 1+p)
        beta = np.random.uniform(-p, 1+p)

        parent_1 = self.fitness_proportionate_selection()
        parent_2 = self.fitness_proportionate_selection()

        children_1 = alpha * parent_1 + (1 - alpha) * parent_2
        children_2 = beta * parent_1 + (1 - beta) * parent_2

        return [children_1, children_2]

    # Mutation
    def gaussian_convolution(self, children):
        add_noise_prob = np.random.rand(self.DIM_POPULATION)
        
        sigma = 0.2
        noise = sigma * np.random.randn(self.DIM_POPULATION)

        return children + noise * (add_noise_prob > 0.5)

    def optimize(self):
        # Stop criterion is missing
        for _ in range(100):
            self.population = self.generate_new_population()
            self.fitness = self.compute_population_fitness()
            
        print('Optimal Solution:')
        pd.DataFrame(self.population[np.argmin(self.fitness)])
        print('Optimized RMSD: %.2f' % np.min(self.fitness))

# Q3 - a) Backbone

In [28]:
atoms = ['C', 'N', 'O']

In [29]:
genetic_algorithm = GeneticAlgorithm(DIM_POPULATION, N_POPULATION, reference_molecule, model_molecule, atoms)

In [30]:
genetic_algorithm.optimize()

Optimal Solution:
Optimized RMSD: 17.41


# Q3 - b) Alpha Carbon

In [31]:
atoms = ['CA']

In [32]:
genetic_algorithm = GeneticAlgorithm(DIM_POPULATION, N_POPULATION, reference_molecule, model_molecule, atoms)

In [33]:
genetic_algorithm.optimize()

Optimal Solution:
Optimized RMSD: 17.59


# Q3 - c) Polypeptide

In [34]:
atoms = ['C', 'N', 'O', 'H']

In [35]:
genetic_algorithm = GeneticAlgorithm(DIM_POPULATION, N_POPULATION, reference_molecule, model_molecule, atoms)

In [36]:
genetic_algorithm.optimize()

Optimal Solution:
Optimized RMSD: 17.46


Done!