In [2]:
import mdtraj as md
import numpy as np
import math

import networkx as nx

from ele import element_from_symbol

# Hydrogen bond analysis by atomic species

This example will perform effectively the same analysis as the `simple_hbond_calculation.ipynb` notebook, but will consider only a single frame, outputting a list of the pairs of atoms involved in hydrogen bonding, allowing us to examine which groups on each molecule are most involved.

We will quickly go through all of the pieces that are identical to the prior notebook

In [3]:
# path to the trajectory file
traj_path = '../../4_prod_f305/trimmed_470.xtc'

# path to the run-input (tpr) file that is passed to `gmx hbond` via -s
tpr_path = '../../4_prod_f305/run.tpr'

# path to the index (ndx) file that is passed to `gmx hbond` via -n
ndx_path = '../../index.ndx'


In [4]:
groups =!cat ../../index.ndx | grep '\['

#strip out the square brackets and space
for i,group in enumerate(groups):
    group = group.strip('[]')
    group = group.strip()
    groups[i] = group
    
stripped_group = []
for i in range(0, len(groups)):
    if 'Other' not in groups[i]:
        if 'System' not in groups[i]:
            if 'lipids' not in groups[i]:
                temp_group = {'name': groups[i], 'index':i}
                stripped_group.append(temp_group)
                


## Calculating by atom 

GROMACS will provide a list of pairs forming hydrogen bonds as an index file, by setting the flag `-hbn` and a filename.

However, if you consider a whole trajectory, it mushes it all together in a way that doesn't make clear which bonds occur at which time. The easiest way I've found to handle this is to simply perform the calculation at a single timestep (and just do this calculation in a loop if we want to have averages or information as a function of time).

Setting the time range is accomplished via flags for the beginning `-b` and ending `-e` times to calculate. Note these times are in ps, and are not integers associated with individual frames.  

We can get the time associated with each frame from mdtraj:

In [5]:
# load the same trajectory file that is handed to gmx (traj_path) so the two
# analyses cannot drift apart; confout.gro is passed as the topology
traj = md.load(traj_path,
            top='../../4_prod_f305/confout.gro')

We will just grab the last two times to set the range for our analysis

In [9]:
time_end = traj[-1].time[0]
time_start = traj[-2].time[0]

filenames = []
for i in range(0, len(stripped_group)):
    for j in range(i, len(stripped_group)):
        index_i = stripped_group[i]['index']
        index_j = stripped_group[j]['index']
        name_i = stripped_group[i]['name']
        name_j = stripped_group[j]['name']
        print(index_i, index_j, '##', name_i, name_j)

        
        text_file = open(f'selection.txt', 'w')
        text_file.write(f'{index_i}\n{index_j}\n')
        text_file.close()

        output_file = f'hbond_{name_i}_{name_j}.xvg'
        output_ndx_file = f'hbond_{name_i}_{name_j}.ndx'
        filenames.append(output_ndx_file)
        msg = f'/usr/local/gromacs/bin/gmx hbond -f {traj_path} -s {tpr_path} -n {ndx_path} -num {output_file} -hbn {output_ndx_file} -b {time_start} -e {time_end}< selection.txt'
        
        print(msg)
        !{msg}

2 2 ## ucer2 ucer2
/usr/local/gromacs/bin/gmx hbond -f ../../4_prod_f305/trimmed_470.xtc -s ../../4_prod_f305/run.tpr -n ../../index.ndx -num hbond_ucer2_ucer2.xvg -hbn hbond_ucer2_ucer2.ndx -b 28200.0 -e 28300.0< selection.txt
                      :-) GROMACS - gmx hbond, 2022.2 (-:

Executable:   /usr/local/gromacs/bin/gmx
Data prefix:  /usr/local/gromacs
Working dir:  /Users/cri/Dropbox/Mac (3)/Documents/Projects/CER_reverse_mapped_v3_allext/analysis_scripts/hydrogen_bonding
Command line:
  gmx hbond -f ../../4_prod_f305/trimmed_470.xtc -s ../../4_prod_f305/run.tpr -n ../../index.ndx -num hbond_ucer2_ucer2.xvg -hbn hbond_ucer2_ucer2.ndx -b 28200.0 -e 28300.0

Reading file ../../4_prod_f305/run.tpr, VERSION 2020.6 (single precision)
Note: file tpx version 119, software tpx version 127
Specify 2 groups to analyze:
Group     0 (         System) has 271956 elements
Group     1 (          Other) has 271956 elements
Group     2 (          ucer2) has 103200 elements
Group     3 (         

## Parsing the data

Let us use the Lipid class to make it easier to deal with the index files that are generated.

In [10]:
#a data container class to make it easier to query the system than mdtraj's datastructure
# in particular, this will allow me to calculate com of each lipid, including separately consider
# each tail in the CERs and identify hairpin vs. extended

class Lipid:
    """Data container for one residue: its atoms plus derived geometry.

    Atoms are added one at a time with `add_atom`. From those the class can
    compute the mass-weighted center of the whole residue (`calc_com`) and,
    for residues named 'ucer2', split the molecule into two tails and a
    headgroup (`calc_tails`), compute a center for each part, and classify
    the conformation from the angle between the tails (`check_hairpin`).

    All indices stored in `carbons_chain1`, `carbons_chain2`, `headgroup`
    and in `cer_graph` are positions within this residue's atom lists, not
    the global indices kept in `atom_index`.
    """

    def __init__(self, name, res_id):
        """Create an empty lipid with the given residue name and residue id."""
        self.name = name
        self.atom_xyz = []
        self.atom_index = []
        self.atom_element = []
        self.atom_name = []
        self.res_id = res_id
        self.com = []
        self.is_hairpin = True

        self.com1 = []  #if we are a two chain lipid, we will also calculate com of each chain
        self.com2 = []
        self.headgroup_com = []
        self.carbons_chain1 = [] #if we are a two chain lipid, we need to know which indices to consider
        self.carbons_chain2 =[]
        self.headgroup = []
        self.cer_graph = nx.Graph() 
        self.angle = 0
        
        self.calc_com_performed = False
        self.calc_tails_performed = False

    def add_atom(self, xyz, element, index):
        """Append one atom.

        Parameters
        ----------
        xyz : sequence of 3 numbers
            Position of the atom; stored as a numpy array.
        element : str
            Atom name, e.g. 'C12'. It is stored unchanged in `atom_name`,
            and with every digit removed in `atom_element`.
        index : int
            Index of the atom; stored unchanged in `atom_index`.
        """
        self.atom_xyz.append(np.array(xyz))
        # atom names often carry a digit suffix; it is useful for telling atoms
        # apart but has to be removed before the name can be used to look up
        # a mass
        element_stripped = "".join(ch for ch in element if not ch.isdigit())
        self.atom_element.append(element_stripped)
        self.atom_name.append(element)
        self.atom_index.append(index)

    def calc_tails(self):
        """Identify the two tails and the headgroup of the residue.

        C-C and C-H pairs closer than 0.18 are connected in `cer_graph`.
        Every connected component of that graph yields one candidate chain
        made of the carbons that have four connections, exactly two of them
        hydrogens, together with those hydrogens. If exactly two components
        are found they become `carbons_chain1` and `carbons_chain2`;
        otherwise a message is printed and both chains stay empty. Every
        atom that is in neither chain is placed in `headgroup`.

        Distances are computed directly from the stored coordinates, so the
        molecule is assumed to be whole (unwrapped trajectory).
        """
        # start from a clean slate so that calling this more than once does
        # not append duplicate indices to the headgroup or keep stale chains
        self.carbons_chain1 = []
        self.carbons_chain2 = []
        self.headgroup = []

        n_atoms = len(self.atom_xyz)

        #loop over all C-C and C-H particle pairs in the residue
        for i in range(0, n_atoms):
            if self.atom_element[i] != 'C':
                continue
            for j in range(i+1, n_atoms):
                if self.atom_element[j] == 'C' or self.atom_element[j] == 'H':
                    #note we are assuming we have an unwrapped trajectory
                    dist = np.linalg.norm(self.atom_xyz[j]-self.atom_xyz[i])
                    if dist < 0.18:
                        self.cer_graph.add_edge(i,j) #note we will use relative indices in the residue

        chain_ids = []

        for c in nx.connected_components(self.cer_graph):
            chain_temp = []
            for node in self.cer_graph.subgraph(c).nodes:
                # first let us consider the C-backbone atoms
                if 'C' not in self.atom_element[node]:
                    continue
                connections = self.cer_graph.edges(node)
                #only consider those with 4 connections
                if len(connections) != 4:
                    continue
                C_count = 0
                H_count = 0
                for connection in connections:
                    temp_id = connection[1] #who we are connected to is the second element
                    if 'H' in self.atom_element[temp_id]:
                        H_count = H_count + 1
                    elif 'C' in self.atom_element[temp_id]:
                        C_count = C_count + 1
                # accept carbons bonded to exactly two hydrogens and at least
                # one carbon.
                # NOTE(review): a carbon with three hydrogens ([C][H][H][H])
                # fails H_count == 2 and is therefore left out of the chain,
                # although an earlier comment listed it as accepted - confirm
                # which behavior is intended.
                if C_count >= 1 and H_count == 2:
                    chain_temp.append(node) #add the base carbon to the list of atoms

                    for connection in connections:
                        temp_id = connection[1]
                        #note only add the hydrogen atoms since we looping over all Carbons already
                        if 'H' in self.atom_element[temp_id]:
                            chain_temp.append(temp_id)

            chain_ids.append(chain_temp)

        if len(chain_ids) == 2:
            self.carbons_chain1 = chain_ids[0]
            self.carbons_chain2 = chain_ids[1]
        else:
            print("Expected two tails for cer, but we identified: ", len(chain_ids))

        # everything that did not end up in a tail belongs to the headgroup
        for i in range(0, n_atoms):
            if i not in self.carbons_chain1 and i not in self.carbons_chain2:
                self.headgroup.append(i)

        self.calc_tails_performed = True

    def _mass_weighted_center(self, indices):
        """Return the mass-weighted mean position of the given atoms.

        `indices` are positions within this residue's atom lists. The result
        is a 3-item list. Raises ZeroDivisionError when `indices` is empty
        (the total mass is then zero).
        """
        weighted_sum = [0, 0, 0]
        total_mass = 0
        for idx in indices:
            mass = element_from_symbol(self.atom_element[idx]).mass
            total_mass = total_mass + mass
            for axis in range(0, 3):
                weighted_sum[axis] = weighted_sum[axis] + mass * self.atom_xyz[idx][axis]
        return [component / total_mass for component in weighted_sum]

    def calc_com(self):
        """Compute the mass-weighted center of the residue.

        Always sets `com`. For residues named 'ucer2' it also sets `com1`,
        `com2` and `headgroup_com` (centers of the two tails and of the
        headgroup), running `calc_tails` first if that has not been done.

        Raises ZeroDivisionError if any group of atoms that is averaged is
        empty, e.g. when `calc_tails` did not find two tails.
        """
        self.com = self._mass_weighted_center(range(0, len(self.atom_element)))
        if self.name == 'ucer2':
            #make sure we have identified the tails
            # call the function if we haven't yet
            if not self.calc_tails_performed:
                self.calc_tails()
            self.com1 = self._mass_weighted_center(self.carbons_chain1)
            self.com2 = self._mass_weighted_center(self.carbons_chain2)
            self.headgroup_com = self._mass_weighted_center(self.headgroup)
        self.calc_com_performed = True

    # mdtraj/mbuild/vmd all like nm as their unit; vmd does angstroms, so often useful to convert
    def scale_com(self, factor=10.0):
        """Multiply the stored centers in place by `factor`.

        `com` is always scaled. For residues named 'ucer2' the tail centers
        and the headgroup center are scaled as well, so that all stored
        centers stay in the same unit and a later `check_hairpin` still
        compares like with like. The centers are computed first if
        `calc_com` has not been run yet.
        """
        if not self.calc_com_performed:
            self.calc_com()
        for j in range(0, 3):
            self.com[j] = self.com[j]*factor
            if self.name == 'ucer2':
                self.com1[j] = self.com1[j]*factor
                self.com2[j] = self.com2[j]*factor
                self.headgroup_com[j] = self.headgroup_com[j]*factor

    def check_hairpin(self):
        """Classify the conformation from the angle between the two tails.

        The angle (in radians, stored in `angle`) is measured between the
        vectors that point from the headgroup center to each tail center.
        `is_hairpin` is set to False when the angle is greater than 90
        degrees and to True otherwise. The centers are computed first if
        `calc_com` has not been run yet.
        """
        if not self.calc_com_performed:
            self.calc_com()

        temp1 = np.array(self.com1) - np.array(self.headgroup_com)
        temp2 = np.array(self.com2) - np.array(self.headgroup_com)

        uv_1 = temp1/np.linalg.norm(temp1)
        uv_2 = temp2/np.linalg.norm(temp2)

        # rounding can push the dot product of two unit vectors marginally
        # outside [-1, 1], where arccos returns NaN; clip it back into range
        dp = np.clip(np.dot(uv_1, uv_2), -1.0, 1.0)
        self.angle = np.arccos(dp)

        # extended (not a hairpin) if the tails are more than 90 degrees apart;
        # assign both outcomes so that a re-check can flip the flag either way
        self.is_hairpin = not (self.angle > np.pi/2)

In [11]:
lipids = []
total_lipids = 0

chain_id = 0
cer_id = 0
res_id = 0

# coordinates of the first frame, indexed by atom
frame0_xyz = traj.xyz[0]

# build one Lipid per residue, numbering the residues in iteration order
for residue in traj[0].topology.residues:
    current = Lipid(residue.name, res_id)
    for atom in residue.atoms:
        current.add_atom(frame0_xyz[atom.index, :], atom.name, atom.index)

    # residues with 'cer' in their name additionally get their tails
    # identified and their conformation classified
    if 'cer' in residue.name:
        current.calc_tails()
        current.check_hairpin()

    current.calc_com()
    lipids.append(current)
    res_id += 1

We will define a simple dictionary that allows us to quickly associate an atom index with the atom_name rather than dealing with the mdtraj trajectory (which is going to be slower to index into).

In [12]:
# atom index -> atom name, gathered over every lipid
lookup_dict = {}
for lipid in lipids:
    lookup_dict.update(zip(lipid.atom_index, lipid.atom_name))

We will define a class that checks whether an h-bond pair already exists in the list; if it does not, the pair is appended, and in either case a counter for that pair is updated so we can keep track of the number of hbonds by atomtype pairs. This is probably not the most efficient approach, but it works.

In [13]:
class hbond_agg:
    """Tally of hydrogen-bond pairs in which (a, b) and (b, a) count as one.

    `hbonds_pair[k]` is a pair exactly as it was first passed in and
    `count_pair[k]` is the number of times that pair has been recorded.
    """

    def __init__(self):
        """Start with no recorded pairs."""
        self.hbonds_pair = []
        self.count_pair = []

    def check_hbond(self, check_pair):
        """Record one occurrence of `check_pair`.

        Only the first two items of `check_pair` are compared. If a stored
        pair holds the same two items in either order, its counter is
        incremented; otherwise `check_pair` itself is appended with a
        count of 1.
        """
        first, second = check_pair[0], check_pair[1]
        already_known = False
        for slot in range(len(self.hbonds_pair)):
            known = self.hbonds_pair[slot]
            same_order = known[0] == first and known[1] == second
            swapped_order = known[1] == first and known[0] == second
            if same_order or swapped_order:
                self.count_pair[slot] += 1
                already_known = True
        if not already_known:
            self.hbonds_pair.append(check_pair)
            self.count_pair.append(1)
            





The code below will simply read in the index files we generated above, parse them, and count up all of the hbond pairs.

In [14]:
import os


# For every index file requested from gmx: read the triplets listed under the
# '[ hbonds...' header, translate the first and third atom index of each
# triplet into atom names, and print how often each pair of names occurs.
for i, filename in enumerate(filenames):
    hbonds_array = []
    # number of files read for this pair (0 when gmx produced no file, else 1);
    # used below as the divisor for the printed counts
    count = 0 
        
    if(os.path.exists(filename)):
        # the context manager closes the file even if parsing fails
        with open(filename, 'r') as hbonds_file:
            lines = hbonds_file.readlines()
        count = count + 1
        read_the_line = False
        for line in lines:
            if read_the_line:
                breakdown = line.split()
                if not breakdown:
                    # blank line inside or after the section: nothing to parse
                    continue
                if breakdown[0].startswith('['):
                    # a new section header ends the hbonds section
                    break
                # index-file entries are 1-based; convert to 0-based indices
                hbonds_temp = [int(breakdown[0])-1, int(breakdown[1])-1,int(breakdown[2])-1] 
                hbonds_array.append(hbonds_temp)
            if '[ hbonds' in line:
                read_the_line = True
    
    hbonds = hbond_agg()
    for hbond in hbonds_array:
        # only the first and last entry of each triplet are tallied
        temp1 = [lookup_dict[hbond[0]], lookup_dict[hbond[2]]]
        hbonds.check_hbond(temp1)
    print(f'{filename}, {count}')
    print(len(hbonds.hbonds_pair))
    # separate loop variable so the file counter `i` above is not overwritten
    for k in range(0, len(hbonds.hbonds_pair)):
        print(hbonds.hbonds_pair[k], hbonds.count_pair[k]/count)

hbond_ucer2_ucer2.ndx, 1
8
['O80', 'O84'] 201.0
['N1', 'O80'] 134.0
['O84', 'O84'] 57.0
['O80', 'O4'] 235.0
['O84', 'O4'] 215.0
['N1', 'O4'] 155.0
['O84', 'N1'] 72.0
['O80', 'O80'] 58.0
hbond_ucer2_chol.ndx, 1
4
['O80', 'O3'] 136.0
['O84', 'O3'] 98.0
['N1', 'O3'] 13.0
['O3', 'O4'] 60.0
hbond_ucer2_ffa24.ndx, 1
7
['N1', 'O27'] 111.0
['O80', 'O27'] 121.0
['O84', 'O25'] 101.0
['O84', 'O27'] 136.0
['O80', 'O25'] 116.0
['N1', 'O25'] 44.0
['O25', 'O4'] 150.0
hbond_ucer2_tip3p.ndx, 1
4
['N1', 'O1'] 207.0
['O80', 'O1'] 368.0
['O84', 'O1'] 389.0
['O1', 'O4'] 450.0
hbond_chol_chol.ndx, 0
0
hbond_chol_ffa24.ndx, 1
2
['O3', 'O27'] 56.0
['O3', 'O25'] 83.0
hbond_chol_tip3p.ndx, 1
1
['O3', 'O1'] 166.0
hbond_ffa24_ffa24.ndx, 1
2
['O25', 'O27'] 269.0
['O25', 'O25'] 32.0
hbond_ffa24_tip3p.ndx, 1
2
['O25', 'O1'] 579.0
['O1', 'O27'] 568.0
hbond_tip3p_tip3p.ndx, 1
1
['O1', 'O1'] 90741.0
