# Feature Extract

Usage: the `plotting` function plots the RMSD, the RMSD histogram, the RMSF and the PCA projection.

In [1]:
from __future__ import print_function
%matplotlib inline
import mdtraj as md
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
import MDAnalysis as mda
#XRD Ensemble
#28 4NPQ
#18 4HFI

In [2]:
# Residue-range selection string (resSeq 8 through 316). It is not referenced
# in the cells visible here; presumably it is meant for an mdtraj atom
# selection -- TODO confirm.
residue_selection = 'resSeq 8 to 316'

In [3]:
# Names of the trajectories to process, in processing order. Each name is also
# used to build the directory and file names of the trajectory on disk.
# Layout: one line per system (structure / mutant / pH tag), replicates in order.
traj_notes = [
    # 5NJY, pH70
    '5NJY_pH70_md1', '5NJY_pH70_md2', '5NJY_pH70_md3',
    '5NJY_F238L_pH70_md1', '5NJY_F238L_pH70_md2', '5NJY_F238L_pH70_md3',
    '5NJY_I233T_pH70_md1', '5NJY_I233T_pH70_md2', '5NJY_I233T_pH70_md3',
    '5NJY_F238LI233T_pH70_md1', '5NJY_F238LI233T_pH70_md2', '5NJY_F238LI233T_pH70_md3',
    # 5NJY, pH46
    '5NJY_pH46_md1', '5NJY_pH46_md2', '5NJY_pH46_md3',
    '5NJY_F238L_pH46_md1', '5NJY_F238L_pH46_md2', '5NJY_F238L_pH46_md3',
    '5NJY_I233T_pH46_md1', '5NJY_I233T_pH46_md2', '5NJY_I233T_pH46_md3',
    '5NJY_F238LI233T_pH46_md1', '5NJY_F238LI233T_pH46_md2', '5NJY_F238LI233T_pH46_md3',
    # 4HFI, pH46
    '4HFI_pH46_md1', '4HFI_pH46_md2', '4HFI_pH46_md3',
    '4HFI_F238L_pH46_md1', '4HFI_F238L_pH46_md2', '4HFI_F238L_pH46_md3',
    '4HFI_I233T_pH46_md1', '4HFI_I233T_pH46_md2', '4HFI_I233T_pH46_md3',
    '4HFI_F238LI233T_pH46_md1', '4HFI_F238LI233T_pH46_md2', '4HFI_F238LI233T_pH46_md3',
    # 4NPQ, pH70 (replicate numbering does not start at md1)
    '4NPQ_pH70_md5', '4NPQ_pH70_md6', '4NPQ_pH70_md7',
    '4NPQ_F238L_pH70_md3', '4NPQ_F238L_pH70_md4', '4NPQ_F238L_pH70_md5',
    '4NPQ_I233T_pH70_md3', '4NPQ_I233T_pH70_md4', '4NPQ_I233T_pH70_md5',
    '4NPQ_F238LI233T_pH70_md3', '4NPQ_F238LI233T_pH70_md4', '4NPQ_F238LI233T_pH70_md5',
    # 4HFI, pH70
    '4HFI_pH70_md1',
    # 4NPQ, pH46
    '4NPQ_F238LI233T_pH46_md1',
    '4NPQ_F238L_pH46_md1', '4NPQ_F238L_pH46_md2',
    '4NPQ_pH46_md1',
    '4NPQ_I233T_pH46_md1', '4NPQ_I233T_pH46_md2', '4NPQ_I233T_pH46_md3',
]

In [4]:
def create_md_dataframe():
    """Return an empty DataFrame holding the per-frame metadata columns.

    The frame has no rows and the columns ``MD_name``, ``pH``,
    ``replicate`` and ``traj_time``, in that order.
    """
    column_names = ['MD_name', 'pH', 'replicate', 'traj_time']
    return pd.DataFrame(columns=column_names)

In [5]:
def create_metadata(md_data=None, location='/media/scottzhuang/data/MD/', skip=10):
    """Append one metadata row per trajectory frame to ``md_data``, in place.

    Every name in the module-level ``traj_notes`` list is loaded with
    ``md.load`` from ``<location><name>/<name>.skip<skip>.ca.xtc`` using the
    topology ``<location><name>/<name>.ca.gro``. For each frame a row
    ``[MD_name, pH, replicate, traj_time]`` is added, where the first three
    values are parsed from the name and ``traj_time`` is the frame index.

    Parameters
    ----------
    md_data : pandas.DataFrame
        Table to extend. A four-element row is assigned per frame, so it is
        expected to carry the four columns made by ``create_md_dataframe``.
    location : str
        Directory prefix holding one sub-directory per trajectory name. It is
        concatenated as-is, so it must end with a path separator.
    skip : int
        Value used in the ``.skip<skip>.`` part of the trajectory file name.
        When it is 1 the file is additionally loaded with ``stride=10``.

    Returns
    -------
    None
        ``md_data`` is modified in place.
    """
    def append_metadata(traj_note):
        # Append the rows belonging to a single trajectory to md_data.
        top_location = traj_note + '/' + traj_note + ".ca.gro"
        traj_location = traj_note + '/' + traj_note + ".skip" + str(skip) + ".ca.xtc"
        if skip == 1:
            traj = md.load(location + traj_location, top=location + top_location, stride=10)
        else:
            traj = md.load(location + traj_location, top=location + top_location)
        # NOTE(review): the message assumes 10 ns between loaded frames -- confirm.
        print("In " + traj_note + ", simulation runs " + str(10 * traj.n_frames) + " ns.")

        ph_start = traj_note.find('pH')
        # Everything before the underscore that precedes 'pH', e.g. '5NJY_F238L'.
        md_name = traj_note[:ph_start - 1]
        # The two characters following 'pH', e.g. '70' or '46'.
        pH = traj_note[ph_start + 2:ph_start + 4]
        # Last character only, i.e. a single-digit replicate number.
        md_replicate = traj_note[-1]

        for frame_index in range(traj.n_frames):
            # Row labels continue from the current row count, starting at 1.
            # The row is written through .loc so the caller's frame is
            # extended in place.
            md_data.loc[md_data.shape[0] + 1] = [md_name, pH, md_replicate, frame_index]

    for traj_note in traj_notes:
        append_metadata(traj_note)

In [6]:
def create_system_notation(md_data=None, location='/media/scottzhuang/data/MD/', skip=10):
    """Add a ``system`` column to ``md_data`` with one integer id per frame.

    The trajectories named in the module-level ``traj_notes`` list are loaded
    in order (same file layout as ``create_metadata``) to obtain their frame
    counts. A running counter starts at -1 and is incremented whenever a name
    contains ``'md1'`` or is one of the four 4NPQ pH70 names listed below;
    every frame of a trajectory receives the counter value current at that
    point. The resulting list is assigned to ``md_data['system']``.

    Parameters
    ----------
    md_data : pandas.DataFrame
        Table that receives the ``system`` column (modified in place). Its
        row count has to match the total number of loaded frames for the
        column assignment to succeed.
    location : str
        Directory prefix holding one sub-directory per trajectory name;
        concatenated as-is, so it must end with a path separator.
    skip : int
        Value used in the ``.skip<skip>.`` part of the trajectory file name.

    Returns
    -------
    None
    """
    # Names that bump the counter even though they do not contain 'md1'
    # (in traj_notes these replicate series start at md5 / md3).
    extra_series_starts = {
        '4NPQ_pH70_md5',
        '4NPQ_F238L_pH70_md3',
        '4NPQ_I233T_pH70_md3',
        '4NPQ_F238LI233T_pH70_md3',
    }

    notation = -1
    notations = []
    for traj_note in traj_notes:
        top_location = traj_note + '/' + traj_note + ".ca.gro"
        traj_location = traj_note + '/' + traj_note + ".skip" + str(skip) + ".ca.xtc"
        traj = md.load(location + traj_location, top=location + top_location)

        # The two checks are kept independent, as in the original logic.
        if traj_note.find('md1') >= 0:
            notation = notation + 1
        if traj_note in extra_series_starts:
            notation = notation + 1

        # One label per frame of this trajectory.
        notations.extend([notation] * traj.n_frames)

    md_data['system'] = notations

In [7]:
# Build the per-frame metadata table: start from the empty frame, append one
# row per trajectory frame (in place), then add the 'system' column (in place).
# Both helper calls load every trajectory listed in traj_notes.
md_data = create_md_dataframe()
create_metadata(md_data= md_data)
create_system_notation(md_data = md_data)

In 5NJY_pH70_md1, simulation runs 1030 ns.
In 5NJY_pH70_md2, simulation runs 750 ns.
In 5NJY_pH70_md3, simulation runs 810 ns.
In 5NJY_F238L_pH70_md1, simulation runs 540 ns.
In 5NJY_F238L_pH70_md2, simulation runs 680 ns.
In 5NJY_F238L_pH70_md3, simulation runs 720 ns.
In 5NJY_I233T_pH70_md1, simulation runs 620 ns.
In 5NJY_I233T_pH70_md2, simulation runs 590 ns.
In 5NJY_I233T_pH70_md3, simulation runs 820 ns.
In 5NJY_F238LI233T_pH70_md1, simulation runs 830 ns.
In 5NJY_F238LI233T_pH70_md2, simulation runs 800 ns.
In 5NJY_F238LI233T_pH70_md3, simulation runs 600 ns.
In 5NJY_pH46_md1, simulation runs 550 ns.
In 5NJY_pH46_md2, simulation runs 800 ns.
In 5NJY_pH46_md3, simulation runs 560 ns.
In 5NJY_F238L_pH46_md1, simulation runs 930 ns.
In 5NJY_F238L_pH46_md2, simulation runs 950 ns.
In 5NJY_F238L_pH46_md3, simulation runs 1090 ns.
In 5NJY_I233T_pH46_md1, simulation runs 810 ns.
In 5NJY_I233T_pH46_md2, simulation runs 870 ns.
In 5NJY_I233T_pH46_md3, simulation runs 980 ns.
In 5NJY_F23

In [9]:
# Load one trajectory into `traj` for the exploratory cells that follow.
traj_note = '5NJY_pH70_md1'
location = '/media/scottzhuang/data/MD/'
skip = 10
# File names follow <name>/<name>.ca.gro and <name>/<name>.skip<skip>.ca.xtc.
top_location = '{0}/{0}.ca.gro'.format(traj_note)
traj_location = '{0}/{0}.skip{1}.ca.xtc'.format(traj_note, skip)
traj = md.load(location + traj_location, top=location + top_location)

In [10]:
import itertools

In [11]:
# Lazy iterator over every ordered 4-tuple of indices 198..314; nothing is
# consumed here, so the cell only displays the iterator object.
itertools.product(range(198,315),repeat=4)

<itertools.product at 0x7fd633866678>

In [28]:
# Atom count of the trajectory currently held in `traj`.
traj.n_atoms

1555

In [29]:
# Four-atom index sets: every ordered 4-tuple drawn (with repetition) from ten
# hand-picked atom indices.
# NOTE(review): tuples that repeat an index (e.g. 198,198,198,198) are
# included; presumably those dihedrals are degenerate -- confirm whether
# repetition-free tuples were intended.
dihedral_indice = [
    np.asarray(quad)
    for quad in itertools.product([198,199,200,300,400,500,600,700,1000,1200], repeat=4)
]
# Iterating the compute_dihedrals result yields its first-axis entries, which
# are collected in a plain list.
dihedral_data = list(md.compute_dihedrals(traj, dihedral_indice))

In [31]:
# Display the computed dihedral arrays.
# NOTE(review): this echoes the whole list; consider showing only a slice.
dihedral_data

[array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32),
 array([ 0.,  0.,  0., ...,  0.,  0.,  0

In [82]:
# Persist the metadata table as CSV, at a path relative to the working
# directory (by default the DataFrame index is written as the first column).
md_data.to_csv("glic_gating_feature.csv")

In [8]:
# Show the column labels of the metadata table.
md_data.columns

Index(['MD_name', 'pH', 'replicate', 'traj_time', 'system'], dtype='object')