In [1]:
#Modules
from __future__ import print_function
import mdtraj as md
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.gridspec as gridspec
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


In [85]:
# Load the per-frame feature table.
# NOTE: unpickling executes arbitrary code; only load pickle files you trust.
md_data = pd.read_pickle("glic_gating.pickle")
# Keep only rows with traj_time > 5. The explicit .copy() makes the result an
# independent frame, so the column assignments that follow are not chained
# assignments on a slice of the loaded frame.
md_data = md_data.loc[md_data['traj_time'] > 5].copy()
# Rescale traj_time by a factor of 10 (presumably a unit conversion - TODO confirm).
md_data['traj_time'] = md_data['traj_time'] * 10
md_data.columns

Index(['MD_name', 'pH', 'replicate', 'traj_time', 'system', 'ecd_pc1',
       'ecd_pc2', 'tmd_pc1', 'tmd_pc2', 'wholepca_pc1', 'wholepca_pc2',
       'wholepca_pc3', 'wholepca_pc4', 'wholepca_pc5', 'domain twist',
       'helix tilt angle', 'helix twist angle', 'beta_expansion', 'M2_radius',
       'ECD_radius', 'M2_M1_distance', 'M1_kink', 'hydration_data',
       'hydration_data235_3a', 'hydration_data233_3a', 'hydration_data238_3a',
       'hydration_data226_3a', 'hydration_data240_3a', 'hydration_data_m123',
       'pore_profile_222', 'pore_profile_226', 'pore_profile_230',
       'pore_profile_233', 'pore_profile_237', 'pore_profile_240', 'rmsd4NPQ',
       'rmsd4HFI', 'rmsd5NJY', 'tmd_rmsd', 'ecd_rmsd', 'rmsd', 'cluster_index',
       'ECD twist', 'hydration_data_236_center', 'tmd_noM2_pc1',
       'tmd_noM2_pc2', 'rmsd_TMD_noM2'],
      dtype='object')

In [79]:
# Split the frames into four groups of four consecutive `system` indices each.
# Series.between is inclusive on both ends, matching `>= lo` and `<= hi`.
system_index = md_data.system
md_5NJY_70 = md_data[system_index.between(0, 3)]
md_5NJY_46 = md_data[system_index.between(4, 7)]
md_4HFI_46 = md_data[system_index.between(8, 11)]
md_4NPQ_70 = md_data[system_index.between(12, 15)]

In [80]:
# Classify rows by the mutation tag found in MD_name.
# 'F238LI233T' contains both single-mutant tags as substrings, so the
# single-mutant subsets must explicitly exclude the double mutant.
has_F238L = md_data.MD_name.str.contains('F238L')
has_I233T = md_data.MD_name.str.contains('I233T')
is_double_mutant = md_data.MD_name.str.contains('F238LI233T')

md_WT = md_data[~(has_F238L | has_I233T)]
md_F238L = md_data[has_F238L & ~is_double_mutant]
md_I233T = md_data[has_I233T & ~is_double_mutant]
md_F238LI233T = md_data[is_double_mutant]

In [86]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import DBSCAN

class Clusterer(object):
    """Cluster frames with DBSCAN on standardized feature columns.

    Parameters
    ----------
    eps : float
        DBSCAN neighbourhood radius, measured in the standardized
        (zero-mean, unit-variance) feature space.
    columns : sequence of str, optional
        Names of the DataFrame columns used as clustering features.
        Defaults to ``('ecd_pc1', 'tmd_pc1')``.
    min_samples : int, optional
        DBSCAN ``min_samples``. Defaults to 1; per the scikit-learn
        documentation the point itself counts towards ``min_samples``, so
        with 1 no sample should be labelled as noise (-1).
    """

    def __init__(self, eps, columns=('ecd_pc1', 'tmd_pc1'), min_samples=1):
        self.eps = eps
        # Stored as a list so it can be used directly for DataFrame column selection.
        self.columns = list(columns)
        self.min_samples = min_samples

    def _preprocess(self, feature):
        """Return the selected columns of ``feature`` as a standardized array.

        The scaler is fitted on ``feature`` itself, so the scaling (and hence
        the effective meaning of ``eps``) depends on the rows passed in.
        """
        ss = StandardScaler()
        X = ss.fit_transform(feature[self.columns].values)

        return X

    def predict(self, feature):
        """Fit DBSCAN on ``feature`` and return one cluster label per row.

        Despite the name, nothing is learned ahead of time: a fresh scaler and
        a fresh DBSCAN model are fitted on every call.
        """
        X = self._preprocess(feature)

        cl = DBSCAN(eps=self.eps, min_samples=self.min_samples, algorithm='kd_tree')
        labels = cl.fit_predict(X)

        return labels

In [87]:
# Flag, per system, which frames belong to the most populated DBSCAN cluster:
# 0 = frame is in the dominant cluster, -1 = frame is in any other cluster.
md_data['dbscan'] = 0
from statistics import mode
# Clusterer fits a fresh scaler and DBSCAN on every predict() call, so a
# single instance can be reused for all systems.
model = Clusterer(eps=0.2)
# NOTE(review): only systems 8-15 are clustered; rows of systems 0-7 keep the
# initial value 0 - confirm this is intended.
for system in range(8,16):
    in_system = md_data.system == system
    label = model.predict(md_data[in_system])
    dominant = mode(label)
    # One assignment instead of two sequential relabelling passes: the result
    # no longer depends on which numeric label the dominant cluster received.
    md_data.loc[in_system, 'dbscan'] = np.where(label == dominant, 0, -1)

In [18]:
# Cluster a single system (index 9) from the 4HFI pH 4.6 subset.
model = Clusterer(eps=0.2)
# Build the boolean mask from the frame being indexed. A mask taken from
# md_data only works through implicit index alignment and stops matching once
# md_data is replaced by a frame with a different index.
label = model.predict(md_4HFI_46[md_4HFI_46.system == 9])

In [93]:
# Replace the in-memory md_data with the table stored in clustered.pickle.
# NOTE(review): presumably a previously saved clustering result; no cell visible
# here writes this file - confirm where it comes from.
# NOTE: unpickling executes arbitrary code; only load pickle files you trust.
md_data = pd.read_pickle('clustered.pickle')

In [2]:
# Names of the 48 simulation runs; each name is also used as the run's
# directory name and file-name prefix by the extraction cells.
# Layout: 4 groups x 4 variants x 3 replicates, in this fixed order.
traj_notes = [
    # 5NJY, pH 7.0
    # NOTE(review): the first I233T replicate is md4, not md1 - confirm intentional.
    '5NJY_pH70_md1', '5NJY_pH70_md2', '5NJY_pH70_md3',
    '5NJY_F238L_pH70_md1', '5NJY_F238L_pH70_md2', '5NJY_F238L_pH70_md3',
    '5NJY_I233T_pH70_md4', '5NJY_I233T_pH70_md2', '5NJY_I233T_pH70_md3',
    '5NJY_F238LI233T_pH70_md1', '5NJY_F238LI233T_pH70_md2', '5NJY_F238LI233T_pH70_md3',
    # 5NJY, pH 4.6
    '5NJY_pH46_md1', '5NJY_pH46_md2', '5NJY_pH46_md3',
    '5NJY_F238L_pH46_md1', '5NJY_F238L_pH46_md2', '5NJY_F238L_pH46_md3',
    '5NJY_I233T_pH46_md1', '5NJY_I233T_pH46_md2', '5NJY_I233T_pH46_md3',
    '5NJY_F238LI233T_pH46_md1', '5NJY_F238LI233T_pH46_md2', '5NJY_F238LI233T_pH46_md3',
    # 4HFI, pH 4.6
    '4HFI_pH46_md1', '4HFI_pH46_md2', '4HFI_pH46_md3',
    '4HFI_F238L_pH46_md1', '4HFI_F238L_pH46_md2', '4HFI_F238L_pH46_md3',
    '4HFI_I233T_pH46_md1', '4HFI_I233T_pH46_md2', '4HFI_I233T_pH46_md3',
    '4HFI_F238LI233T_pH46_md1', '4HFI_F238LI233T_pH46_md2', '4HFI_F238LI233T_pH46_md3',
    # 4NPQ, pH 7.0
    '4NPQ_pH70_md5', '4NPQ_pH70_md6', '4NPQ_pH70_md7',
    '4NPQ_F238L_pH70_md3', '4NPQ_F238L_pH70_md4', '4NPQ_F238L_pH70_md5',
    '4NPQ_I233T_pH70_md3', '4NPQ_I233T_pH70_md4', '4NPQ_I233T_pH70_md5',
    '4NPQ_F238LI233T_pH70_md3', '4NPQ_F238LI233T_pH70_md4', '4NPQ_F238LI233T_pH70_md5',
]

In [9]:
def extract_after_40(traj_note, start_frame=40, location='/media/scottzhuang/data/MD/'):
    """Write frame ``start_frame`` and the trajectory from that frame onward.

    Reads ``<location>/<traj_note>/<traj_note>.skip10.system.xtc`` with
    ``<traj_note>.system.pdb`` as topology, then writes into
    ``<location>/md_ensemble/<traj_note>/``:

    * ``<traj_note>.system.<start_frame>.pdb`` - the single frame
      ``start_frame``;
    * ``<traj_note>.skip10.system.after<start_frame>.xtc`` - all frames from
      ``start_frame`` to the end.

    Parameters
    ----------
    traj_note : str
        Run name; used both as directory name and as file-name prefix.
    start_frame : int, optional
        Index of the first frame to keep. Defaults to 40.
    location : str, optional
        Base directory that holds the per-run directories.

    Returns
    -------
    None
    """
    # Imported locally so the function stays self-contained when dispatched
    # to joblib worker processes.
    import os

    run_dir = os.path.join(location, traj_note)
    traj = md.load(os.path.join(run_dir, traj_note + ".skip10.system.xtc"),
                   top=os.path.join(run_dir, traj_note + ".system.pdb"))

    # Create the output directory if needed instead of failing on the first save.
    out_dir = os.path.join(location, 'md_ensemble', traj_note)
    os.makedirs(out_dir, exist_ok=True)

    traj[start_frame].save_pdb(
        os.path.join(out_dir, "%s.system.%d.pdb" % (traj_note, start_frame)))
    traj[start_frame:].save_xtc(
        os.path.join(out_dir, "%s.skip10.system.after%d.xtc" % (traj_note, start_frame)))

In [10]:
from joblib import Parallel, delayed
import multiprocessing

# Run extract_after_40 once per run name, using one worker per CPU core.
# NOTE(review): each task loads a full trajectory; memory use scales with the
# number of workers.
num_cores = multiprocessing.cpu_count()
worker_pool = Parallel(n_jobs=num_cores)
worker_pool(delayed(extract_after_40)(note) for note in traj_notes)

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [None]:
# Protein-only counterpart of the frame extraction: for every run, write frame
# 400 as a PDB and frames 400..end as an XTC next to the input files.
location = '/media/scottzhuang/data/MD/'
for traj_note in traj_notes:
    # "<run>/<run>" stem shared by all input and output file names of a run.
    stem = traj_note + '/' + traj_note
    top_location = stem + ".protein.pdb"
    traj_location = stem + ".skip1.protein.xtc"
    traj = md.load(location + traj_location, top=location + top_location)

    first_kept = traj[400]
    first_kept.save_pdb(location + stem + ".protein.400.pdb")

    remainder = traj[400:]
    remainder.save_xtc(location + stem + ".skip1.protein.after400.xtc")