# Clustering MD trajectory frames (Ward linkage on pairwise RMSD, k-means on phi dihedrals)

In [98]:

from __future__ import print_function

import os
import pickle
import random
from collections import defaultdict

import mdtraj as md
import msmbuilder.cluster as msmcluster
import numpy as np
import scipy.cluster.hierarchy
from scipy.spatial.distance import squareform

In [72]:
# Names of the trajectories to cluster. Each name is used both as the
# sub-directory and as the file-name prefix by the clustering functions below.
# One line per replica triple; order is significant only for display.
traj_notes = [
    # 5NJY / pH70
    '5NJY_pH70_md1', '5NJY_pH70_md2', '5NJY_pH70_md3',
    '5NJY_F238L_pH70_md1', '5NJY_F238L_pH70_md2', '5NJY_F238L_pH70_md3',
    '5NJY_I233T_pH70_md4', '5NJY_I233T_pH70_md2', '5NJY_I233T_pH70_md3',
    '5NJY_F238LI233T_pH70_md1', '5NJY_F238LI233T_pH70_md2', '5NJY_F238LI233T_pH70_md3',
    # 5NJY / pH46
    '5NJY_pH46_md1', '5NJY_pH46_md2', '5NJY_pH46_md3',
    '5NJY_F238L_pH46_md1', '5NJY_F238L_pH46_md2', '5NJY_F238L_pH46_md3',
    '5NJY_I233T_pH46_md1', '5NJY_I233T_pH46_md2', '5NJY_I233T_pH46_md3',
    '5NJY_F238LI233T_pH46_md1', '5NJY_F238LI233T_pH46_md2', '5NJY_F238LI233T_pH46_md3',
    # 4HFI / pH46
    '4HFI_pH46_md1', '4HFI_pH46_md2', '4HFI_pH46_md3',
    '4HFI_F238L_pH46_md1', '4HFI_F238L_pH46_md2', '4HFI_F238L_pH46_md3',
    '4HFI_I233T_pH46_md1', '4HFI_I233T_pH46_md2', '4HFI_I233T_pH46_md3',
    '4HFI_F238LI233T_pH46_md1', '4HFI_F238LI233T_pH46_md2', '4HFI_F238LI233T_pH46_md3',
    # 4NPQ / pH70
    '4NPQ_pH70_md5', '4NPQ_pH70_md6', '4NPQ_pH70_md7',
    '4NPQ_F238L_pH70_md3', '4NPQ_F238L_pH70_md4', '4NPQ_F238L_pH70_md5',
    '4NPQ_I233T_pH70_md3', '4NPQ_I233T_pH70_md4', '4NPQ_I233T_pH70_md5',
    '4NPQ_F238LI233T_pH70_md3', '4NPQ_F238LI233T_pH70_md4', '4NPQ_F238LI233T_pH70_md5',
]

In [93]:
def rmsd_ward_cluster(traj_note, location='/media/scottzhuang/data/MD/', skip=10,
                      n_clusters=3, n_leaders_per_cluster=2, seed=None):
    """Cluster the frames of one trajectory by pairwise RMSD with Ward linkage.

    The trajectory ``<location>/<traj_note>/<traj_note>.skip<skip>.ca.xtc`` is
    loaded with the topology ``<traj_note>.ca.pdb`` from the same directory,
    restricted to ``resSeq 198 to 315`` and superposed on its first frame.
    An all-against-all RMSD matrix is clustered hierarchically, a few random
    "leader" frames are drawn from every cluster, and each frame is finally
    assigned the label of its nearest leader. The per-frame labels are printed
    and pickled to ``<location>/<traj_note>/ward_cluster_labels.pickle``.

    Parameters
    ----------
    traj_note : str
        Trajectory name; used as both directory name and file-name prefix.
    location : str
        Root directory that contains one sub-directory per trajectory.
    skip : int
        Stride encoded in the trajectory file name (``.skip<skip>.``).
    n_clusters : int
        Maximum number of flat clusters requested from the linkage tree.
    n_leaders_per_cluster : int
        Number of leader frames drawn (with replacement) from each cluster.
    seed : int or None
        If given, leaders are drawn from ``np.random.RandomState(seed)`` so the
        result is reproducible; if None the global NumPy random state is used.

    Returns
    -------
    None
        Results are printed and written to the pickle file.
    """
    traj_dir = os.path.join(location, traj_note)
    top_path = os.path.join(traj_dir, traj_note + '.ca.pdb')
    traj_path = os.path.join(traj_dir, traj_note + '.skip' + str(skip) + '.ca.xtc')

    traj = md.load(traj_path, top=top_path)
    traj = traj.atom_slice(traj.topology.select('resSeq 198 to 315'))
    traj.superpose(traj, 0)
    # NOTE(review): box information is dropped, presumably so frames can be
    # joined with the box-less `leaders` trajectory built below - confirm.
    traj.unitcell_lengths = None
    traj.unitcell_angles = None

    # Row i holds the RMSD of every frame to frame i.
    distances = np.empty((traj.n_frames, traj.n_frames))
    for i in range(traj.n_frames):
        distances[i] = md.rmsd(traj, traj, i)
    print('In ' + traj_note)
    print('Max pairwise rmsd: %f nm' % np.max(distances))

    # checks=False: skip squareform's symmetry/zero-diagonal validation of the
    # computed matrix.
    reduced_distances = squareform(distances, checks=False)
    # NOTE(review): SciPy documents Ward linkage for Euclidean distances; RMSD
    # is used here as in the original analysis - confirm this is intended.
    linkage = scipy.cluster.hierarchy.ward(reduced_distances)
    labels = scipy.cluster.hierarchy.fcluster(linkage, t=n_clusters, criterion='maxclust')
    print(labels)

    # cluster label -> indices of the frames carrying that label
    mapping = defaultdict(list)
    for i, label in enumerate(labels):
        mapping[label].append(i)
    print(mapping)

    rng = np.random if seed is None else np.random.RandomState(seed)
    leaders = md.Trajectory(xyz=np.empty((0, traj.n_atoms, 3)),
                            topology=traj.topology)
    leader_labels = []
    for label, indices in mapping.items():
        leaders = leaders.join(traj[rng.choice(indices, n_leaders_per_cluster)])
        leader_labels.extend([label] * n_leaders_per_cluster)
    print(leaders)
    print(leader_labels)

    # Re-assign every frame to the cluster of its closest leader.
    labels = np.array([leader_labels[np.argmin(md.rmsd(leaders, frame, 0))]
                       for frame in traj])

    # Cyclically shift the label values so that the first frame always ends up
    # in cluster 1. The shift is a bijection on 1..n_clusters, so clusters are
    # never merged (the previous `labels % 2 + 1` mapped both 1 and 3 to 2).
    labels = (labels - labels[0]) % n_clusters + 1

    print(labels)

    with open(os.path.join(traj_dir, 'ward_cluster_labels.pickle'), 'wb') as f:
        pickle.dump(labels, f, pickle.HIGHEST_PROTOCOL)

In [117]:
def msm_builder_cluster(traj_note,location = '/media/scottzhuang/data/MD/', skip=10):
    """Cluster one trajectory's frames into two groups by k-means on phi dihedrals.

    Loads ``<location><traj_note>/<traj_note>.skip<skip>.protein.xtc`` with the
    matching ``.protein.pdb`` topology, superposes it on its first frame,
    computes the phi angles of every frame and fits an msmbuilder ``KMeans``
    model with two clusters to them. The fitted ``labels_`` are printed;
    nothing is returned.
    """
    file_prefix = location + traj_note + '/' + traj_note
    structure_file = file_prefix + ".protein.pdb"
    xtc_file = file_prefix + ".skip" + str(skip) + ".protein.xtc"

    protein_traj = md.load(xtc_file, top=structure_file)
    protein_traj.superpose(protein_traj, 0)

    # compute_phi returns (atom indices, angles); only the angles are clustered.
    _, phi_angles = md.compute_phi(protein_traj)

    # The model is fitted on a list of feature arrays, here a single one.
    kmeans = msmcluster.KMeans(n_clusters=2)
    kmeans.fit([phi_angles])

    print(kmeans.labels_)

In [118]:
# Run the k-means phi-dihedral clustering on a single trajectory; the function
# prints the resulting cluster labels.
msm_builder_cluster('4HFI_F238LI233T_pH46_md2')

[array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0], dtype=int32)]


In [None]:
import multiprocessing

from joblib import Parallel, delayed

# Run the Ward/RMSD clustering for every entry of traj_notes, using as many
# joblib workers as multiprocessing reports CPUs.
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(map(delayed(rmsd_ward_cluster), traj_notes))