In [None]:
#TICA Clustering of MD Trajectory on backbone atoms (TICA feature scoring)

In [None]:
import pyemma
import glob
import numpy as np
import pyemma.plots as mplt
%pylab inline
import mdtraj as md
import pyemma.coordinates as coor
 
def average_by_state(dtraj, x, nstates):
    """Average an observable over the frames assigned to each discrete state.

    Parameters
    ----------
    dtraj : array-like of int, length T
        Discrete state index of every frame.
    x : array-like, length T along its first axis
        Observable value of every frame.
    nstates : int
        Number of states; averages are computed for states 0 .. nstates-1.

    Returns
    -------
    numpy.ndarray of shape (nstates,)
        Mean of ``x`` over the frames of each state. A state that no frame
        is assigned to yields NaN.

    Raises
    ------
    AssertionError
        If ``dtraj`` and ``x`` do not have the same number of frames.
    """
    # Coerce to arrays first: with a plain list, ``dtraj == i`` is a single
    # bool instead of a per-frame mask and the state selection breaks.
    dtraj = np.asarray(dtraj)
    x = np.asarray(x)
    assert len(dtraj) == len(x), "dtraj and x must have the same number of frames"
    res = np.full(nstates, np.nan)
    for i in range(nstates):
        members = x[dtraj == i]
        # Leave NaN for unpopulated states instead of averaging an empty slice.
        if members.size:
            res[i] = members.mean()
    return res
 
def avg_by_set(x, sets):
    """Return the mean of ``x`` over each index set in ``sets``.

    Entry ``k`` of the result is the mean of ``x`` at the indices contained in
    ``sets[k]``, so the output follows the order of ``sets`` itself. This
    matters because the set order of a coarse-grained TPT object can differ
    from the order of the sets that were passed in to build it.

    Parameters
    ----------
    x : numpy.ndarray
        Values indexed by the members of each set.
    sets : sequence of iterables of int
        Index sets, accessed by position ``0 .. len(sets)-1``.

    Returns
    -------
    numpy.ndarray of shape (len(sets),)
        Per-set means as floats.
    """
    n_sets = len(sets)
    per_set_means = (np.mean(x[list(sets[k])]) for k in range(n_sets))
    return np.fromiter(per_set_means, dtype=float, count=n_sets)

In [None]:
# Input data: the five strided trajectories md1..md5 and their shared topology.
# The directory is defined once so the data set can be relocated in one place.
traj_dir = '/net/jam-amaro-shared/bccgc4/Strided_Traj'
trajfile = [traj_dir + '/joined_traj_md' + str(i) + '.nc' for i in range(1, 6)]

top = traj_dir + '/protein.h5'

# Features: the coordinates of the atoms returned by select_Backbone().
# No other feature type (e.g. distances) is added in this notebook.
positions_feat = coor.featurizer(top)
positions_feat.add_selection(positions_feat.select_Backbone())

# Data source that applies the featurizer to all trajectory files.
inp = coor.source(trajfile, positions_feat)
print(inp)
# Length of the first trajectory only, and the number of features per frame.
print('trajectory length = ',inp.trajectory_length(0))
print('number of dimension = ',inp.dimension())
  

In [None]:
# Run TICA on the featurized trajectories.
# The lag time lives in its own variable because the timescale estimate
# printed after the projection reuses it.
lag = 1000
tica_obj = coor.tica(
    inp,
    lag=lag,
    var_cutoff=0.95,    # variance cutoff passed to TICA
    kinetic_map=False,  # kinetic-map option explicitly switched off
)


In [None]:
# Project the data onto the TICs. get_output() returns a list with one array
# per input trajectory; the whole list is kept in Y because the clustering
# and plotting cells consume all trajectories.
Y = tica_obj.get_output()

# Pool the frames of all trajectories before taking statistics. Calling
# np.mean / np.var on the list itself stacks equal-length trajectories and
# reduces over the trajectory axis (and breaks for unequal lengths), which is
# not the per-TIC mean / variance this cell is meant to report.
Y_pooled = np.concatenate(Y)
print('Projected data shape = ', Y_pooled.shape)

print('Mean values: ', np.mean(Y_pooled, axis=0))
print('Variances:   ', np.var(Y_pooled, axis=0))
# Implied timescales t_i = -lag / ln(lambda_i) of the first five TICs,
# expressed in the same units as `lag`.
print('Implied timescales (first 5 TICs): ', -lag/np.log(tica_obj.eigenvalues[:5]))


In [None]:
# Cluster the TICA-projected frames into 10 clusters with k-means.
# fixed_seed=True pins the random initialisation so that cluster indices
# (and therefore the colours, populations and centroids derived from them
# below) are reproducible after Restart & Run All.
cl = coor.cluster_kmeans(data=Y, k=10, max_iter=30, fixed_seed=True)


In [None]:
# For later use, keep the discrete trajectories (one array of cluster indices
# per input trajectory) and the first two coordinates of the cluster centers.
dtrajs = cl.dtrajs
cc_x = cl.clustercenters[:,0]
cc_y = cl.clustercenters[:,1]

print(dtrajs)
# Total number of assigned frames. The lengths are summed per trajectory
# because np.size() on a list of arrays only gives the frame total when all
# trajectories happen to have the same length.
print(np.sum([len(d) for d in dtrajs]))

In [None]:
# Display the clustering object's `converged` flag - presumably True only if
# k-means converged within the max_iter=30 iterations allowed above (confirm
# against the PyEMMA docs). If it is False, consider raising max_iter.
cl.converged

In [None]:
# Free-energy surfaces for the leading TICs (up to TIC 0-5).
# One figure per y-axis TIC (s); within a figure, one panel per x-axis TIC (w).
# The cluster centers are overlaid, one colour per cluster.

# Pool all trajectories once; re-stacking inside the loops copied the whole
# data set twice for every panel.
Y_all = np.vstack(Y)
centers = cl.clustercenters
# Plot at most 6 TICs, fewer if TICA kept fewer dimensions.
n_tics = 6 if Y_all.shape[1] >= 6 else Y_all.shape[1]
colors = ['black','gray','red','saddlebrown','darkorange','gold','darkgreen','aqua','darkviolet','deeppink']

for s in range(n_tics):
    # squeeze=False keeps `ax` indexable even when only one panel is drawn
    fig, ax = plt.subplots(1, n_tics, sharex='col', sharey='row', figsize = (20,3.4), squeeze=False)
    ax = ax[0]

    for w in range(n_tics):
        # x = TIC w, y = TIC s. This matches the axis labels set below and the
        # y axis shared across the row; previously the two were swapped, so the
        # labels did not describe the plotted data.
        mplt.plot_free_energy(Y_all[:,w], Y_all[:,s], ax = ax[w], cmap = 'viridis')#, cbar = False, cbar_label = None)
        cc_x = centers[:,w]
        cc_y = centers[:,s]
        for i in range(len(centers)):
            # cycle through the palette if there are more clusters than colours
            ax[w].scatter(cc_x[i], cc_y[i], color = colors[i % len(colors)])
        ax[w].set(xlabel = ('TIC '+str(w)))

    # common y-axis label for the whole row
    fig.text(0.001, 0.5, 'TIC '+str(s), va = 'center', rotation='vertical')

    fig.suptitle('TICA Cluster Centroids',fontsize = 16, y=1.06)
    fig.tight_layout()

    # save this figure explicitly rather than whatever pyplot considers current
    fig.savefig('/home/jegan/Clustering_methods/TICA/figs/bkbnpos/TICA_FE_TIC'+str(s)+'.png', bbox_inches = 'tight')


In [None]:
# Display the correlation between the input features and the TICs, i.e. how
# much each backbone coordinate contributes to each TIC.
# NOTE(review): per the PyEMMA docs the matrix should be laid out as
# (input features x TICs) - confirm before indexing into it.
tica_obj.feature_TIC_correlation

In [None]:
# Display the cumulative variance of the TICs. This is used to judge how many
# TICs are needed to retain the variance we asked for (var_cutoff=0.95 in the
# coor.tica call).
# NOTE(review): presumably the number of TICs kept is the first position at
# which this array reaches 0.95 - confirm against the PyEMMA docs.
tica_obj.cumvar

In [None]:
# Print the discrete trajectories saved earlier, then write the percentage of
# frames that fall into each cluster to a text file.
print(dtrajs)
print(dtrajs[0])

# Count the frames per cluster in a single pass over all trajectories.
# .tolist() converts to plain Python ints so the percentages are formatted
# exactly like ordinary Python floats in the output file.
n_clusters = len(cl.clustercenters)
numb = np.bincount(np.concatenate(dtrajs), minlength=n_clusters)[:n_clusters].tolist()

# Use the real number of frames as the denominator; a hard-coded total gives
# wrong percentages as soon as the trajectories or the stride change.
total_frames = int(np.sum([len(d) for d in dtrajs]))

tot = 0
for count in numb:
    print(count)
    tot += count

# The file is opened only after the counting succeeded, so a failure above
# cannot leave a truncated result file behind.
with open('/home/jegan/Clustering_methods/TICA/clusters_bkbnpos.txt', 'w') as newfile:
    newfile.write('Percent of frames per TICA bkbnpos cluster:\n')
    for index, count in enumerate(numb):
        percent = (count/total_frames)*100 if total_frames else 0.0
        newfile.write('Cluster '+ str(index)+' = '+str(percent)+' %\n')

# Sanity check: should equal the total number of frames.
print(tot)


In [None]:
#Extracting Centroids: for every cluster, find the frame that lies closest to
#the cluster center in TICA space.

centers = cl.clustercenters
n_clusters = len(centers)

# Smallest frame-to-center distance found so far for each cluster. Starting
# from infinity guarantees that every populated cluster gets a representative;
# a finite start value silently drops clusters whose frames are all farther
# away than that value.
min_dist = np.full(n_clusters, np.inf)

# cluster id -> [trajectory index, frame index within that trajectory]
indices = {}
for i in range(len(Y)):
    labels = np.asarray(cl.dtrajs[i])
    # distance of every frame to the center of the cluster it is assigned to
    dist = np.linalg.norm(Y[i] - centers[labels], axis=1)
    for c in np.unique(labels):
        members = np.flatnonzero(labels == c)
        k = members[np.argmin(dist[members])]
        # strict '<' keeps the earliest frame on ties
        if dist[k] < min_dist[c]:
            min_dist[c] = dist[k]
            indices[int(c)] = [i, int(k)]

#trajectory=md.load(['/net/jam-amaro-shared/bccgc4/Strided_Traj/joined_traj_md1.nc', '/net/jam-amaro-shared/bccgc4/Strided_Traj/joined_traj_md2.nc', '/net/jam-amaro-shared/bccgc4/Strided_Traj/joined_traj_md3.nc', '/net/jam-amaro-shared/bccgc4/Strided_Traj/joined_traj_md4.nc', '/net/jam-amaro-shared/bccgc4/Strided_Traj/joined_traj_md5.nc'], top="/net/jam-amaro-shared/bccgc4/Strided_Traj/protein.h5")

# Index of the first frame of each trajectory when all trajectories are
# concatenated in order (as the md.load([...]) call above would do).
# The global index of frame k in trajectory i is offsets[i] + k.
offsets = np.concatenate(([0], np.cumsum([len(y) for y in Y])[:-1]))

# Go through the clusters in id order so that output (and any saved file)
# is labelled with the actual cluster id.
for c in sorted(indices):
        traj_idx, frame_idx = indices[c]
        index = int(offsets[traj_idx]) + frame_idx
        print('cluster', c, ': trajectory', traj_idx, 'frame', frame_idx, 'global frame', index)
        #i = trajectory[index].topology.select("protein")
        #new_traj = trajectory[index].atom_slice(i)
        #new_traj.save_pdb('/home/jegan/Clustering_methods/TICA/TICA_bkbnpos_feat_joinedtraj_centroids/TICA_%s.pdb' % c)