In [2]:
import sys
import os
import importlib
import random
from itertools import combinations

import numpy as np
import scipy
import sklearn

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns
import prettypyplot as pplt

import mdtraj as md

import extq

In [3]:
sys.path.insert(1, "../../python")
sys.path.insert(1, "../../..")
import util
import plotting

In [4]:
pplt.load_cmaps()
plt.style.use("custom")  # custom style sheet
plt.style.use("muted")  # muted color theme from SciencePlots
colors = mpl.colors.to_rgba_array(
    [
        "#364B9A",
        "#4A7BB7",
        "#6EA6CD",
        "#98CAE1",
        "#C2E4EF",
        "#EAECCC",
        "#FEDA8B",
        "#FDB366",
        "#F67E4B",
        "#DD3D2D",
        "#A50026",
    ]
)
cm_div = mpl.colors.LinearSegmentedColormap.from_list("diverging", colors)
mpl.colormaps.register(cm_div, force=True)

In [5]:
# load committors
DATA_DIR = "/project/dinner/scguo/ci-vsd/data"
qp_du = np.load(f"{DATA_DIR}/feat2_dist_du_anton2/qp_downup_3.npy", allow_pickle=True)[8] # 50 ns
weights = np.load(f"{DATA_DIR}/feat2_dist_du_anton2/weights_3_feat5ivac.npy", allow_pickle=True)

# Bin structures by committor

In [6]:
def bin_inds(q, qstep=0.05, low=0, hi=1):
    q_arr = np.concatenate(q)
    nsteps = round((hi - low) / qstep)
    all_inds = []
    steps = np.linspace(low, hi - qstep, nsteps)
    for i, s in enumerate(steps):
        q_inds = ((q_arr >= s) & (q_arr <= s + qstep)).nonzero()[0]
        all_inds.append(q_inds)
    return steps, all_inds

# Cluster points RMSD

In [8]:
import glob
import kmedoids

## $q_+ \approx 0.25$

In [10]:
# load points (2000) at q_+ 0.2-0.3
topfile = "/project/dinner/scguo/ci-vsd/models/MD-clustering-center/civsd.psf"
q2 = md.load(glob.glob(f"{DATA_DIR}/q_bin/q2*.xtc"), top=topfile)
q3 = md.load(glob.glob(f"{DATA_DIR}/q_bin/q3*.xtc"), top=topfile)

In [16]:
import re

In [40]:
# find points with q_+ 0.2-0.3
qp_values_q2 = []
for i in range(10):
    with open(f"{DATA_DIR}/q_bin/q_2_{i}.txt", mode='r') as f:
        lines = f.readlines()
        for line in lines:
            # print(line.strip("\n"))
            if line.startswith("/beagle3"):
                # count from 237 but trajectories numbered from 3
                traj_id = int(line.split(".")[1]) + 237 - 3
                frame_id = int(line.split()[1])
                # print(traj_id, frame_id, qp_du[traj_id][frame_id])
            else:
                traj_id = int(re.search(r'\d+', line).group())
                frame_id = int(line.split()[1])
                # print(traj_id, frame_id, qp_du[traj_id][frame_id])
            qp_values_q2.append(qp_du[traj_id][frame_id])

In [50]:
qp_values_q2 = np.array(qp_values_q2)[:len(q2)]
q2_ids = ((qp_values_q2 > 0.2) & (qp_values_q2 < 0.3)).nonzero()[0]
q2_sliced = q2[q2_ids]

In [42]:
# find points with q_+ 0.2-0.3
qp_values_q3 = []
for i in range(10):
    with open(f"{DATA_DIR}/q_bin/q_3_{i}.txt", mode='r') as f:
        lines = f.readlines()
        for line in lines:
            # print(line.strip("\n"))
            if line.startswith("/beagle3"):
                # count from 237 but trajectories numbered from 3
                traj_id = int(line.split(".")[1]) + 237 - 3
                frame_id = int(line.split()[1])
                # print(traj_id, frame_id, qp_du[traj_id][frame_id])
            else:
                traj_id = int(re.search(r'\d+', line).group())
                frame_id = int(line.split()[1])
                # print(traj_id, frame_id, qp_du[traj_id][frame_id])
            qp_values_q3.append(qp_du[traj_id][frame_id])

In [51]:
qp_values_q3 = np.array(qp_values_q3)[:len(q3)]
q3_ids = ((qp_values_q3 > 0.2) & (qp_values_q3 < 0.3)).nonzero()[0]
q3_sliced = q3[q3_ids]

In [52]:
len(q2_sliced), len(q3_sliced)

(998, 1005)

In [35]:
combined_q2q3 = q2_sliced.join(q3_sliced)

In [36]:
prot_noh_ids = combined_q2q3.top.select("protein and not element H")

In [38]:
distances_noh = np.empty((combined_q2q3.n_frames, combined_q2q3.n_frames))
for i in range(combined_q2q3.n_frames):
    distances_noh[i] = md.rmsd(combined_q2q3, combined_q2q3, i, atom_indices=prot_noh_ids)
print('Max pairwise rmsd: %f nm' % np.max(distances_noh))

Max pairwise rmsd: 0.878240 nm


In [None]:
all_c = []
for i in range(1, 4):
    c = kmedoids.fasterpam(distances_noh, i)
    print(c)
    all_c.append(c)

KMedoidsResult(loss=688.0804843902588, labels=[0 0 0 ... 0 0 0], medoids=[1659], n_iter=1, n_swaps=1)
KMedoidsResult(loss=652.4815418571234, labels=[0 1 1 ... 1 1 0], medoids=[ 143 1560], n_iter=2, n_swaps=14)
KMedoidsResult(loss=633.7204019203782, labels=[1 1 2 ... 1 1 0], medoids=[ 749 1637 1406], n_iter=2, n_swaps=21)


In [None]:
for c in all_c:
    print("Result:")
    for m in c.medoids:
        if m < 998:
            m2 = q2_ids[m]
            i = m2 // 200
            j = m2 % 200
            with open(f"../../data/q_bin/q_2_{int(i)}.txt", mode='r') as f:
                lines = f.readlines()
                print(lines[int(j)].strip("\n"))
        else:
            m2 = q3_ids[int(m) - 998]
            i = m2 // 200
            j = m2 % 200
            with open(f"../../data/q_bin/q_3_{int(i)}.txt", mode='r') as f:
                lines = f.readlines()
                print(lines[int(j)].strip("\n"))

Result:
/project/dinner/scguo/anton-old/civsd_182.dcd	2640
Result:
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.6.dcd	4040
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.18.dcd	4352
Result:
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.94.dcd	1616
/project/dinner/scguo/anton-old/civsd_21.dcd	2174
/project/dinner/scguo/anton-old/civsd_179.dcd	38272


In [58]:
np.save("../../data/q_bin/q23_rmsd_noh_square.npy", distances_noh)
# distances = np.load("../../data/q_bin/q5_rmsd_noh_square.npy")

## $q_+\approx 0.75$

In [61]:
# load points (2000) at q_+ 0.7-0.8
topfile = "/project/dinner/scguo/ci-vsd/models/MD-clustering-center/civsd.psf"
q7 = md.load(glob.glob(f"{DATA_DIR}/q_bin/q7*.xtc"), top=topfile)
q8 = md.load(glob.glob(f"{DATA_DIR}/q_bin/q8*.xtc"), top=topfile)

In [66]:
qp_values_q7 = []
for i in range(10):
    with open(f"{DATA_DIR}/q_bin/q_7_{i}.txt", mode='r') as f:
        lines = f.readlines()
        for line in lines:
            print(line.strip("\n"))
            if line.startswith("/beagle3"):
                # count from 237 but trajectories numbered from 3
                traj_id = int(line.split(".")[1]) + 237 - 3
                frame_id = int(line.split()[1])
                print(traj_id, frame_id, qp_du[traj_id][frame_id])
            else:
                traj_id = int(re.search(r'\d+', line).group())
                frame_id = int(line.split()[1])
                print(traj_id, frame_id, qp_du[traj_id][frame_id])
            qp_values_q7.append(qp_du[traj_id][frame_id])

/project/dinner/scguo/anton-old/civsd_222.dcd	4183
222 4183 0.9273329783955526
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.63.dcd	3971
297 3971 0.7330071428569428
/project/dinner/scguo/anton-old/civsd_257.dcd	5192
257 5192 0.6705001350660104
/project/dinner/scguo/anton-old/civsd_265.dcd	5101
265 5101 0.9295971828323967
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.88.dcd	9213
322 9213 0.7244689494892748
/project/dinner/scguo/anton-old/civsd_181.dcd	90642
181 90642 0.7029652635305124
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.84.dcd	1435
318 1435 0.7371785790859159
/project/dinner/scguo/anton-old/civsd_222.dcd	762
222 762 0.9459737845956803
/project/dinner/scguo/anton-old/civsd_181.dcd	28189
181 28189 0.6994977372182601
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.77.dcd	8140
311 8140 0.712682883598523
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.23.dcd	9733
257 9733 0.7055495793087161
/project/dinner/scguo/anton-old/civsd_181.dcd	29874
181 29874 0.722594

In [67]:
qp_values_q7 = np.array(qp_values_q7)[:len(q7)]
q7_ids = ((qp_values_q7 > 0.7) & (qp_values_q7 < 0.8)).nonzero()[0]
q7_sliced = q7[q7_ids]

In [69]:
qp_values_q8 = []
for i in range(10):
    with open(f"{DATA_DIR}/q_bin/q_8_{i}.txt", mode='r') as f:
        lines = f.readlines()
        for line in lines:
            print(line.strip("\n"))
            if line.startswith("/beagle3"):
                # count from 237 but trajectories numbered from 3
                traj_id = int(line.split(".")[1]) + 237 - 3
                frame_id = int(line.split()[1])
                print(traj_id, frame_id, qp_du[traj_id][frame_id])
            else:
                traj_id = int(re.search(r'\d+', line).group())
                frame_id = int(line.split()[1])
                print(traj_id, frame_id, qp_du[traj_id][frame_id])
            qp_values_q8.append(qp_du[traj_id][frame_id])

/project/dinner/scguo/anton-old/civsd_87.dcd	3258
87 3258 0.8098559721620057
/project/dinner/scguo/anton-old/civsd_185.dcd	74166
185 74166 0.8375965763592461
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.86.dcd	6286
320 6286 0.7626614222693222
/project/dinner/scguo/anton-old/civsd_208.dcd	5614
208 5614 0.9363724420457858
/project/dinner/scguo/anton-old/civsd_185.dcd	10788
185 10788 0.8412049297766231
/project/dinner/scguo/anton-old/civsd_99.dcd	8498
99 8498 0.8336251397988289
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.88.dcd	685
322 685 0.7718775042943368
/project/dinner/scguo/anton-old/civsd_87.dcd	5724
87 5724 0.7587691876056765
/project/dinner/scguo/anton-old/civsd_147.dcd	7407
147 7407 0.8486380322468041
/project/dinner/scguo/anton-old/civsd_97.dcd	459
97 459 0.827217244827135
/project/dinner/scguo/anton-old/civsd_137.dcd	9037
137 9037 0.8001227567817885
/project/dinner/scguo/anton-old/civsd_221.dcd	6464
221 6464 0.9246630242675399
/project/dinner/scguo/anton-old/civsd

In [70]:
qp_values_q8 = np.array(qp_values_q8)[:len(q8)]
q8_ids = ((qp_values_q8 > 0.7) & (qp_values_q8 < 0.8)).nonzero()[0]
q8_sliced = q8[q8_ids]

In [71]:
len(q7_sliced), len(q8_sliced)

(789, 490)

In [72]:
combined_q7q8 = q7_sliced.join(q8_sliced)

In [73]:
distances_noh = np.empty((combined_q7q8.n_frames, combined_q7q8.n_frames))
for i in range(combined_q7q8.n_frames):
    distances_noh[i] = md.rmsd(combined_q7q8, combined_q7q8, i, atom_indices=prot_noh_ids)
print('Max pairwise rmsd: %f nm' % np.max(distances_noh))

Max pairwise rmsd: 0.817945 nm


In [74]:
all_c = []
for i in range(1, 4):
    c = kmedoids.fasterpam(distances_noh, i)
    print(c)
    all_c.append(c)

KMedoidsResult(loss=500.35512090474367, labels=[0 0 0 ... 0 0 0], medoids=[1030], n_iter=1, n_swaps=1)
KMedoidsResult(loss=441.50501349568367, labels=[0 0 1 ... 1 0 0], medoids=[776 321], n_iter=2, n_swaps=11)
KMedoidsResult(loss=420.4224669486284, labels=[0 2 1 ... 1 0 2], medoids=[776 523 310], n_iter=2, n_swaps=16)


In [75]:
for c in all_c:
    print("Result:")
    for m in c.medoids:
        if m < 998:
            m2 = q2_ids[m]
            i = m2 // 200
            j = m2 % 200
            with open(f"../../data/q_bin/q_7_{int(i)}.txt", mode='r') as f:
                lines = f.readlines()
                print(lines[int(j)].strip("\n"))
        else:
            m2 = q3_ids[int(m) - 998]
            i = m2 // 200
            j = m2 % 200
            with open(f"../../data/q_bin/q_8_{int(i)}.txt", mode='r') as f:
                lines = f.readlines()
                print(lines[int(j)].strip("\n"))

Result:
/project/dinner/scguo/anton-old/civsd_221.dcd	6227
Result:
/project/dinner/scguo/anton-old/civsd_17.dcd	447
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.91.dcd	1680
Result:
/project/dinner/scguo/anton-old/civsd_17.dcd	447
/beagle3/dinner/scguo/anton2-backup/dcdfiles/civsd.64.dcd	3308
/project/dinner/scguo/anton-old/civsd_181.dcd	49045


In [76]:
np.save("../../data/q_bin/q78_rmsd_noh_square.npy", distances_noh)
# distances = np.load("../../data/q_bin/q5_rmsd_noh_square.npy")