In [1]:
import sys
import os
import importlib
import gc
import glob
import random
from itertools import combinations

import numpy as np
import sklearn
from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import ticker
import seaborn as sns

import pyemma
import msmtools
import mdtraj as md

import ivac
import extq

In [2]:
# Put project-local directories on the import path. Both are inserted at
# index 1, so the one inserted last ("../../..") ends up ahead of
# "../../python" in sys.path.
# NOTE(review): `util` and `plotting` are presumably resolved from one of
# these two directories -- confirm which, since the order decides precedence.
sys.path.insert(1, "../../python")
sys.path.insert(1, "../../..")
import util
import plotting

In [3]:
# Style sheets are applied first-to-last: the custom sheet, then the muted
# color theme from SciencePlots on top of it.
plt.style.use(["custom", "muted"])

# Two cubehelix colormaps sharing hue/rotation/gamma; they differ in the
# light end point and in direction.
cm_seq = sns.cubehelix_palette(
    start=0, rot=-0.70, gamma=0.40, light=0.9, dark=0.1, reverse=True, as_cmap=True
)
cm_seq2 = sns.cubehelix_palette(
    start=0, rot=-0.70, gamma=0.40, light=0.8, dark=0.1, reverse=False, as_cmap=True
)

# Diverging colormap linearly interpolated through 11 hex color stops.
colors = mpl.colors.to_rgba_array(
    [
        "#364B9A", "#4A7BB7", "#6EA6CD", "#98CAE1", "#C2E4EF",  # blue side
        "#EAECCC",  # middle stop
        "#FEDA8B", "#FDB366", "#F67E4B", "#DD3D2D", "#A50026",  # red side
    ]
)
cm_div = mpl.colors.LinearSegmentedColormap.from_list("", colors)

# Load data

In [4]:
# Gather the entries of both CV files into one flat list, first file first.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only use on trusted files.
cv_trajs = [
    *np.load("../../data/raw_feat/cv_dist_spin_anton.npy", allow_pickle=True),
    *np.load("../../data/raw_feat/cv_dist_spin_anton2.npy"),
]

In [5]:
# One "<R> - <N>" label per residue pair, with the first residue varying
# slowest (5 x 6 = 30 labels).
sb_labels = [
    f"{r} - {n}"
    for r in ("R217", "R223", "R226", "R229", "R232")
    for n in ("D129", "D136", "D151", "D164", "E183", "D186")
]

In [6]:
# Gather the entries of both feature files into one flat list, first file first.
# NOTE: allow_pickle=True unpickles arbitrary objects -- only use on trusted files.
#
# Alternative source kept for reference (currently disabled):
# sb_trajs = []
# for i in range(6):
#     sb_trajs.extend(np.load(f'../../data/raw_feat/feat2_suppl_{i:02d}.npy', allow_pickle=True))
sb_trajs = [
    *np.load("../../data/raw_feat/feat2_raw_anton.npy", allow_pickle=True),
    *np.load("../../data/raw_feat/feat2_raw_anton2.npy"),
]

In [7]:
# Stack all trajectories along the first axis and report the resulting shapes;
# the two frame counts are expected to agree.
cv_arr, sb_arr = np.concatenate(cv_trajs), np.concatenate(sb_trajs)
print(cv_arr.shape, sb_arr.shape)

(3170017, 2) (3170017, 60)


# Delay embed CVs

In [8]:
# Delay-embedding parameters, passed to util.delay_embed below.
n_embed = 20  # number of embedded (delayed) snapshots
delta = 10  # spacing between embedded snapshots, in frames (1 ns)

In [9]:
# Work with a reduced set of input features (column indices into sb_trajs):
# R223-D129, R226-D129, R226-D186, R229-D129, R229-D186, R232-D186
sb_ids = [36, 42, 47, 48, 53, 59]
data_orig = [frames[:, sb_ids] for frames in sb_trajs]

# Delay-embed every trajectory, then stack the embedded trajectories.
embed_trajs = util.delay_embed(data_orig, n_embed, delta)
embed_arr = np.concatenate(embed_trajs)

# Shape summary: first raw trajectory / number and shape of embedded
# trajectories / stacked embedded array.
print(
    data_orig[0].shape,
    f"{len(embed_trajs)} {embed_trajs[0].shape}",
    embed_arr.shape,
    sep="\n",
)

(10000, 6)
254 (9800, 126)
(3119217, 126)


# Cluster then embed indicators

# Chatipat's method (remove linear combinations before embedding)

In [184]:
def delay(tlist, n, max_embed, lag):
    """Slice every trajectory to the window belonging to the ``n``-th delay.

    For a trajectory with ``N`` frames the returned window is
    ``traj[lag * (max_embed - n) : N - lag * n]``. Its length is
    ``N - lag * max_embed`` regardless of ``n``, so windows taken at
    different ``n`` (with ``0 <= n <= max_embed``) have matching lengths.

    Parameters
    ----------
    tlist : sequence
        Trajectories; each must support ``len`` and slicing.
    n : int
        Index of the delay.
    max_embed : int
        Total number of delays used to size the window.
    lag : int
        Number of frames between successive delays.

    Returns
    -------
    list
        One slice per trajectory that is long enough. Trajectories with
        ``lag * max(n, max_embed)`` frames or fewer are skipped.
    """
    # A non-empty window needs more than lag * max_embed frames. Testing
    # against max(n, max_embed) instead of n alone means the same
    # trajectories are dropped for every n <= max_embed, so the outputs for
    # different n stay aligned element by element (and never contain empty
    # slices). For n > max_embed this reduces to the previous lag * n test.
    min_frames = lag * max(n, max_embed)
    windows = []
    for traj in tlist:
        n_frames = len(traj)
        if n_frames <= min_frames:
            continue
        first = lag * (max_embed - n)
        last = n_frames - lag * n
        windows.append(traj[first:last])
    return windows

In [221]:
def embed_reduce(tlist, max_embed, lag):
    """Iteratively delay, de-span and reduce trajectories to one PCA component.

    Each of the ``max_embed`` iterations:

    1. takes the ``i``-th delay window of the current data via ``delay``,
    2. passes it through ``remove_span`` together with the components
       collected so far,
    3. fits a one-dimensional PCA (``pyemma.coordinates.pca(..., dim=1)``)
       and stores the transformed data.

    Parameters
    ----------
    tlist : list
        Input trajectories.
    max_embed : int
        Number of iterations / delays.
    lag : int
        Number of frames between successive delays (forwarded to ``delay``).

    Returns
    -------
    list
        ``max_embed`` entries, one PCA-transformed result per iteration.

    Notes
    -----
    NOTE(review): ``g`` is overwritten with the PCA output each iteration, so
    from the second iteration on ``delay`` is applied to the already windowed,
    already reduced data rather than to ``tlist`` -- confirm this is intended.

    NOTE(review): ``g_list`` holds one entry per iteration, while
    ``remove_span`` pairs its second argument with trajectories via ``zip``
    -- confirm the expected layout of that argument.
    """
    g = tlist
    g_list = []
    for i in range(max_embed):
        # delay
        f_next = delay(g, i, max_embed, lag)

        # remove span of the previously collected components
        f = remove_span(f_next, g_list)

        # compute most important PCA eigenvector
        pca = pyemma.coordinates.pca(data=f, dim=1)
        g = pca.transform(f)
        g_list.append(g)
    return g_list


def remove_span(tlist, vectors):
    """Remove span of each of the vectors from a given dataset.

    Every trajectory in ``tlist`` is paired (via ``zip``) with one entry of
    ``vectors``; the projection onto each vector of that entry is subtracted
    in turn, each projection being computed from the current residual. The
    inputs are left untouched: the subtraction is carried out on a copy.

    Parameters
    ----------
    tlist : list of array_like
        Data to be orthogonalised. The projection uses ``np.dot(data, v)``,
        which as written works for 1-D arrays of the same length as ``v``.
        NOTE(review): confirm the shapes callers actually pass.
    vectors : list
        For each trajectory, an iterable of vectors to project out. If empty,
        ``tlist`` itself is returned unchanged.

    Returns
    -------
    list of numpy.ndarray
        New arrays with the projections removed; one per ``(traj, vecs)``
        pair (``zip`` stops at the shorter of the two inputs).
    """
    if len(vectors) == 0:
        return tlist
    ortho_tlist = []
    for traj, vecs in zip(tlist, vectors):
        # Work on a copy so the caller's array is not modified in place.
        residual = np.array(traj, copy=True)
        if not np.issubdtype(residual.dtype, np.inexact):
            # In-place subtraction of float projections needs a float buffer.
            residual = residual.astype(float)
        for v in vecs:
            v = np.asarray(v)
            norm_sq = np.dot(v, v)
            if not np.any(norm_sq):
                # A zero vector spans nothing; skipping avoids 0/0 -> NaN.
                continue
            residual -= v * np.dot(residual, v) / norm_sq
        ortho_tlist.append(residual)
    return ortho_tlist