# String Method Analysis Markov-State-Models
## Imports

In [None]:
%load_ext lab_black

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.ticker import MultipleLocator, FormatStrFormatter, AutoMinorLocator
import os
import pickle
import sys
import logging
import numpy as np
import pyemma

# Only let matplotlib messages of severity ERROR or higher through.
logging.getLogger("matplotlib").setLevel(logging.ERROR)
# Drop messages below WARNING from the blib2to3 parser-driver logger.
logging.getLogger("blib2to3.pgen2.driver").setLevel(logging.WARNING)
# NOTSET: the "pyemma" logger gets no threshold of its own and defers to the
# effective level of its parent loggers.
logging.getLogger("pyemma").setLevel(logging.NOTSET)
# Make the `stringmethod` package importable. The entry is a relative path, so
# it is resolved against the current working directory when the import runs.
sys.path.append("../string-method-gmxapi/")
from stringmethod.config import *
from stringmethod.postprocessing import *

In [None]:
from scipy import stats

In [None]:
def colorbar(mappable, cmap, norm, label0, size=10):
    """Attach a standalone colorbar to the right of an existing axes.

    The colorbar is built purely from ``cmap`` and ``norm``; it is not linked
    to the data drawn in the axes, so the caller must pass the same colormap
    and value range that were used for the plot.

    Parameters
    ----------
    mappable : object with an ``axes`` attribute
        Only ``mappable.axes`` is read; it selects the axes that the
        colorbar is placed next to.
    cmap : matplotlib colormap
        Colormap shown in the colorbar.
    norm : matplotlib normalization
        Maps data values onto the colormap range.
    label0 : str
        Text of the colorbar label.
    size : int, optional, default=10
        Font size of the label.

    Returns
    -------
    The colorbar object that was created.
    """
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    # Split off a strip (5% of the host width) on the right-hand side of the
    # host axes to hold the colorbar.
    divider = make_axes_locatable(mappable.axes)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    cbar = mpl.colorbar.ColorbarBase(cax, cmap=cmap, norm=norm)
    cbar.set_label(label0, size=size)
    return cbar

In [None]:
def load_swarm_data(extract, first_iteration=1, last_iteration=None):
    """Return the CV coordinates stored in ``postprocessing/cv_coordinates.npy``.

    Parameters
    ----------
    extract : bool
        If true, first run a ``CvValueExtractor`` configured from
        ``config.json`` and persist its result before loading.
        (Presumably ``persist()`` is what writes the ``.npy`` file read
        below; confirm against ``stringmethod.postprocessing``.)
        If false, the existing file is loaded as is.
    first_iteration : int, optional, default=1
        First iteration handed to the extractor; raise it to exclude the
        initial iterations while the system equilibrates.
    last_iteration : int or None, optional, default=None
        Last iteration handed to the extractor; useful to cut the simulation
        into blocks. ``None`` means no upper bound (``sys.maxsize``).

    Returns
    -------
    numpy.ndarray
        The array loaded from ``postprocessing/cv_coordinates.npy``
        (path relative to the current working directory).
    """
    if last_iteration is None:
        last_iteration = sys.maxsize
    if extract:
        config = load_config("config.json")

        ce = CvValueExtractor.from_config(
            config=config,
            first_iteration=first_iteration,  # Exclude the first iterations to let the system equilibrate.
            last_iteration=last_iteration,  # Useful to make blocks of the simulation.
        )
        ce.run()
        ce.persist()
    return np.load("postprocessing/cv_coordinates.npy")

In [None]:
# Shared figure styling: light axes background with a solid white grid drawn
# on every axes, and rounded caps on solid lines.
plt.rcParams.update(
    {
        "axes.facecolor": "#f9f9fb",
        "grid.color": "white",
        "grid.linestyle": "-",
        "grid.linewidth": 2,
        "axes.grid": True,
        "lines.solid_capstyle": "round",
    }
)

## Load data

In [None]:
%ls ../data/raw

In [None]:
# Exactly one data set is analysed per run. Other runs available under the
# same root are "C2I_lb_v2/" and "C2I_lb_v1/"; swap the last path component
# to switch.
simulation_directory = "/data/sperez/Projects/string_sims/data/raw/C2I_v1/"
# All relative paths used below (cv.pkl, config.json, postprocessing/...) are
# resolved against this directory from here on.
os.chdir(simulation_directory)
os.getcwd()

In [None]:
# cv.pkl (read from the current working directory) must hold a 2-item
# sequence, unpacked here as (cvs, ndx_groups).
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only open cv.pkl files that you trust.
with open("cv.pkl", "rb") as file:
    cvs, ndx_groups = pickle.load(file)

The `load_swarm_data` function loads the swarm data into the `cv_coordinates` array. If you set `extract=True`, it reads the data from the swarm files. If you have done this previously, you can set `extract=False` so the function just reads `postprocessing/cv_coordinates.npy`. `first_iteration` can be used to exclude the initial swarms as equilibration, and `last_iteration` can be used to exclude later iterations, for example if you want to estimate the FES convergence by comparing blocks of data.

In [None]:
# Re-extract the CV values from the swarm files, starting at iteration 100 so
# that earlier iterations are left out; last_iteration=None keeps the upper
# bound open.
cv_coordinates0 = load_swarm_data(
    extract=True, first_iteration=100, last_iteration=None
)

## Choose cvs to use

### CVs for MSMs

In [None]:
# pyemma takes a list of trajectories: split the 3-D array along its first
# axis into one 2-D array per entry.
data = [cv_coordinates0[i, :, :] for i in range(cv_coordinates0.shape[0])]

In [None]:
# Replace the raw CVs by their TICA projection (lag of 1 frame). get_output()
# yields the transformed trajectories; these are what the clustering cells
# below operate on.
data = pyemma.coordinates.tica(data, lag=1).get_output()

### CVs for projection

In [None]:
# Two projection coordinates, each the element-wise average of a pair of CV
# columns: (0, 1) and (10, 11). The `a:b` slices keep the last axis, so the
# two averaged columns can be joined along axis 2.
# NOTE(review): a later plot labels these axes "SF (nm)" and "IG (nm)" —
# confirm that columns 0/1 and 10/11 are those distances.
cv_proj = np.concatenate(
    [
        np.mean([cv_coordinates0[:, :, 0:1], cv_coordinates0[:, :, 1:2]], axis=0),
        np.mean([cv_coordinates0[:, :, 10:11], cv_coordinates0[:, :, 11:12]], axis=0),
    ],
    axis=2,
)

### Prepare data for pyemma

In [None]:
# Same list-of-trajectories layout as the MSM features: one 2-D array per
# entry along the first axis of cv_proj.
data_proj = [cv_proj[i, :, :] for i in range(cv_proj.shape[0])]

In [None]:
# Join all MSM-feature trajectories end to end (along the first axis).
data_concat = np.concatenate(data)

In [None]:
# Join all projected trajectories end to end, in the same order as `data`, so
# that rows line up with the concatenated per-trajectory MSM weights.
data_proj_concat = np.concatenate(data_proj)

## Cluster

In [None]:
def score_cv(data, dim, lag, number_of_splits=10, validation_fraction=0.5):
    """Compute a cross-validated VAMP2 score.

    We randomly split the list of independent trajectories into
    a training and a validation set, compute the VAMP2 score,
    and repeat this process several times.

    Parameters
    ----------
    data : list of numpy.ndarrays
        The input data.
    dim : int
        Number of processes to score; equivalent to the dimension
        after projecting the data with VAMP2.
    lag : int
        Lag time for the VAMP2 scoring.
    number_of_splits : int, optional, default=10
        How often do we repeat the splitting and score calculation.
    validation_fraction : float, optional, default=0.5
        Fraction of trajectories which should go into the validation
        set during a split.

    Returns
    -------
    numpy.ndarray
        One score per split, shape ``(number_of_splits,)``.

    Notes
    -----
    pyemma progress bars are switched off while scoring. The setting that
    was active on entry is put back afterwards, also when an estimation
    raises.
    """
    # Suppress the very short-lived progress bars for the duration of the
    # scoring, remembering what the caller had configured.
    previous_setting = pyemma.util.config.show_progress_bars
    pyemma.util.config.show_progress_bars = False
    try:
        nval = int(len(data) * validation_fraction)
        scores = np.zeros(number_of_splits)
        for n in range(number_of_splits):
            # Indices of the validation trajectories; a set makes the two
            # membership filters below cheap.
            ival = set(
                np.random.choice(len(data), size=nval, replace=False).tolist()
            )
            vamp = pyemma.coordinates.vamp(
                [d for i, d in enumerate(data) if i not in ival], lag=lag, dim=dim
            )
            scores[n] = vamp.score([d for i, d in enumerate(data) if i in ival])
    finally:
        pyemma.util.config.show_progress_bars = previous_setting
    return scores

In [None]:
%%time
# Candidate numbers of k-means centers to compare.
n_clustercenters = [5, 10, 30, 50, 75, 100, 200, 500]
# One row per candidate k, one column per repeated clustering + MSM estimate.
scores = np.zeros((len(n_clustercenters), 5))
pyemma.util.config.show_progress_bars = True
clusters = []
for n, k in enumerate(n_clustercenters):
    print(k)
    # The number of repeats is taken from the score table so the two cannot
    # get out of sync.
    for m in range(scores.shape[1]):
        _cl = pyemma.coordinates.cluster_kmeans(data, k=k, max_iter=50, stride=10)
        _msm = pyemma.msm.estimate_markov_model(_cl.dtrajs, 1)
        # Never score more processes than there are states.
        scores[n, m] = _msm.score_cv(
            _cl.dtrajs, n=1, score_method="VAMP2", score_k=min(10, k)
        )
    # Only the clustering from the last repeat is kept for each k.
    clusters.append(_cl)
fig, ax = plt.subplots()
# 90% interval over the repeats for every k, drawn as a band around the mean.
lower, upper = pyemma.util.statistics.confidence_interval(scores.T.tolist(), conf=0.9)
ax.fill_between(n_clustercenters, lower, upper, alpha=0.3)
ax.plot(n_clustercenters, np.mean(scores, axis=1), "-o")
ax.semilogx()
ax.set_xlabel("number of cluster centers")
ax.set_ylabel("VAMP-2 score")
fig.tight_layout()

In [None]:
def get_cluster_plot(data, data_proj_concat, dmin, ax):
    """Cluster ``data`` with regular-space clustering and plot the centers.

    The density of ``data_proj_concat`` is drawn on ``ax`` as a faint
    background and the cluster centers are scattered on top of it.

    Parameters
    ----------
    data : list of numpy.ndarrays
        Trajectories to cluster.
    data_proj_concat : numpy.ndarray
        Concatenated two-column data used for the background density.
    dmin : float
        Minimum distance between cluster centers, passed to
        ``pyemma.coordinates.cluster_regspace``.
    ax : matplotlib axes
        Axes to draw on.

    Returns
    -------
    tuple
        ``(ax, cluster)``: the axes and the pyemma clustering object.

    Notes
    -----
    The scatter call unpacks one positional argument per column of the
    cluster centers, so it presumably only works when ``data`` is
    two-dimensional (to be confirmed).

    A later cell defines another ``get_cluster_plot`` (k-means based, with a
    different return value). Once that cell has run, it replaces this one.
    """
    cluster = pyemma.coordinates.cluster_regspace(data, dmin=dmin)
    pyemma.plots.plot_density(*data_proj_concat.T, ax=ax, cbar=False, alpha=0.3)
    ax.scatter(*cluster.clustercenters.T, s=5, c="C1")
    ax.set_xlabel("CV 0")
    ax.set_ylabel("CV 1")
    return ax, cluster

In [None]:
def get_cluster_plot(data, data_proj_concat, k, ax):
    """Cluster ``data`` with k-means and return the clustering object.

    Despite its name, this variant draws nothing. ``data_proj_concat`` and
    ``ax`` are accepted only so that existing call sites keep working.

    Parameters
    ----------
    data : list of numpy.ndarrays
        Trajectories to cluster.
    data_proj_concat : numpy.ndarray
        Unused.
    k : int
        Number of cluster centers.
    ax : matplotlib axes
        Unused.

    Returns
    -------
    The pyemma k-means clustering object (every frame is used, ``stride=1``).
    """
    # A fixed seed keeps the k-means result reproducible between runs.
    return pyemma.coordinates.cluster_kmeans(
        data, k=k, max_iter=500, stride=1, fixed_seed=1
    )

In [None]:
# Number of k-means centers used for the MSM below.
k = 50
# NOTE(review): the k-means get_cluster_plot defined just above has its
# plotting code commented out, so the figure created here stays empty.
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
cluster = get_cluster_plot(data, data_proj_concat, k, ax)
fig.tight_layout()

## MSM

In [None]:
# Bayesian MSM on the discrete trajectories. The lag time is one trajectory
# step, and dt_traj declares one step to be 0.01 ns. 500 samples are drawn,
# and reversible=False means no detailed-balance constraint is imposed.
msm = pyemma.msm.bayesian_markov_model(
    cluster.dtrajs,
    lag=1,
    dt_traj="0.01 ns",
    nsamples=500,
    sparse=True,
    reversible=False,
)
# Values below 1 mean that some states / transition counts were left out of
# the active set the model was estimated on.
print("fraction of states used = {:.2f}".format(msm.active_state_fraction))
print("fraction of counts used = {:.2f}".format(msm.active_count_fraction))

In [None]:
# Free energy on a regular 2-D grid from an MSM-weighted kernel density
# estimate of the projected data.
m1 = data_proj_concat[:, 0]
m2 = data_proj_concat[:, 1]
xmin = m1.min()
xmax = m1.max()
ymin = m2.min()
ymax = m2.max()
nbins = 55
# An imaginary step tells np.mgrid to produce that many points per axis,
# end points included.
nbins = nbins * 1j
X, Y = np.mgrid[xmin:xmax:nbins, ymin:ymax:nbins]
# 2 x n_grid_points array of evaluation coordinates.
positions = np.vstack([X.ravel(), Y.ravel()])
# Not used again in this cell.
values = np.vstack([m1, m2])
# Each frame is weighted by the MSM trajectory weights; the per-trajectory
# weight arrays are concatenated in the same order as the data.
kernel = stats.gaussian_kde(
    data_proj_concat.T,
    weights=np.concatenate(msm.trajectory_weights()),
    bw_method=0.1,
)
# -log(density), shifted so that the minimum is zero.
Z = -np.log(np.reshape(kernel(positions), X.shape))
Z -= Z.min()
# mgrid indexes Z as [x, y]; transpose to the [row = y, column = x] layout
# that contour plotting expects.
Z = Z.T
# Blank out everything above 40 (this also catches +inf from zero density).
Z[Z > 40] = np.nan

In [None]:
# Filled contour plot of the free energy surface with a matching colorbar.
fig, ax = plt.subplots(1, 1, figsize=(10, 7), sharex=True, sharey=True)
n_colors = 50
f_min = 0
f_max = 40
cmap = plt.cm.RdYlBu_r
# n_colors samples of the colormap, running from blue over yellow to red.
colors = cmap(np.linspace(0, 1, n_colors))
norm = mpl.colors.Normalize(vmin=f_min, vmax=f_max)
cbar_label = "Free Energy (kT)"
# The same `cmap`, f_min and f_max feed both the contours and the colorbar,
# so the two cannot drift apart.
ax.contourf(
    Z,
    cmap=cmap,
    extent=[xmin, xmax, ymin, ymax],
    vmax=f_max,
    vmin=f_min,
    levels=n_colors,
)
# The helper only reads `.axes` from its first argument, so passing the axes
# itself works.
cbar = colorbar(ax, cmap, norm, cbar_label, 20)
fig.tight_layout()

In [None]:
# Draw the estimated MSM as a network of states connected by transitions.
pyemma.plots.plot_markov_model(msm)

In [None]:
sys.path.append("/home/sperez/Projects/InfleCS")
import free_energy_clustering as FEC

In [None]:
# Columns 0 and 1 of the projected data, re-joined into a two-column array
# that is handed to the free-energy clustering below.
# NOTE(review): this rebinds `data`, which earlier held the TICA output used
# for clustering. Re-running the clustering cells after this one would
# operate on this 2-column array instead.
data_0 = data_proj_concat[:, 0:1]
data_1 = data_proj_concat[:, 1:2]
data = np.concatenate((data_0, data_1), axis=1)

In [None]:
# Set up the InfleCS free-energy estimator on the 2-column projection.
# Frames are weighted with the MSM trajectory weights, concatenated in the
# same order as the data.
# NOTE(review): min/max_n_components presumably bound the number of mixture
# components that are tried — confirm against the InfleCS documentation.
fec = FEC.FreeEnergyClustering(
    data,
    min_n_components=1,
    max_n_components=20,
    temperature=1.0,
    n_iterations=5,
    n_grids=80,
    n_splits=1,
    stack_landscapes=False,
    data_weights=np.concatenate(msm.trajectory_weights()),
)

In [None]:
# Estimate the landscape; three results are returned and unpacked here.
# NOTE(review): presumably grid coordinates, free energy on the grid and free
# energy at the data points, judging by the names — confirm against InfleCS.
coords, FE_landscape, FE_points = fec.landscape()

In [None]:
# Plot the estimated landscape. The x axis is the first data column and the
# y axis the second.
# NOTE(review): with savefig=False the `filename` argument is presumably
# ignored — confirm against InfleCS.
fec.visualize(
    savefig=False,
    show_data=False,
    vmax=20,
    n_contour_levels=50,
    ylabel="IG (nm)",
    xlabel="SF (nm)",
    filename="free_energy_landscape",
    title="Free energy landscape",
)