# String Method Analysis Markov-State-Models
## Imports

In [None]:
import os
import pickle
import sys
import logging
import numpy as np
import matplotlib.pyplot as plt
import glob
import MDAnalysis as mda
from math import ceil
from tqdm import tqdm


logging.getLogger("stringmethod").setLevel(logging.ERROR)
sys.path.append("../string-method-gmxapi/")
import src.analysis as spc

In [None]:
# NOTE(review): helper from src.analysis; presumably enables a JupyterLab
# notification when a cell finishes — confirm.
spc.jupyter_lab_notification()

In [None]:
# NOTE(review): helper from src.analysis; presumably enables a JupyterLab
# notification when a cell raises — confirm.
spc.jupyter_lab_error()

In [None]:
# Notebook extensions: lab_black formats code cells, and autoreload mode 2
# reloads imported modules before each cell is executed.
%load_ext lab_black
%load_ext autoreload
%autoreload 2

## Control Variables

In [None]:
# Switches that decide which expensive steps are recomputed and which are
# loaded from previously saved files.
extract_data = True
# The VAMP-score scan is the only step gated by the CALC_FES environment
# variable; it is recomputed only when CALC_FES is exactly "True".
calculate_vamp_scores = os.getenv("CALC_FES") == "True"
calculate_FES_errors = True
calculate_H2O_behind_SF = True
# Defined unconditionally: it used to be set only when CALC_FES was not
# "True", so the SF-occupation cell raised NameError in the CALC_FES=True case.
calculate_SF_occupation = True

## Load data

This notebook needs to run in the string simulation folder; the cells below will get you there. They also set up the path for writing the figures.

In [None]:
# Layout of the swarm data; used below to turn a flat sample index into
# (iteration, bead, swarm) and to compute frame offsets.
# NOTE(review): presumably 18 string points minus the 2 fixed endpoints — confirm.
beads_per_iter = 18 - 2
swarms_per_bead = 32
steps_per_swarm = 2

In [None]:
# Simulation to analyse: taken from the NAME_SIM environment variable, falling
# back to the default below. Other runs with entries in the lookup tables of
# this notebook: "C2I_v1_amber/", "C2I_lb_v1_amber/", "C2I_lb_v1/".
name_sim = os.getenv("NAME_SIM", "C2I_v1/")
# All paths are built by plain concatenation and the lookup tables are keyed
# with a trailing slash, so add it when NAME_SIM was given without one.
if not name_sim.endswith("/"):
    name_sim += "/"
path_raw = os.path.expanduser(f"~/Projects/string_sims/data/raw/{name_sim}")
path_interim = os.path.expanduser(f"~/Projects/string_sims/data/interim/{name_sim}")
path_processed = os.path.expanduser(f"~/Projects/string_sims/data/processed/{name_sim}")
path_XRD = os.path.expanduser("~/Projects/string_sims/models/raw_pdb/")
path_report = os.path.expanduser(f"~/Projects/string_sims/reports/figures/{name_sim}")
# Relative paths used later ("cv.pkl", "./strings/...") resolve against path_raw.
os.chdir(path_raw)
os.getcwd()

In [None]:
# cv.pkl (in the current directory) unpacks into the CV definitions and the
# index groups.
with open("cv.pkl", mode="rb") as cv_file:
    cvs, ndx_groups = pickle.load(cv_file)

The `load_swarm_data` function will load the swarm data into `cv_coordinates`. If you set `extract=True` it will read the data from the swarm files. If you have done this previously you can set `extract=False` so the function just reads `postprocessing/cv_coordinates.npy`. `first_iteration` can be used to exclude the initial swarms as equilibration, and `last_iteration` can be used to exclude some iterations, for example if you want to estimate the FES convergence by comparing blocks of data.

In [None]:
# First string iteration that is kept, looked up for the active simulation.
first_iteration = {
    "C2I_v1/": 400,
    "C2I_lb_v1/": 100,
    "C2I_v1_amber/": 100,
    "C2I_lb_v1_amber/": 100,
}[name_sim]

In [None]:
# Either reuse the cached CV coordinates or extract them from the swarm files
# and refresh the cache.
cv_coordinates_file = f"{path_interim}cv_coordinates.npy"
if not extract_data:
    cv_coordinates = np.load(cv_coordinates_file)
else:
    cv_coordinates = spc.load_swarm_data(
        extract=True, first_iteration=first_iteration, last_iteration=None
    )
    np.save(cv_coordinates_file, cv_coordinates)

In [None]:
# Read every string file in natural order; each table is transposed before
# the tables are stacked into one array.
files = spc.natural_sort(glob.glob("./strings/string[0-9]*txt"))
transposed_tables = [np.loadtxt(name).T for name in files]
strings = np.array(transposed_tables)

Load pickle with data from XRD

In [None]:
# Load the XRD reference values. The working directory is switched to
# path_XRD for the duration of the load, as before.
os.chdir(path_XRD)
try:
    xrd_pickle = os.path.expanduser(
        "~/Projects/string_sims/data/processed/XRD/XRD.pickle"
    )
    with open(xrd_pickle, "rb") as handle:
        XRD_dictionary = pickle.load(handle)
finally:
    # Always return to the simulation folder: if the load failed, later cells
    # would otherwise resolve their relative paths against path_XRD.
    os.chdir(path_raw)

# MSM modelling of free energy surface

## Dimensionality reduction with TICA

The following cell computes the TICA projection of the string cvs and discards the TICs that have the lowest kinetic variance. This reduces the cv space to a lower-dimensional space that is adapted to the kinetic variance. You can use the `drop` keyword to drop certain cvs that are not well converged in the string simulation or that change very little from the beginning to the end of the string. The best case scenario is that `drop=[]` just works.

In [None]:
# TICA projection of all CVs (nothing is dropped); `reversible=True` is
# forwarded to the helper.
tica = spc.cvs_to_tica(cv_coordinates, drop=[], reversible=True)

## Cluster

The next cell plots the "vamp score" of using `n_clustercenters` to make an MSM. You should find that at some point the vamp score saturates. Choose the minimum number of clusters that gives you the saturated vamp score as the value of k for the next steps. This might take a little while.

In [None]:
# Scan the VAMP score over the number of cluster centres (largest first).
# The scores are either recomputed and cached, or read from the cache and
# handed to the helper through `scores`.
n_clustercenters = [5, 10, 30, 50, 75, 100, 200][::-1]
vamp_scores_file = f"{path_interim}vamp_scores.npy"
vamp_options = dict(n_jobs=4, allow_failed_msms=True, reversible=True)
if calculate_vamp_scores:
    fig, ax, vamp_scores = spc.get_vamp_vs_k(
        n_clustercenters, tica, **vamp_options
    )  # 6 min
    np.save(vamp_scores_file, vamp_scores)
else:
    scores = np.load(vamp_scores_file)
    fig, ax, vamp_scores = spc.get_vamp_vs_k(
        n_clustercenters, tica, scores=scores, **vamp_options
    )

# If the calculation fails, there is something wrong with your MSM. Either you have too few transitions or there are too many cvs in TICA for all the states to be well connected. Solutions:
+ Reduce the maximum number of clusters in `n_clustercenters` (e.g. drop 200) and see if you get a saturated curve.
+ Reduce the number of cvs that went into your TICA calculation.
+ Do more iterations of the string method.

## MSM Deeptime

Choose the number of clusters, `k`, for the clustering from the previous calculation. Also change n_proc to however many processors you can use.

In [None]:
# Number of k-means clusters chosen for each simulation, looked up for the
# active one.
k = {
    "C2I_v1/": 50,
    "C2I_lb_v1/": 30,
    "C2I_v1_amber/": 75,
    "C2I_lb_v1_amber/": 75,
}[name_sim]

In [None]:
# Cluster the TICA projection into k states with a fixed seed and store the
# result with the processed data.
clusters = spc.k_means_cluster(tica, k, stride=1, max_iter=500, n_jobs=4, seed=28101990)
np.save(f"{path_processed}clusters.npy", clusters)

In [None]:
%%time
# Build the MSM from the cluster assignment; the returned weights are stored
# because every projection below reuses them.
msm, weights = spc.get_msm(clusters, n_jobs=4, reversible=True)
np.save(f"{path_processed}weights.npy", weights)

# FES projection on IG vs SF

## CVs for projection

Make a `cv_proj` numpy array with shape (n_iteration * n_swarms_iterations, n_frames_per_iter, 2). n_frames_per_iter is usually 2 since you only record the value of the cvs at the beginning and end of the swarm. The last dimension holds the cvs on which you would like to project your FES using the weights obtained from the msm. The FES is then the negative log of a *weighted* histogram of the projection cvs using the weights from the msm. The projection cvs can be anything that you can calculate for a structure, not necessarily the cvs of the string. In the example below it is the mean of two cvs.

In [None]:
# Reduce the string CVs to the two projection coordinates; the index lists
# [0, 1] and [10, 11] select the CVs passed for each coordinate.
# NOTE(review): presumably these give SF and IG, in that order — confirm.
cv_proj = spc.cvs_to_SF_IG(cv_coordinates, [0, 1], [10, 11])
np.save(f"{path_interim}cv_proj.npy", cv_proj)

## Project FES

Do the projection and take log. You have to choose a bandwidth for the [KDE](https://en.wikipedia.org/wiki/Kernel_density_estimation) of the histogram. It should be big enough to reduce noise but not so big to remove features. If you give `None`

In [None]:
# KDE bandwidth per simulation, looked up for the active one.
bandwidth = {
    "C2I_v1/": 0.05,
    "C2I_lb_v1/": 0.05,
    "C2I_v1_amber/": 0.05,
    "C2I_lb_v1_amber/": 0.05,
}[name_sim]

In [None]:
%%time
f_max = 25
p_of_cv, extent = spc.get_kde(cv_proj, weights, bandwidth)
F0 = -np.log(p_of_cv)
F = F0 - F0.min()
F[F > f_max] = np.nan

Save the FES and the extent of the grid it was computed on, so they can be reused without repeating the projection.

In [None]:
# Persist the thresholded FES and the extent returned by the KDE.
np.save(f"{path_processed}FES_SF_IG.npy", F)
np.save(f"{path_processed}extent.npy", np.array(extent))

## Plot FES

In [None]:
# FES heat map with the XRD reference values overlaid.
fes_plot_options = dict(
    f_min=0,
    f_max=f_max,
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    cbar_label="Free Energy (kT)",
)
fig, ax = spc.plot_2D_heatmap(F, extent, **fes_plot_options)
# Optional zoom:
# ax.set_xlim([0.48, 1.0])
# ax.set_ylim([1.1, 2.45])
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax)
fig.tight_layout()
fig.savefig(f"{path_report}FES.png")

## Bootstrap to get error

The problem with calculating errors in MD is that most statistical techniques for this rely on the data being uncorrelated. MD data is most of the time highly correlated due to the proximity in time and starting structure. Correlated data generates artificially low error estimates. 

For this reason we use blocking. In our case we will use blocking+bootstrapping. This is very well explained in this [very useful video](https://www.youtube.com/watch?v=gHXXGYIgasE&t=1854s) by prof. Giovanni Bussi.

The uncertainty is calculated as half of the interval containing 95% of the probability of the distribution of histograms generated in the bootstraps.

This part is probably going to be slow! Maybe it will go over night. It is actually doing len(blocks) * n_boot msms! The good things is that once you have figured out for your system (and similar systems) what is a reasonable number of blocks then you can just do `blocks=[my_reasonable_number_blocks]`.

In [None]:
%%time
import src.analysis as spc

blocks = [2, 4, 8, 16, 32]
n_blocks = len(blocks)
n_boot = 100
if calculate_FES_errors:
    errors = spc.get_error(
        cv_proj,
        clusters,
        extent,
        n_boot=n_boot,
        bandwidth=bandwidth,
        nbin=55,
        blocks=blocks,
        seed=28101990,
        n_jobs=4,
    )
    np.save(f"{path_processed}errors_{n_boot}_{n_blocks}.npy", errors)
else:
    errors = np.load(f"{path_processed}errors_{n_boot}_{n_blocks}.npy")

Choose the number of blocks that gives you a high error.

Note, `e_min` and `e_max` are chosen to remove the extremely high or low values of error that are generated due to poor sampling or high free energy. These regions of the "free error surface" are not what we care about and thus we remove them from the statistic and the visualization.

In [None]:
# Mean FES error (over grid cells where the FES is defined) against the
# number of blocks, one curve per saved bootstrap run.
fig, ax = plt.subplots(1, 1)
block_counts = np.array(blocks)
fes_undefined = ~np.isfinite(F)
# sorted(): glob order is arbitrary, which made curve order/colours unstable.
for error_file in sorted(glob.glob(path_processed + "errors_*.npy")):
    # A separate name is used so the `errors` array of the active run is not
    # overwritten by whichever file happens to be read last.
    run_errors = np.load(error_file)
    run_errors[:, fes_undefined] = np.nan
    label = f"n_boot={error_file.split('/')[-1].split('_')[1]}"
    mean = np.nanmean(run_errors, axis=(1, 2))
    # Standard error of that mean: divide by the number of grid cells that
    # were averaged for each block count (previously the number of block
    # settings, errors.shape[0], was used).
    n_cells = np.sum(np.isfinite(run_errors), axis=(1, 2))
    std_err = np.nanstd(run_errors, axis=(1, 2)) / np.sqrt(n_cells)
    ax.plot(block_counts, mean, marker="o", label=label)
    ax.fill_between(block_counts, mean + std_err, mean - std_err, alpha=0.3)
ax.legend()
ax.set_xlabel("Number of blocks", size=15)
ax.set_ylabel("FES error (kT)", size=15)

From the previous plot you can see which number of blocks is adequate: one that is low but still gives you the plateauing (or highest) error.

In [None]:
# Side-by-side FES and bootstrap uncertainty for the chosen block count.
number_blocks = 16
# NOTE(review): the bootstrap cell runs with n_boot = 100, but this loads the
# file written for n_boot = 150 — confirm that an earlier run is intended.
n_boot = 150
n_blocks = len(blocks)
errors = np.load(f"{path_processed}errors_{n_boot}_{n_blocks}.npy")
f_max = 25

# Error map for the chosen block count, blanked where the FES is undefined.
e = errors[blocks.index(number_blocks)].copy()
e[~np.isfinite(F)] = np.nan

fig, ax = plt.subplots(1, 2, figsize=(10 * 2, 7), sharex=True, sharey=True)
fes_ax, error_ax = ax
_ = spc.plot_2D_heatmap(
    F,
    extent,
    fig=fig,
    ax=fes_ax,
    f_min=0,
    f_max=f_max,
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    cbar_label="Free Energy (kT)",
)
_ = spc.plot_2D_heatmap(
    e,
    extent,
    fig=fig,
    ax=error_ax,
    f_min=0,
    f_max=4,
    xlabel="SF (nm)",
    cbar_label="FES Uncertainty (kT)",
    cmap=plt.cm.viridis_r,
)
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=error_ax, color="r")
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=fes_ax)
error_ax.set_title("Bootstrap Error (95%)")
fig.tight_layout()
fig.savefig(f"{path_report}FES_error.png")

# Project H2O behind SF on FES

## Sum of all the H2O

In [None]:
# Load the stored water counts from the interim data folder.
H2O_behind_SF = np.load(f"{path_interim}H2O_behind_SF.npy")

In [None]:
# Sum over axis 1 to get the total count; keepdims keeps that axis (length 1)
# so the `[start:, 0]` indexing in the next cell still applies.
H2O_behind_SF = np.sum(H2O_behind_SF, axis=1, keepdims=True)

In [None]:
# Project the total water count onto the projection CVs, or reload the
# cached projection.
h2o_proj_file = f"{path_processed}H2O_behind_SF_proj.npy"
if not calculate_H2O_behind_SF:
    H2O_behind_SF_proj = np.load(h2o_proj_file)
else:
    # For "C2I_v1/" the first 300 iterations' worth of rows are skipped.
    start = (
        300 * beads_per_iter * swarms_per_bead * steps_per_swarm
        if name_sim == "C2I_v1/"
        else 0
    )
    H2O_behind_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=H2O_behind_SF[start:, 0], bandwidth=bandwidth
    )
    np.save(h2o_proj_file, H2O_behind_SF_proj)

In [None]:
# Heat map of the projected water count with FES contours (c_density=F) and
# the XRD reference values.
fig, ax = plt.subplots(1, 1, figsize=(10, 7), sharex=True, sharey=True)
contour_options = dict(c_density=F, c_min=0, c_max=20, c_color="k")
_ = spc.plot_2D_heatmap(
    H2O_behind_SF_proj,
    extent,
    fig=fig,
    ax=ax,
    cbar_label="Number of H2O behind SF",
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    f_min=0,  # f_max is left at the helper's default
    cmap=plt.cm.Spectral,
    n_colors=200,
    **contour_options,
)
ax.grid(None)
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax, color="g")
fig.tight_layout()
fig.savefig(f"{path_report}projection_H2O_behind_SF.png")

## Lower H2O 

In [None]:
# Reload the per-position counts; an earlier cell replaces this array with
# its sum over axis 1.
H2O_behind_SF = np.load(f"{path_interim}H2O_behind_SF.npy")

In [None]:
# Project the lower-position water count (column 2) onto the projection CVs.
# The cache gets its own file name: every H2O section used to write to
# H2O_behind_SF_proj.npy, so they overwrote each other and, with the flag
# off, all of them loaded the same array.
if calculate_H2O_behind_SF:
    if name_sim == "C2I_v1/":
        start = 300 * beads_per_iter * swarms_per_bead * steps_per_swarm
    else:
        start = 0
    H2O_behind_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=H2O_behind_SF[start:, 2], bandwidth=bandwidth
    )
    np.save(f"{path_processed}H2O_behind_SF_lower_proj.npy", H2O_behind_SF_proj)
else:
    H2O_behind_SF_proj = np.load(f"{path_processed}H2O_behind_SF_lower_proj.npy")

In [None]:
# Heat map of the lower-position water count with FES contours and the XRD
# reference values.
fig, ax = plt.subplots(1, 1, figsize=(10, 7), sharex=True, sharey=True)
contour_options = dict(c_density=F, c_min=0, c_max=20, c_color="k")
_ = spc.plot_2D_heatmap(
    H2O_behind_SF_proj,
    extent,
    fig=fig,
    ax=ax,
    cbar_label="Number of H2O lower position",
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    f_min=0,  # f_max is left at the helper's default
    cmap=plt.cm.Spectral,
    n_colors=200,
    **contour_options,
)
ax.grid(None)
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax, color="g")
fig.tight_layout()
fig.savefig(f"{path_report}projection_H2O_lower.png")

## Middle H2O

In [None]:
# Reload the per-position counts from the interim data folder.
H2O_behind_SF = np.load(f"{path_interim}H2O_behind_SF.npy")

In [None]:
# Project the middle-position water count (column 1) onto the projection CVs.
# The cache gets its own file name: every H2O section used to write to
# H2O_behind_SF_proj.npy, so they overwrote each other and, with the flag
# off, all of them loaded the same array.
if calculate_H2O_behind_SF:
    if name_sim == "C2I_v1/":
        start = 300 * beads_per_iter * swarms_per_bead * steps_per_swarm
    else:
        start = 0
    H2O_behind_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=H2O_behind_SF[start:, 1], bandwidth=bandwidth
    )
    np.save(f"{path_processed}H2O_behind_SF_middle_proj.npy", H2O_behind_SF_proj)
else:
    H2O_behind_SF_proj = np.load(f"{path_processed}H2O_behind_SF_middle_proj.npy")

In [None]:
# Heat map of the middle-position water count with FES contours and the XRD
# reference values.
fig, ax = plt.subplots(1, 1, figsize=(10, 7), sharex=True, sharey=True)
contour_options = dict(c_density=F, c_min=0, c_max=20, c_color="k")
_ = spc.plot_2D_heatmap(
    H2O_behind_SF_proj,
    extent,
    fig=fig,
    ax=ax,
    cbar_label="Number of H2O middle position",
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    f_min=0,  # f_max is left at the helper's default
    cmap=plt.cm.Spectral,
    n_colors=200,
    **contour_options,
)
ax.grid(None)
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax, color="g")
fig.tight_layout()
fig.savefig(f"{path_report}projection_H2O_middle.png")

## Top H2O 

In [None]:
# Reload the per-position counts from the interim data folder.
H2O_behind_SF = np.load(f"{path_interim}H2O_behind_SF.npy")

In [None]:
# Project the top-position water count (column 0) onto the projection CVs.
# The cache gets its own file name: every H2O section used to write to
# H2O_behind_SF_proj.npy, so they overwrote each other and, with the flag
# off, all of them loaded the same array.
if calculate_H2O_behind_SF:
    if name_sim == "C2I_v1/":
        start = 300 * beads_per_iter * swarms_per_bead * steps_per_swarm
    else:
        start = 0
    H2O_behind_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=H2O_behind_SF[start:, 0], bandwidth=bandwidth
    )
    np.save(f"{path_processed}H2O_behind_SF_top_proj.npy", H2O_behind_SF_proj)
else:
    H2O_behind_SF_proj = np.load(f"{path_processed}H2O_behind_SF_top_proj.npy")

In [None]:
# Heat map of the top-position water count with FES contours and the XRD
# reference values.
fig, ax = plt.subplots(1, 1, figsize=(10, 7), sharex=True, sharey=True)
contour_options = dict(c_density=F, c_min=0, c_max=20, c_color="k")
_ = spc.plot_2D_heatmap(
    H2O_behind_SF_proj,
    extent,
    fig=fig,
    ax=ax,
    cbar_label="Number of H2O top position",
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    f_min=0,  # f_max is left at the helper's default
    cmap=plt.cm.Spectral,
    n_colors=200,
    **contour_options,
)
ax.grid(None)
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax, color="g")
fig.tight_layout()
fig.savefig(f"{path_report}projection_H2O_top.png")

# Find nearest structure

In [None]:
# Target (x, y) = (SF, IG) coordinates per simulation; the next cell looks up
# the sampled structure closest to each of them.
# NOTE(review): presumably basin positions read off the FES plot — confirm.
points = {
    "C2I_lb_v1/": np.array(
        [[0.55, 2.0], [0.825, 1.3], [0.825, 1.65], [0.65, 1.9], [0.81, 1.85]]
    ),
    "C2I_v1/": np.array([[0.54, 2.0], [0.815, 1.3], [0.815, 1.7]]),
    "C2I_lb_v1_amber/": np.array(
        [[0.53, 2.25], [0.89, 1.25], [0.88, 1.65], [0.88, 2.05]]
    ),
    "C2I_v1_amber/": np.array([[0.53, 2.25], [0.89, 1.3], [0.89, 1.55], [0.88, 2.05]]),
}

In [None]:
# For every target point, find the closest swarm start (frame 0) and decode
# its flat index into iteration, bead (1-based) and swarm.
samples_per_iteration = swarms_per_bead * beads_per_iter
for point in points[name_sim]:
    nearest, distance = spc.find_nearest_point(cv_proj[:, 0, :], point)
    iteration, index_in_iteration = divmod(nearest, samples_per_iteration)
    bead_index, swarm = divmod(index_in_iteration, swarms_per_bead)
    bead = bead_index + 1
    print(
        f"Nearest point to x={point[0]}, y={point[1]}, iteration={iteration+ first_iteration} , bead={bead}, swarm={swarm} distance={distance}"
    )
    print(f"{cv_proj[nearest, 0, :]}")

## Plot Restarted Trajectories in Basins

In [None]:
# Atom selections for the restart analysis: CA atoms of residue 77 (SF) and
# residue 112 (IG). Each coordinate has one list for segments PROA/PROC and
# one for PROB/PROD; the two lists are passed together to
# spc.distance_pairs_av in the next cell.
SF_txt0 = [
    "segid PROA and name CA and resid 77",
    "segid PROC and name CA and resid 77",
]
SF_txt1 = [
    "segid PROB and name CA and resid 77",
    "segid PROD and name CA and resid 77",
]
IG_txt0 = [
    "segid PROA and name CA and resid 112",
    "segid PROC and name CA and resid 112",
]
IG_txt1 = [
    "segid PROB and name CA and resid 112",
    "segid PROD and name CA and resid 112",
]

In [None]:
# Compute the SF and IG traces for every restart trajectory (natural order)
# and store them as one array.
list_restarts = spc.natural_sort(glob.glob(f"{path_raw}restarts/*"))
trajectories = []
for restart in list_restarts:
    u = mda.Universe(f"{path_raw}topology/5VKH.pdb", f"{restart}/traj_comp.xtc")
    # Frame count, printed as a quick sanity check.
    print(u.trajectory.n_frames)
    SF = spc.distance_pairs_av(u, SF_txt0, SF_txt1, verbose=False).run().results_pp
    IG = spc.distance_pairs_av(u, IG_txt0, IG_txt1, verbose=False).run().results_pp
    trajectories.append([SF.copy(), IG.copy()])
# NOTE(review): np.array needs all restarts to have the same number of frames;
# ragged input is rejected by recent NumPy versions — confirm this holds.
trajectories = np.array(trajectories)
np.save(f"{path_processed}/SF_IG_restarts.npy", trajectories)

In [None]:
# FES with the target points (black triangles) and the restarted trajectories:
# one colour per restart, with a triangle on its first frame.
fig, ax = spc.plot_2D_heatmap(
    F,
    extent,
    f_min=0,
    f_max=f_max,
    xlabel="SF (nm)",
    ylabel="IG (nm)",
    cbar_label="Free Energy (kT)",
)
ax.plot(*points[name_sim].T, ls="", marker="^", c="k", ms=15)
for i, (sf_trace, ig_trace) in enumerate(trajectories):
    colour = f"C{i}"
    # NOTE(review): the division by 10 presumably converts Å to nm — confirm.
    ax.plot(sf_trace / 10, ig_trace / 10, ls="", marker=".", c=colour, ms=8)
    ax.plot(
        sf_trace[:1] / 10, ig_trace[:1] / 10, ls="", marker="^", c=colour, ms=15
    )
# Optional zoom:
# ax.set_xlim([0.48, 1.0])
# ax.set_ylim([1.1, 2.45])
fig.tight_layout()
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax, color="purple")
fig.savefig(f"{path_report}FES_restarts.png")

# Project SF occupation (H2O and K) on FES

In [None]:
# Load the stored SF occupation array from the interim data folder.
SF_occupation = np.load(f"{path_interim}SF_occupation.npy")

In [None]:
# Row offset into the occupation data: 300 iterations' worth of rows are
# skipped for "C2I_v1/", none for the other simulations.
start = (
    300 * beads_per_iter * swarms_per_bead * steps_per_swarm
    if name_sim == "C2I_v1/"
    else 0
)

In [None]:
# Projections of the number of "W" and "K" entries in the SF occupation data,
# either reloaded from the cache or recomputed and cached.
h2o_in_sf_file = f"{path_processed}H2O_in_SF_proj.npy"
k_in_sf_file = f"{path_processed}K_in_SF_proj.npy"
if not calculate_SF_occupation:
    H2O_in_SF_proj = np.load(h2o_in_sf_file)
    K_in_SF_proj = np.load(k_in_sf_file)
else:
    SF_CV = spc.count_occurrances(SF_occupation, "W")[:, 0]
    H2O_in_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=SF_CV[start:], bandwidth=bandwidth
    )
    np.save(h2o_in_sf_file, H2O_in_SF_proj)

    SF_CV = spc.count_occurrances(SF_occupation, "K")[:, 0]
    K_in_SF_proj, extent = spc.project_property_on_cv_kde(
        cv_proj, weights=weights, proper=SF_CV[start:], bandwidth=bandwidth
    )
    np.save(k_in_sf_file, K_in_SF_proj)

In [None]:
# Two panels (H2O and K in the SF), each with FES contours and the XRD
# reference values.
fig, ax = plt.subplots(1, 2, figsize=(10, 7), sharex=True, sharey=True)
panels = (
    (H2O_in_SF_proj, "Number of H2O in SF"),
    (K_in_SF_proj, "Number of K in SF"),
)
for panel_ax, (projection, colorbar_text) in zip(ax, panels):
    _ = spc.plot_2D_heatmap(
        projection,
        extent,
        fig=fig,
        ax=panel_ax,
        cbar_label=colorbar_text,
        xlabel="SF (nm)",
        ylabel="IG (nm)",
        f_min=0,  # f_max is left at the helper's default
        cmap=plt.cm.Spectral,
        n_colors=200,
        c_density=F,
        c_min=0,
        c_max=20,
        c_color="k",
    )
for panel_ax in ax:
    panel_ax.grid(None)
for panel_ax in ax:
    spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=panel_ax, color="g")
fig.tight_layout()
fig.savefig(f"{path_report}projection_SF_totals.png")