# String Method Analysis Markov-State-Models
## Imports

In [None]:
import os
import pickle
import sys
import logging
import numpy as np
import matplotlib.pyplot as plt
import glob
from math import ceil
from tqdm import tqdm

sys.path.append("../../InfleCS")
import free_energy_clustering as FEC


logging.getLogger("stringmethod").setLevel(logging.ERROR)
sys.path.append("../string-method-gmxapi/")
import src.analysis as spc

In [None]:
# Add a second candidate location for the InfleCS package to the import path.
# NOTE(review): `free_energy_clustering` is already imported in the imports cell,
# so this presumably only matters when the notebook is started from a deeper
# directory — confirm it is still needed.
sys.path.append("../../../../InfleCS")

In [None]:
%load_ext lab_black
%load_ext autoreload
%autoreload 2

## Control Variables

In [None]:
# Run-time switches for the notebook.
# extract_data: re-read the swarm data (True) or load the cached array (False).
extract_data = True
# The FES error calculation stays enabled unless the CALC_FES environment
# variable is exactly the string "True".
# NOTE(review): CALC_FES == "True" switches the flag *off*, which looks
# inverted — confirm that this is the intended meaning.
calculate_FES_errors = os.getenv("CALC_FES") != "True"

## Load data

This notebook needs to run in the string simulation folder; the next cell will take you there. It also sets up the paths for the interim and processed data and for writing the figures.

In [None]:
# Pick the simulation to analyse: the NAME_SIM environment variable wins,
# otherwise fall back to the default below.
# Other simulation names that were listed here: "C2I_v1_amber/",
# "C2I_lb_v1_amber/", "C2I_v1/".
name_sim = os.getenv("NAME_SIM")
if name_sim is None:
    name_sim = "C2I_lb_v1/"
# Later cells glue file names onto these paths by plain string concatenation
# (e.g. f"{path_interim}cv_coordinates.npy"), so the name must end with a path
# separator. Joining with "" appends one only when it is missing.
name_sim = os.path.join(name_sim, "")

# Project directories for this simulation (all under ~/Projects/string_sims).
path_raw = os.path.expanduser(f"~/Projects/string_sims/data/raw/{name_sim}")
path_interim = os.path.expanduser(f"~/Projects/string_sims/data/interim/{name_sim}")
path_processed = os.path.expanduser(f"~/Projects/string_sims/data/processed/{name_sim}")
path_XRD = os.path.expanduser("~/Projects/string_sims/models/raw_pdb/")
path_report = os.path.expanduser(f"~/Projects/string_sims/reports/figures/{name_sim}")

# The rest of the notebook uses paths relative to the raw simulation folder.
os.chdir(path_raw)
os.getcwd()

In [None]:
# Read the pickled (cvs, ndx_groups) pair from the simulation folder.
# Only unpickle files that come from a trusted source.
with open("cv.pkl", mode="rb") as file:
    cv_payload = pickle.load(file)
cvs, ndx_groups = cv_payload

The `load_swarm_data` function loads the swarm data into `cv_coordinates`. If you set `extract=True` it will read the data from the swarm files. If you have done this previously you can set `extract=False` so that the function just reads `postprocessing/cv_coordinates.npy`. `first_iteration` can be used to exclude the initial swarms as equilibration, and `last_iteration` can be used to exclude later iterations, for example if you want to estimate the FES convergence by comparing blocks of data.

In [None]:
# The CV array is cached in the interim-data folder between runs.
cv_coordinates_file = f"{path_interim}cv_coordinates.npy"
if not extract_data:
    # Reuse the array written by a previous extraction.
    cv_coordinates = np.load(cv_coordinates_file)
else:
    # Extract iterations 100-200 from the swarm files and refresh the cache.
    cv_coordinates = spc.load_swarm_data(
        extract=True,
        first_iteration=100,
        last_iteration=200,
    )
    np.save(cv_coordinates_file, cv_coordinates)

In [None]:
# Collect the string files in natural (numeric) order and stack the
# transposed contents of each file into a single array.
string_file_pattern = "./strings/string[0-9]*txt"
files = spc.natural_sort(glob.glob(string_file_pattern))
strings = np.array([np.loadtxt(string_file).T for string_file in files])

Load pickle with data from XRD

In [None]:
# Load the pickled dictionary with the data from XRD.
# NOTE(review): the pickle is opened through a hard-coded absolute path, so the
# chdir to path_XRD does not change which file is read — confirm whether the
# path should be built from the project directories instead.
os.chdir(path_XRD)
try:
    with open(
        "/data/sperez/Projects/string_sims/data/processed/XRD/XRD.pickle", "rb"
    ) as handle:
        XRD_dictionary = pickle.load(handle)
finally:
    # Always return to the simulation folder, even when loading fails, so the
    # relative paths used by the other cells keep resolving correctly.
    os.chdir(path_raw)

# MSM modelling of free energy surface

## Dimensionality reduction with TICA

The following cell computes the TICA projection of the string cvs and discards the tics that have the lowest kinetic variance. This reduces the cv space to a lower-dimensional space that is adapted to the kinetic variance. You can use the `drop` keyword to drop certain cvs that are not well converged in the string simulation or that change very little from the beginning to the end of the string. The best-case scenario is that `drop=[]` just works.

In [None]:
# Indices of the cvs that are left out of the TICA projection.
dropped_cvs = [20, 21, 22, 23, 32, 33, 34, 35]
tica = spc.cvs_to_tica(cv_coordinates, drop=dropped_cvs)

## Cluster

The next cell plots the "vamp score" of using `n_clustercenters` to make an MSM. You should find that at some point the vamp score saturates. Choose the minimum number of clusters that gives you the saturated vamp score as the value of k for the next steps. This might take a little while.

In [None]:
# Manual toggle: change the condition to False to skip the VAMP-score scan.
if True:
    # Cluster counts to try, largest first.
    n_clustercenters = [500, 200, 100, 75, 50, 30, 10, 5]
    # Original author's timing note: 6 min.
    fig, ax, vamp_scores = spc.get_vamp_vs_k(
        n_clustercenters,
        tica,
        n_jobs=4,
        allow_failed_msms=True,
    )
    # Keep the scores in the interim-data folder.
    np.save(f"{path_interim}vamp_scores.npy", vamp_scores)

If the calculation fails, there is something wrong with your MSM. Either you have too few transitions or there are too many cvs in TICA for all the states to be well connected. Solutions:
+ Reduce the maximum number of clusters (drop 200 and 500) of `n_clustercenters` and see if you get a saturated curve.
+ Reduce the number of cvs that went into your TICA calculation.
+ Do more iterations of the string method.

## MSM Deeptime

Choose the number of clusters, `k`, for the clustering from the previous calculation. Also change `n_jobs` to however many processors you can use.

In [None]:
# Number of cluster centers, chosen from the VAMP-score scan.
k = 100
# Fixed seed so that the clustering is reproducible between runs.
kmeans_settings = {"stride": 1, "max_iter": 500, "n_jobs": 4, "seed": 28101990}
clusters = spc.k_means_cluster(tica, k, **kmeans_settings)

In [None]:
%%time
# Build the MSM from the cluster assignments. `weights` is used further down as
# the weights of the KDE projection and as `data_weights` for the clustering.
msm, weights = spc.get_msm(clusters, n_jobs=4)

# FES projection on IG vs SF

## CVs for projection

Make a `cv_proj` numpy array with shape (n_iteration * n_swarms_iterations, n_frames_per_iter, 2). n_frames_per_iter is usually 2 since you only record the value of the cvs at the beginning and end of the swarm. The last dimension holds the cvs on which you would like to project your FES using the weights obtained from the msm. The FES is then the negative log of a *weighted* histogram of the projection cvs using the weights from the msm. The projection cvs can be anything that you can calculate for a structure, not necessarily the cvs of the string. In the example below it is the mean of two cvs.

In [None]:
# Index pairs of the string cvs that are combined into the two projection cvs.
# NOTE(review): presumably the first pair gives SF and the second IG — confirm
# against spc.cvs_to_SF_IG.
first_cv_pair = [0, 1]
second_cv_pair = [10, 11]
cv_proj = spc.cvs_to_SF_IG(cv_coordinates, first_cv_pair, second_cv_pair)
# Cache the projection in the interim-data folder.
np.save(f"{path_interim}cv_proj.npy", cv_proj)

## Project FES

Do the projection and take log. You have to choose a bandwidth for the [KDE](https://en.wikipedia.org/wiki/Kernel_density_estimation) of the histogram. It should be big enough to reduce noise but not so big to remove features. If you give `None`

In [None]:
%%time
# Weighted KDE of the projection cvs, using the MSM weights.
bandwidth = 0.05
p_of_cv, extent = spc.get_kde(cv_proj, weights, bandwidth)
# Free energy as the negative log of the density, shifted so the minimum is 0.
F0 = -np.log(p_of_cv)
F = F0 - F0.min()
# Blank out everything above the cutoff of 40.
above_cutoff = F > 40
F[above_cutoff] = np.nan

Save the projected free energy surface to the processed-data folder so it can be reused without recomputing it.

In [None]:
# Store the projected free energy surface F in the processed-data folder.
np.save(f"{path_processed}FES_SF_IG.npy", F)

## Plot FES

In [None]:
# F is -log(p), i.e. in units of kT; it is rescaled by 0.593 before plotting,
# so the colour bar must not be labelled "kT".
# NOTE(review): 0.593 is presumably kT in kcal/mol near 298 K — confirm the
# temperature of the simulation.
kt_in_kcal_per_mol = 0.593
fig, ax = spc.plot_2D_heatmap(
    F * kt_in_kcal_per_mol,
    extent,
    f_max=20,
    f_min=0,
    cbar_label="Free Energy (kcal/mol)",
    xlabel="SF (nm)",
    ylabel="IG (nm)",
)
# Optional zoom on the region of interest:
# ax.set_xlim([0.48, 1.0])
# ax.set_ylim([1.1, 2.45])
# Overlay the XRD values on the same SF/IG axes.
spc.add_XRD_values(XRD_dictionary, "SF", "IG", size=15, ax=ax)
fig.tight_layout()
fig.savefig(path_report + "FES.png")

In [None]:
# Flatten cv_proj to a 2-D array with one row per recorded frame and one
# column per projection cv. The column count must come from the LAST axis (the
# cvs); axis 1 is the number of frames per swarm and only matches the number of
# cvs by coincidence when both are 2.
fec_data = cv_proj.reshape(-1, cv_proj.shape[-1])
# Set up the InfleCS free-energy clustering on the projection cvs, weighting
# the points with the MSM weights.
fec = FEC.FreeEnergyClustering(
    fec_data,
    min_n_components=2,
    max_n_components=10,
    temperature=300.0,
    n_iterations=5,
    n_grids=55,
    n_splits=1,
    stack_landscapes=False,
    data_weights=weights,
)

In [None]:
# Estimate the free energy landscape. `coords` and `FE_points` are passed on to
# fec.cluster() in a later cell.
coords, FE_landscape, FE_points = fec.landscape()

In [None]:
# Show the estimated landscape in the notebook only (nothing is written to
# disk because savefig is False).
landscape_plot_options = {
    "savefig": False,
    "show_data": False,
    "vmax": 20,
    "xlabel": "X",
    "ylabel": "Y",
    "filename": "free_energy_landscape",
    "title": "Free energy landscape",
}
fec.visualize(**landscape_plot_options)

In [None]:
# Cluster the points on the estimated landscape. The data is flattened to one
# row per frame and one column per projection cv; the column count is taken
# from the last axis (the cvs), not from axis 1 (frames per swarm).
labels, cluster_centers = fec.cluster(
    coords,
    FE_points,
    cv_proj.reshape(-1, cv_proj.shape[-1]),
    assign_transition_points=False,
)
print(f"Cluster center indices: {cluster_centers}")

# Computing state populations
state_populations = fec.population_states(n_sampled_points=100000)

In [None]:
# Visualize free energy landscape with cluster labels
# Visualize free energy landscape with cluster labels
# NOTE(review): pathways_ is reset before plotting, presumably so that
# visualize() does not try to draw pathways — confirm.
fec.pathways_ = None
fec.visualize(
    savefig=False,
    vmax=7,
    show_data=True,
    xlabel="X",
    ylabel="Y",
    filename="InfleCS_clustering",
    title="InfleCS clustering",
)

# Plotting the state populations
# State 0 is left out of the plot, but the probabilities are still normalised
# by the total over all states, including state 0.
state_ids = np.arange(1, state_populations.shape[0])
state_probabilities = state_populations[1:] / state_populations.sum()
plt.figure(figsize=[15, 5])
plt.plot(
    state_ids,
    state_probabilities,
    linewidth=5,
    color=[0.7, 0.7, 0.7],
    zorder=-1,
)
plt.scatter(
    state_ids,
    state_probabilities,
    s=500,
    c=state_ids,
    cmap="jet",
    edgecolor="k",
)
plt.title("State populations", fontsize=30)
plt.xlabel("State", fontsize=28)
plt.ylabel("Probability", fontsize=28)
plt.xticks(state_ids)