In [None]:
%matplotlib inline

import numpy as np
from argparse import Namespace
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import phate

# Prepared embedded data

In [None]:
"""load saved trajectories data for npz file
"""
# Name of the sequence whose pre-training archive is loaded by this cell.
SEQ = "PT4"
# SEQ = "PT4_hairpin"

# load pre-training data
fnpz_data = "./data/pretraining/pretraining_{}.npz".format(SEQ)
data_npz = np.load(fnpz_data)

# Assign every array stored in the archive to a notebook-level variable of
# the same name. globals() is used (as in the other loading cells of this
# notebook) because writing through locals() is only guaranteed to work at
# module scope.
for var in data_npz.files:
    globals()[var] = data_npz[var]
    print(var, globals()[var].shape)

In [None]:
# Read both arrays of the shortest-path archive. The file is opened once
# (instead of once per array) and closed as soon as the arrays have been
# copied out of it.
# NOTE(review): allow_pickle=True executes pickled content; only use it on
# archives produced by this project.
with np.load(f'./data/graph/{SEQ}/shortestpath_knn=100.npz',allow_pickle=True) as graph_npz:
    X_j = np.array(graph_npz["X_j"], dtype=int)      # used below as an index array
    D_ij = np.array(graph_npz["D_ij"], dtype=float)  # used below as distances
print("X_j", X_j.shape)
print("D_ij", D_ij.shape)

In [None]:
# load embedding WITHOUT vida plot data
# NOTE: this rebinds SEQ, so every later cell that builds a path from SEQ
# points at the "PT4_hairpin" files once this cell has run.
SEQ = "PT4_hairpin"
fnpz_noViDa = f"./data/vida_data/noViDa-noEnergy/{SEQ}_noViDa.npz"
data_noViDa = np.load(fnpz_noViDa,allow_pickle=True)
# Bind every array of the archive to a notebook-level variable of the same
# name and report its shape.
for var in data_noViDa.files:
    globals()[var] = data_noViDa[var]
    print(var, globals()[var].shape)

In [None]:
# load ViDa embedding plot data
SEQ_embed = "PT0_0305-2258"

fnpz_data_embed = f"./data/vida_data/{SEQ_embed}.npz"
# fnpz_data_embed = f"./data/vida_data/{SEQ}_usePT4_03040216.npz"
data_npz_embed = np.load(fnpz_data_embed,allow_pickle=True)

# assign data to variables: every array in the archive becomes a
# notebook-level variable of the same name
for var in data_npz_embed.files:
    globals()[var] = data_npz_embed[var]
    print(var, globals()[var].shape)
# data_embed is never assigned explicitly in this cell; presumably it is one
# of the arrays bound by the loop above -- TODO confirm against the archive.
print("data_embed", data_embed.max(), data_embed.min(), data_embed.mean(), data_embed.std())

In [None]:
# load GSAE embedding plot data
# NOTE: this cell reuses fnpz_data_embed / data_npz_embed from the ViDa
# loading cell and rebinds whatever array names the two archives share, so
# the "active" embedding is the one whose loading cell ran last.

fnpz_data_embed = f"./data/gsae_data/helix_assoc_PT0_multrj_60epoch_py.npz"

data_npz_embed = np.load(fnpz_data_embed,allow_pickle=True)
# assign data to variables: every array in the archive becomes a
# notebook-level variable of the same name
for var in data_npz_embed.files:
    globals()[var] = data_npz_embed[var]
    print(var, globals()[var].shape)
# data_embed is presumably one of the arrays bound by the loop above -- TODO confirm.
print("data_embed", data_embed.max(), data_embed.min(), data_embed.mean(), data_embed.std())

In [None]:
# Calculate, for every unique state, the probability of being visited during
# a simulated trajectory from the initial state: the fraction of trajectories
# that pass through the state at least once.
split_id = trj_id + 1 # index for split to each trajectory
P_tot = np.zeros(len(SIMS_dict_uniq))

for i in range(len(split_id)):
    # Trajectory i spans rows [start, split_id[i]) of SIMS_dict; the first
    # trajectory starts at row 0.
    start = split_id[i-1] if i > 0 else 0
    # Column 4 is used as the index into P_tot. The set() makes a state count
    # once per trajectory however often that trajectory revisits it.
    trj = set(SIMS_dict[start:split_id[i],4].astype(int))
    P_tot[list(trj)] += 1

# Normalise by the number of trajectories that were actually counted rather
# than a hard-coded 100, so the result stays a probability for any number of
# trajectories (max(..., 1) keeps an empty split list from dividing by zero).
P_tot = P_tot / max(len(split_id), 1)

print("P_tot", P_tot.shape, P_tot.max(), P_tot.min())

# Evaluate embedding

### Metric distance

In [None]:
def metric_dist(X_j, D_ij, P_tot, z):
    """
    Weighted squared mismatch between embedding distances and reference distances.

    For every row i and every index j listed in X_j[i], the Euclidean distance
    between z[i] and z[j] is compared with the matching entry of D_ij. Both
    distance tables are first rescaled with MinMaxScaler(feature_range=(1, 3)),
    the squared differences are weighted by P_tot[i] * P_tot[j], summed, and
    divided by the number of rows.

    Parameters
    ----------
    X_j : integer index array, used to index both z and P_tot
    D_ij : reference distances, compared element-wise with the distances
        derived from z[X_j] (so presumably the same shape as X_j -- TODO confirm)
    P_tot : 1-D array of per-point weights
    z : embedding coordinates, one row per point

    Returns
    -------
    The weighted sum of squared differences divided by the number of rows.
    """
    # Give every point a singleton neighbour axis so it broadcasts against
    # the coordinates of its listed neighbours.
    anchors = z.reshape(-1, 1, z.shape[-1])
    neighbours = z[X_j]
    squared_gap = (anchors - neighbours) ** 2
    embed_dist = np.sqrt(squared_gap.sum(axis=-1))

    # normalize the distance: the same scaler object is re-fitted for each table
    rescale = MinMaxScaler(feature_range=(1, 3))
    embed_dist = rescale.fit_transform(embed_dist)
    ref_dist = rescale.fit_transform(D_ij)

    pair_weight = P_tot.reshape(-1, 1) * P_tot[X_j]
    squared_err = (embed_dist - ref_dist) ** 2
    return np.sum(pair_weight * squared_err) / len(squared_err)

In [None]:
# Distance loss of the first two PCA components and of the PHATE coordinates,
# both measured against the shortest-path graph (X_j, D_ij) loaded above.
# pca_coords / phate_coords are not assigned by name in this notebook;
# presumably one of the npz loading cells binds them -- TODO confirm.
pca_dist = metric_dist(X_j, D_ij, P_tot, pca_coords[:,:2])
phate_dist = metric_dist(X_j, D_ij, P_tot, phate_coords)
print (f'PCA distance loss: {pca_dist:.4f}')
print (f'PHATE distance loss: {phate_dist:.4f}')

### Neighboring preservation rate

In [None]:
from sklearn.neighbors import NearestNeighbors

def neighboring_preservation_rate(X, X_j, P_tot, k):
    """
    Weighted neighbouring preservation rate of an embedding.

    For every point, the k nearest neighbours found in X are compared with
    the first k reference neighbours listed in X_j for that point; the
    point's rate is the fraction of reference neighbours that were recovered.
    The per-point rates are then averaged with weights P_tot / P_tot.sum().

    Parameters
    ----------
    X : embedding coordinates, one row per point (passed to NearestNeighbors.fit)
    X_j : 2-D index array; X_j[i, :k] are the reference neighbours of point i
    P_tot : 1-D array-like of per-point weights
    k : number of neighbours to compare

    Returns
    -------
    The weighted average preservation rate.
    """
    # Ask for the neighbours of the fitted points themselves (no query
    # argument): scikit-learn then never reports a point as its own
    # neighbour. Querying with X and dropping the first column instead is
    # wrong whenever duplicate points exist, because the point itself is not
    # guaranteed to come first among zero-distance neighbours.
    nn_X = NearestNeighbors(n_neighbors=k).fit(X)
    indices_X = nn_X.kneighbors(return_distance=False)

    # compute the rate of each point
    rate_list = np.array([
        len(np.intersect1d(indices_X[i], X_j[i, :k])) / k
        for i in range(len(indices_X))
    ])

    # Compute the overall neighbouring preservation rate as a weighted average.
    P_tot = np.asarray(P_tot, dtype=float)
    rate = (rate_list * P_tot / P_tot.sum()).sum()

    return rate

In [None]:
# ViDa embedding
# NOTE: in this notebook X_j_all is only assigned in the "MDS for distance
# matrix" section further down, so that loading cell has to run before this
# one. pca_coords / phate_coords are presumably bound by the ViDa loading
# cell -- TODO confirm.
knn = 10000
print("ViDa PCA rate: {:.4f}".format(neighboring_preservation_rate(pca_coords[:,:2], X_j_all, P_tot, k=knn)))
print("ViDa PHATE rate: {:.4f}".format(neighboring_preservation_rate(phate_coords, X_j_all, P_tot, k=knn)))

In [None]:
# Pretrained ViDa embedding
# NOTE: the calls are identical to the "ViDa embedding" cell; only the label
# differs. The numbers therefore depend on which archive last bound
# pca_coords / phate_coords (presumably the pretrained one must be loaded
# first -- TODO confirm). X_j_all is assigned further down in the notebook.
knn = 10000
print("Pretrained ViDa PCA rate: {:.4f}".format(neighboring_preservation_rate(pca_coords[:,:2], X_j_all, P_tot, k=knn)))
print("Pretrained ViDa PHATE rate: {:.4f}".format(neighboring_preservation_rate(phate_coords, X_j_all, P_tot, k=knn)))

In [None]:
# GSAE embedding
# NOTE: same calls as the ViDa cells; the result reflects whichever archive
# last bound pca_coords / phate_coords (presumably the GSAE one must be
# loaded first -- TODO confirm). X_j_all is assigned further down in the notebook.
knn = 10000
print("GSAE PCA rate: {:.4f}".format(neighboring_preservation_rate(pca_coords[:,:2], X_j_all, P_tot, k=knn)))
print("GSAE PHATE rate: {:.4f}".format(neighboring_preservation_rate(phate_coords, X_j_all, P_tot, k=knn)))

In [None]:
# Direct embedding
# NOTE: pca_coords_direct, phate_coords_direct and mds_coords are computed by
# cells further down in this notebook (or presumably restored from the noViDa
# archive -- TODO confirm), and X_j_all is assigned in the "MDS for distance
# matrix" section, so those cells have to run before this one.
knn = 10000
print("Direct PCA rate: {:.4f}".format(neighboring_preservation_rate(pca_coords_direct[:,:2], X_j_all, P_tot, k=knn)))
print("Direct PHATE rate: {:.4f}".format(neighboring_preservation_rate(phate_coords_direct, X_j_all, P_tot, k=knn)))
print("MDS with distance rate: {:.4f}".format(neighboring_preservation_rate(mds_coords, X_j_all, P_tot, k=knn)))

In [None]:
# D_ij_all = []
# X_j_all = []

# for i in range(len(SIMS_HT_uniq)):
#     dij = np.array(list(np.load(f'./data/graph/{SEQ}/allpath_{SEQ}/path_{i}.npy',allow_pickle=True)[0].values()), dtype=float)
#     xj = np.array(list(np.load(f'./data/graph/{SEQ}/allpath_{SEQ}/path_{i}.npy',allow_pickle=True)[0].keys()), dtype=int)
    
#     D_ij_all.append(dij)
#     X_j_all.append(xj)

# D_ij_all = np.stack(D_ij_all)
# X_j_all = np.stack(X_j_all)

# # save npz file for shortest path
# with open(f'./data/graph/{SEQ}_allpath.npz', 'wb') as f:
#     np.savez(f,
#              X_j_all = np.stack(X_j_all),
#              D_ij_all = np.stack(D_ij_all),
#          )

### PCA explained variance

In [None]:
# Fit a 25-component PCA on data_embed; fit() returns the fitted estimator.
cm = PCA(n_components=25).fit(data_embed)

# Component numbers 1..n for the x axis.
PC_values = np.arange(1, cm.n_components_ + 1)
plt.plot(
    PC_values,
    np.cumsum(cm.explained_variance_ratio_),
    'ro-',
    linewidth=2,
)
plt.title('Scree Plot: PCA')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
# plt.xticks(np.arange(0, data_embed.shape[-1]+1, 1))
plt.show()

# Same cumulative explained-variance values as plotted, in text form.
print(np.cumsum(cm.explained_variance_ratio_))

### MDS for distance matrix

In [None]:
# PT4_hairpin all path
# The archive is opened once (instead of once per array) and closed as soon
# as both arrays have been copied out of it.
# NOTE(review): allow_pickle=True executes pickled content; only use it on
# archives produced by this project.
with np.load(f'./data/graph/{SEQ}_allpath.npz',allow_pickle=True) as allpath_npz:
    X_j_all = np.array(allpath_npz["X_j_all"])
    D_ij_all = np.array(allpath_npz["D_ij_all"])
print("X_j_all", X_j_all.shape)
print("D_ij_all", D_ij_all.shape)

In [None]:
# Build the dense square distance matrix handed to MDS as a precomputed
# dissimilarity: every entry starts at 1, then row i is overwritten at the
# columns X_j_all[i] with the distances D_ij_all[i].
MDS_dist = np.full((len(D_ij_all), len(D_ij_all)), 1.0)
for i, row_dist in enumerate(D_ij_all):
    MDS_dist[i, X_j_all[i]] = row_dist

In [None]:
# Sanity check: echo the shape of the dense distance matrix.
MDS_dist.shape

In [None]:
# not necessary if the distance matrix includes all the neighbors
def makeSymmetric(mat):
    """
    Symmetrise a square matrix in place.

    Every off-diagonal pair (mat[i][j], mat[j][i]) is replaced by the smaller
    of the two values; the diagonal is left untouched.

    Parameters
    ----------
    mat : square 2-D numpy array, or any nested sequence indexable as mat[i][j]

    Returns
    -------
    The same object that was passed in, now symmetric.
    """
    if isinstance(mat, np.ndarray) and mat.ndim == 2 and mat.shape[0] == mat.shape[1]:
        # Vectorised path: the element-wise minimum with the transpose is the
        # same pairwise min as the explicit double loop, without one Python
        # iteration per matrix entry. Writing through mat[...] keeps the
        # update in place, which callers holding a reference to mat rely on.
        # (Unlike Python's min(), np.minimum propagates NaN.)
        mat[...] = np.minimum(mat, mat.T)
        return mat

    # Generic fallback (nested lists etc.): visit the lower triangle only.
    for i in range(len(mat)):
        for j in range(i):
            mat[i][j] = mat[j][i] = min(mat[i][j], mat[j][i])
    return mat

In [None]:
# makeSymmetric modifies its argument in place and returns it, so
# MDS_dist_symm is the same object as MDS_dist (which is symmetric as well
# after this call).
MDS_dist_symm = makeSymmetric(MDS_dist)
MDS_dist_symm.shape

In [None]:
# %%script false --no-raise-error

from sklearn.manifold import MDS

# 2-D metric MDS on the precomputed distance matrix. random_state is fixed so
# that re-running the notebook reproduces the same layout; without it every
# run starts from a different random initialisation.
# MDS_dist is passed directly: if the symmetrising cell above was run it has
# already been made symmetric in place.
mds = MDS(n_components=2, dissimilarity='precomputed', random_state=0)
mds_coords = mds.fit_transform(MDS_dist)

In [None]:
# # save for python
# pca_all_coords_direct = pca_coords_direct[coord_id_S]  
# phate_all_coords_direct = phate_coords_direct[coord_id_S]
# mds_all_coords = mds_coords[coord_id_S]

# fnpz_data = f"data/vida_data/noViDa-noEnergy/{SEQ}_noViDa.npz"
# with open(fnpz_data, 'wb') as f:
#     np.savez(f,
#             pca_coords_direct=pca_coords_direct, pca_all_coords_direct=pca_all_coords_direct,
#             phate_coords_direct=phate_coords_direct, phate_all_coords_direct=phate_all_coords_direct,
#             mds_coords=mds_coords, mds_all_coords=mds_all_coords,
#             )

# Visualize

In [None]:
# Echo the active sequence name: SEQ is assigned by more than one cell above,
# so check which data set the plots below refer to.
SEQ

### PCA Vis

In [None]:
%matplotlib inline
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]

# PCA: 2 components
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

annotations=["I","F"]
x = [X[0],X[int(SIMS_dict[-1,-1])]]
y = [Y[0],Y[int(SIMS_dict[-1,-1])]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=20,c="black")

#### Try use PCA directly without AE

In [None]:
# %%script false --no-raise-error

# PCA applied directly to SIMS_scar_uniq, without the autoencoder embedding.
pca_coords_direct = PCA(n_components=3).fit_transform(SIMS_scar_uniq)   # multiple trj

X = pca_coords_direct[:,0]
Y = pca_coords_direct[:,1]
Z = pca_coords_direct[:,2]

# PCA: 2 components
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

# Mark the "I" and "F" rows. "F" is looked up through SIMS_dict[-1,-1], the
# same index the other scatter plots of this notebook use; the last row
# (X[-1]) is only the same point when that index happens to be the last one.
annotations=["I","F"]
x = [X[0],X[int(SIMS_dict[-1,-1])]]
y = [Y[0],Y[int(SIMS_dict[-1,-1])]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    # small offset so the label does not sit on top of the marker
    plt.annotate(label, (x[i]-0.3,y[i]-0.3),fontsize=20,c="black")

In [None]:
# Fit a 25-component PCA on SIMS_scar_uniq; fit() returns the fitted estimator.
cm = PCA(n_components=25).fit(SIMS_scar_uniq)

# Component numbers 1..n for the x axis.
PC_values = np.arange(1, cm.n_components_ + 1)
plt.plot(
    PC_values,
    np.cumsum(cm.explained_variance_ratio_),
    'ro-',
    linewidth=2,
)
plt.title('Scree Plot: PCA')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
# plt.xticks(np.arange(0, data_embed.shape[-1]+1, 1))
plt.show()

# Same cumulative explained-variance values as plotted, in text form.
print(np.cumsum(cm.explained_variance_ratio_))

### PHATE Vis

In [None]:
# First two PHATE coordinates, one array per axis.
X_phate, Y_phate = phate_coords[:,0], phate_coords[:,1]

# Scatter of the PHATE coordinates, coloured by SIMS_G_uniq.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.scatter(X_phate, Y_phate, c=SIMS_G_uniq, cmap='plasma')
plt.colorbar(im)

# Mark two rows with labels "I" and "F": row 0 and the row whose index is
# stored in the last entry of SIMS_dict.
annotations = ["I", "F"]
x = [X_phate[idx] for idx in (0, int(SIMS_dict[-1,-1]))]
y = [Y_phate[idx] for idx in (0, int(SIMS_dict[-1,-1]))]
plt.scatter(x, y, s=50, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i], y[i]), fontsize=20, c="black")

#### PHATE without AE

In [None]:
# %%script false --no-raise-error

# PHATE applied directly to SIMS_scar_uniq, without the autoencoder embedding.
phate_operator = phate.PHATE(n_jobs=-2)
phate_coords_direct = phate_operator.fit_transform(SIMS_scar_uniq)

fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(phate_coords_direct[:,0],
          phate_coords_direct[:,1],
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

# Mark the "I" and "F" rows. "F" is looked up through SIMS_dict[-1,-1], the
# same index the other scatter plots of this notebook use; the last row is
# only the same point when that index happens to be the last one.
annotations=["I","F"]
x = [phate_coords_direct[:,0][0],phate_coords_direct[:,0][int(SIMS_dict[-1,-1])]]
y = [phate_coords_direct[:,1][0],phate_coords_direct[:,1][int(SIMS_dict[-1,-1])]]
plt.scatter(x,y,s=50, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=20,c="black")

### MDS with distance matrix

In [None]:
# Scatter of the 2-D MDS coordinates, coloured by SIMS_G_uniq.
X = mds_coords[:,0]
Y = mds_coords[:,1]
cmap = plt.cm.plasma
# plt.cm.get_cmap() was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap() returns the same colormap on old and new versions.
cmap_r = plt.get_cmap('plasma_r')

fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c = SIMS_G_uniq,
          cmap=cmap,
          s=10
        )
 
plt.colorbar(im)

# Mark two rows with labels "I" and "F": row 0 and the row whose index is
# stored in the last entry of SIMS_dict.
annotations=["I","F"]
x = [X[0],X[int(SIMS_dict[-1,-1])]]
y = [Y[0],Y[int(SIMS_dict[-1,-1])]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=20,c="black")