In [None]:
%matplotlib inline

import numpy as np
import pickle
import json
from argparse import Namespace
import plotly.express as px
import matplotlib.pyplot as plt
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import networkx as nx
from pyvis.network import Network

from sklearn.decomposition import PCA
import phate
from umap import UMAP
from sklearn.manifold import TSNE

# Load simulated data

In [None]:
# Load the saved multi-trajectory simulation data from an npz archive.
SEQ = "PT4"
# SEQ = "PT4_hairpin"

# multiple trajectories: the archive name depends on the sequence
if SEQ in ["PT3", "PT4", "PT3_hairpin"]:
    fnpz_data = "data/vida_data/helix_assoc/helix_assoc_{}_multrj_100epoch_py.npz".format(SEQ)
elif SEQ in ["PT0", "PT4_hairpin"]:
    fnpz_data = "data/vida_data/helix_assoc/helix_assoc_{}_multrj_60epoch_py.npz".format(SEQ)
else:
    # Fail early with a clear message instead of a NameError on fnpz_data below.
    raise ValueError(f"Unknown SEQ {SEQ!r}: no npz data file is configured for it")

data_npz = np.load(fnpz_data)

# Assign every array stored in the archive to a notebook-level variable of the
# same name. globals() is used explicitly: writing through locals() is only
# guaranteed to work at module level and is easy to break when refactoring.
for var in data_npz.files:
    globals()[var] = data_npz[var]

# recover full data based on coord_id, indices, and unique data
SIMS_adj = SIMS_adj_uniq[coord_id_S]
SIMS_scar = SIMS_scar_uniq[coord_id_S]
SIMS_G = SIMS_G_uniq[coord_id_S]
SIMS_pair = SIMS_pair_uniq[coord_id_S]

# Shape summary of everything that was loaded / reconstructed
print(SIMS_T.shape,SIMS_HT.shape,SIMS_HT_uniq.shape)
print(SIMS_adj.shape,SIMS_scar.shape,SIMS_G.shape,SIMS_HT.shape,SIMS_pair.shape)
print(SIMS_adj_uniq.shape,SIMS_scar_uniq.shape,SIMS_G_uniq.shape,SIMS_pair_uniq.shape)
print(SIMS_dict.shape,SIMS_dict_uniq.shape)
print(coord_id_S.shape,indices_S.shape,trj_id.shape,data_embed.shape,occ_density_S.shape)
print(pca_coords.shape,pca_all_coords.shape)
print(phate_coords.shape,phate_all_coords.shape)
print(umap_coord_2d.shape,umap_all_coord_2d.shape,umap_coord_3d.shape,umap_all_coord_3d.shape)
print(tsne_coord_2d.shape,tsne_all_coord_2d.shape,tsne_coord_3d.shape,tsne_all_coord_3d.shape)

In [None]:
# Count how many entries of SIMS_HT are exactly zero: `unique` holds the
# boolean values found in (SIMS_HT == 0) and `counts` their frequencies.
unique, counts = np.unique(SIMS_HT == 0, return_counts=True)
print(unique, counts)

print(SIMS_HT.shape)

# Construct graph

### Collect graph information

In [None]:
%%script false --no-raise-error

# calculate the expected holding time for each node
# all_graph_info maps node index -> (mean holding time, first field of SIMS_dict_uniq[i])
all_graph_info = dict()

for i in range(SIMS_dict_uniq.shape[0]):
    # rows of SIMS_dict whose last column equals this node index (compared as a string)
    idx = np.where(SIMS_dict[:,-1] == str(i))
    # mean of the holding times over all visits to node i
    expected_time = SIMS_HT[idx].mean()
    all_graph_info[i] = expected_time, SIMS_dict_uniq[i][0]
    

In [None]:
# One-off export of the graph info (kept for reference):
# import json

# with open('all_graph_info.json', 'w') as f:
#     json.dump(all_graph_info, f)


# Load the cached graph info. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# Note: after the JSON round trip the dictionary keys are strings.
with open('all_graph_info.json') as f:
    all_graph_info = json.load(f)

In [None]:
len(all_graph_info)

### Construct weights (expected holding time)

In [None]:
# Extract the first element of every all_graph_info value (the expected
# holding time stored by the cell that builds all_graph_info) into an array.
# Later cells index avg_time by node id, so this presumably relies on the
# dictionary being ordered by node id — TODO confirm.
avg_time = np.array(list(all_graph_info.values()),dtype=object)[:,0]
# max, smallest non-zero value, and shape of the holding-time array
print(avg_time.max(), avg_time[avg_time!=0].min(), avg_time.shape)

### Construct edges

In [None]:
# Node index of every simulated step, read from the last column of SIMS_dict
all_nodes = np.array(SIMS_dict[:,-1], dtype=int)
print(all_nodes.shape, all_nodes, all_nodes.max())
# First and last entries of the concatenated trajectories
print("Initial node:", all_nodes[0], "  Final node:", all_nodes[-1])

In [None]:
# Build one edge per pair of consecutive entries in all_nodes.
# Note: because all_nodes concatenates several trajectories, this step also
# connects the final node of one trajectory to the initial node of the next
# (291->0); those edges are removed in a later cell.
all_edges_temp = []
for previous, current in zip(all_nodes, all_nodes[1:]):
    all_edges_temp.append((previous, current))

In [None]:
len(all_edges_temp)

In [None]:
# Remove every edge equal to (291, 0), i.e. the final->initial transitions.
# NOTE(review): 291 and 0 are hard-coded; they presumably match
# all_nodes[-1] / all_nodes[0] for SEQ = "PT4" only — verify before changing SEQ.
all_edges = list(filter((291,0).__ne__, all_edges_temp))

In [None]:
# sanity check: no (291, 0) edge should be left after filtering
if (291,0) in all_edges:
    print("yes")
else:
    print("There are no edges from node 291 to node 0")

# kept edges, original edges, and how many edges were removed
len(all_edges), len(all_edges_temp), len(all_edges_temp) - len(all_edges)

### Construct modified undirected weighted graph

In [None]:
# Undirected graph whose edge weights come from the expected holding times.
MUG = nx.Graph()

for i, (idx0, idx1) in enumerate(all_edges):
    if avg_time[idx0] == 0 or avg_time[idx1] == 0:
        # at least one endpoint has zero holding time: use the sum of both
        weight = avg_time[idx0] + avg_time[idx1]
    elif avg_time[idx0] < avg_time[idx1]:
        # otherwise the weight is the smaller of the two holding times
        weight = avg_time[idx0]
    else:
        weight = avg_time[idx1]

    # plain Python int / float so that node ids and weights are not numpy scalars
    MUG.add_edge(int(idx0), int(idx1), weight = float(weight))

#### Sanity check

In [None]:
# Collect every edge weight of MUG and print any edge whose weight is 0,
# since a zero-weight edge makes a shortest-path length of 0 possible.
UDD = []
for i, w in enumerate(np.asarray((list(MUG.edges.data())))[:,2]):
    # w is the attribute dict of the i-th edge
    UDD.append(w['weight'])
    if w['weight'] == 0:
        print(np.asarray(list(MUG.edges.data()))[i])

UDD = np.asarray(UDD)
# number of edges, largest and smallest edge weight
print(len(UDD), UDD.max(), UDD.min())

# Spot check: shortest-path length between two nodes, queried in both
# directions on the undirected graph.
m = 35
n = 300
# m = 289
# n = 291
print('\n',nx.dijkstra_path_length(MUG, m, n),  nx.dijkstra_path_length(MUG, n, m))


In [None]:
all_edges[2][0], all_edges[2][1], avg_time[all_edges[2][0]], avg_time[all_edges[2][1]]

In [None]:
MUG.get_edge_data(0,1), MUG.get_edge_data(1,0)

### Collect and save all pair shortest paths

In [None]:
%%script false --no-raise-error

# Collect the shortest-path lengths from every node and save them in one
# .npy file per source node (./data/shortest_path/path_{i}.npy).
# The loop assumes node ids are the integers 0..len(MUG.nodes())-1 — TODO confirm.

for i in range(len(MUG.nodes())):
    path_list = []

    # dict of {target node: shortest-path length} from source node i
    path_list.append(nx.single_source_dijkstra_path_length(MUG, i))

    # stored as a one-element list, so readers access it with [0]
    np.save(f'./data/shortest_path/path_{i}.npy', path_list)

print("Finish saving all shortest paths for each node in individual npz file")

### Cutoff (currently not used)

In [None]:
# Distance cutoff defined as a fraction (alpha) of the largest expected holding time
alpha = 0.1
cutoff = alpha * avg_time.max()
cutoff

In [None]:
# Collect, for each node, the nodes whose shortest-path distance is below the cutoff.
# shortestpath_dict maps node i -> (array of node ids, array of distances).
shortestpath_dict = dict()

for i in range(len(avg_time)):
    # The graph built in this notebook is MUG; the previously referenced `DG`
    # is not defined anywhere and raised a NameError on a fresh kernel.
    length = nx.single_source_dijkstra_path_length(MUG, i)
    length_arr = np.array(list(length.items()), dtype=object)

    # find the shortest path that is less than cutoff
    # and get the index of the node
    pos = np.where(length_arr[:,1] < cutoff)
    Xj = length_arr[pos][:,0]
    dij = length_arr[pos][:,1]
    shortestpath_dict[i] = Xj, dij

In [None]:
len(shortestpath_dict)

In [None]:
## save npy file for shortestpath_dict
# np.save('shortestpath_dict.npy', shortestpath_dict)

## load npy file for shortestpath_dict
# allow_pickle must be the boolean True (the string 'TRUE' only worked because
# it is truthy). Pickle can execute arbitrary code: only load trusted files.
shortestpath_dict = np.load('shortestpath_dict.npy', allow_pickle=True).item()

In [None]:
# Show both values as a single tuple. Written on one line because a trailing
# comma followed by a newline ends the statement, so the first value was
# silently discarded and only the shape was displayed.
shortestpath_dict[0][0], shortestpath_dict[1][1].shape

In [None]:
# Collect all shortest-path distances for each node.
# allpath_dict maps node i -> (array of node ids, array of distances).
allpath_dict = dict()

for i in range(len(avg_time)):
    # The graph built in this notebook is MUG; the previously referenced `DG`
    # is not defined anywhere and raised a NameError on a fresh kernel.
    length = dict(nx.single_source_dijkstra_path_length(MUG, i))
    length_arr = np.array(list(length.items()), dtype=object)

    # all shortest paths and index of the node
    Xj = length_arr[:,0]
    dij = length_arr[:,1]
    allpath_dict[i] = Xj, dij

In [None]:
## save npy file for allpath_dict
# np.save('allpath_dict.npy', allpath_dict)

## load npy file for allpath_dict
# allow_pickle must be the boolean True (the string 'TRUE' only worked because
# it is truthy). Pickle can execute arbitrary code: only load trusted files.
allpath_dict = np.load('allpath_dict.npy', allow_pickle=True).item() ## extremely large memory usage

#### Case analysis (currently not used)

In [None]:
'''
Note: if directed graph: 35->36 is way larger than 36->35
'''
# Compare the expected holding times of nodes 35 and 36 and show their
# entries in SIMS_dict_uniq.
print(avg_time[35])
print(avg_time[36])
SIMS_dict_uniq[36], SIMS_dict_uniq[35]

In [None]:
avg_time.shape, SIMS_dict_uniq.shape,len(all_graph_info)

In [None]:
%%script false --no-raise-error
## no longer needed

# define a function to find the x_j for a given x_i
def findXjFromXi(xi_id, input, shortestpath_dict):
    '''
    Look up the neighbours of node ``xi_id`` in a shortest-path dictionary.

    Args:
        - xi_id: key of the node in shortestpath_dict
        - input: array indexed by node id (one entry per node)
        - shortestpath_dict: maps a node id to a pair
          (array of node ids, array of distances)

    Returns:
        - xj_id: neighbour ids as int, first entry dropped
        - xj: entries of ``input`` at those ids
        - dij: matching distances as float, first entry dropped
    '''
    entry = shortestpath_dict[xi_id]

    # the first entry is skipped to avoid the node itself
    xj_id = entry[0].astype(int)[1:]
    dij = entry[1].astype(float)[1:]

    return xj_id, input[xj_id], dij

In [None]:
%%script false --no-raise-error
## no longer needed

def findpath2Xj(MDG, xi_id, xj_id):
    '''
    Return the shorter of the two directed shortest-path lengths between
    nodes xi_id and xj_id.

    Args:
        - MDG: modified directed weighted graph
        - xi_id: node index i
        - xj_id: node index j

    Returns:
        - the smaller of d(i->j) and d(j->i); if only one direction has a
          path, the length of that direction

    Raises:
        - nx.NetworkXNoPath: if neither direction has a path
    '''
    def _path_length(source, target):
        # Only "no path" is an expected failure; any other error (for example
        # an unknown node) propagates instead of being swallowed by a bare
        # except or by a return inside finally.
        try:
            return nx.dijkstra_path_length(MDG, source, target)
        except nx.NetworkXNoPath:
            return None

    dij = _path_length(xi_id, xj_id)
    dji = _path_length(xj_id, xi_id)

    if dij is None and dji is None:
        raise nx.NetworkXNoPath(f"No path between {xi_id} and {xj_id}.")
    if dij is None:
        return dji
    if dji is None:
        return dij
    return min(dij, dji)  # return the shortest path
    
# Example query between two node ids.
# NOTE(review): MDG is not defined in the visible part of this notebook; the
# enclosing cell is disabled with `%%script false`, so this does not run.
m = 2494
n = 9898
findpath2Xj(MDG, m, n)

### Find the importance weight 

In [None]:
%%script false --no-raise-error
# already run; the result is read back from path_weight.npz in the next cell

# calculate the probability of being visited during a simulated trajectory
# from the initial state

split_id = trj_id + 1 # index for split to each trajectory
P_tot = np.zeros(len(SIMS_dict_uniq))

for i in range(len(SIMS_dict_uniq)):
    for j in range(len(split_id)):
        # node ids (column 4 of SIMS_dict) visited by the j-th trajectory
        if j == 0:
            trj = SIMS_dict[0:split_id[j],4].astype(int)
        else:
            trj = SIMS_dict[split_id[j-1]:split_id[j],4].astype(int)

        # count the trajectory once if node i appears anywhere in it
        if i in trj:
            P_tot[i] += 1
# NOTE(review): 100 is presumably the number of trajectories (len(split_id));
# verify before using a data set with a different trajectory count.
P_tot = P_tot / 100

In [None]:
# with open("path_weight.npz", "wb") as f:
#     np.savez(f,
#             shortestpath_dict=shortestpath_dict,
#             P_tot=P_tot)

# Load the cached visit probabilities. allow_pickle must be the boolean True
# (the string 'TRUE' only worked because it is truthy); the context manager
# closes the archive once P_tot has been read. Pickle can execute arbitrary
# code: only load trusted files.
with np.load("path_weight.npz", allow_pickle=True) as path_weight:
    P_tot = path_weight["P_tot"]
P_tot.shape, P_tot.max(), P_tot.min()

In [None]:
%%script false --no-raise-error
# might not be needed

# calculate the important weight for each edge (node i to j)
def findWij(xi_id, xj_id, P_tot):
    '''
    Importance weight of the node pair (i, j): the product of the two
    entries of P_tot.

    Args:
        - xi_id: index of x_i (a single number)
        - xj_id: index of x_j (a single number)
        - P_tot: per-node probabilities, indexable by node index

    Returns:
        - P_tot[xi_id] * P_tot[xj_id]
    '''
    p_i = P_tot[xi_id]
    p_j = P_tot[xj_id]
    return p_i * p_j

# ViDa Model

### Make dataset

In [None]:
# Training tensors: node features, node energies, and the row index of each
# unique state. torch.Tensor(...) produces float32 tensors, so the index
# column is stored as floats and converted back to int where it is used.
train_tup = (torch.Tensor(SIMS_scar_uniq),
            torch.Tensor(SIMS_G_uniq),
            torch.Tensor(np.arange(len(SIMS_scar_uniq))))
train_dataset = torch.utils.data.TensorDataset(*train_tup)

### Set up configurations

In [None]:
# set up hyperparameters
input_dim = train_tup[0].shape[-1]

# Pick the best available accelerator instead of hard-coding 'mps', so the
# notebook also runs on CUDA machines and on CPU-only machines.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

config = Namespace(
    device = device,
    batch_size = 256,
    input_dim = input_dim,
    output_dim = input_dim,
    latent_dim = 25, # bottleneck dimension
    hidden_dim = 400,
    n_epochs = 100, # 60 for PT4_hairpin, PT0, 100 for others
    learning_rate = 0.0001, # learning rate
    log_interval = 10, # how many batches to wait before logging training status

    # hyperparameters for loss function
    alpha = 0.5,    # weight of the reconstruction loss
    beta = 1e-2,    # weight of the KL divergence
    gamma = 0.5,    # weight of the energy prediction loss
    delta = 5e-16,  # weight of the distance loss

    # alpha = 1.0,
    # beta = 1.0,
    # gamma = 1.0,
)

### Make dataloader

In [None]:
# Mini-batch loader over the training dataset; batches are reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size,
                                            shuffle=True)

In [None]:
len(train_loader.dataset), len(train_loader), train_loader.batch_size

### Encoder

In [None]:
class Encoder(nn.Module):
    '''
    Two-layer MLP encoder that maps node features to the mean and
    log-variance of a diagonal Gaussian in latent space.
    '''

    def __init__(self, input_dim, hidden_dim, latent_dim):
        '''
        Args:
        ----
            - input_dim: the dimension of the input node feature
            - hidden_dim: the dimension of the hidden layers
            - latent_dim: the dimension of the latent space (bottleneck layer)
        '''
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim

        # All hidden layers use hidden_dim. The width used to be hard-coded to
        # 400, which broke the model for any other hidden_dim.
        self.fc1 = nn.Linear(self.input_dim, self.hidden_dim)
        self.bn1 = nn.BatchNorm1d(self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.bn2 = nn.BatchNorm1d(self.hidden_dim)

        # Split the result into mu and var components
        # of the latent Gaussian distribution, note how we only output
        # diagonal values of covariance matrix. Here we assume
        # they are conditionally independent
        self.hid2mu = nn.Linear(self.hidden_dim, self.latent_dim)
        self.hid2logvar = nn.Linear(self.hidden_dim, self.latent_dim)

    def forward(self, x):
        '''
        Args:
            - x: batch of node features, last dimension input_dim

        Returns:
            - mu, logvar: two tensors with last dimension latent_dim
        '''
        # batch norm is applied after the ReLU activation
        x = self.bn1(F.relu(self.fc1(x)))
        x = self.bn2(F.relu(self.fc2(x)))
        mu = self.hid2mu(x)
        logvar = self.hid2logvar(x)
        return mu, logvar


### Decoder

In [None]:
class Decoder(nn.Module):
    '''
    Two-hidden-layer MLP decoder that maps a latent vector back to the node
    feature space.
    '''

    def __init__(self, latent_dim, hidden_dim, output_dim):
        '''
        Args:
        ----
            - latent_dim: the dimension of the latent space (bottleneck layer)
            - hidden_dim: the dimension of the hidden layers
            - output_dim: the dimension of the output node feature
        '''
        super(Decoder, self).__init__()
        self.latent_dim = latent_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        # All hidden layers use hidden_dim. The width used to be hard-coded to
        # 400, which broke the model for any other hidden_dim.
        self.fc1 = nn.Linear(self.latent_dim, self.hidden_dim)
        self.bn1 = nn.BatchNorm1d(self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.bn2 = nn.BatchNorm1d(self.hidden_dim)
        self.fc3 = nn.Linear(self.hidden_dim, self.output_dim)

    def forward(self, z):
        '''
        Args:
            - z: batch of latent vectors, last dimension latent_dim

        Returns:
            - reconstructed features, last dimension output_dim
              (linear output, no final activation)
        '''
        # batch norm is applied after the ReLU activation
        x = self.bn1(F.relu(self.fc1(z)))
        x = self.bn2(F.relu(self.fc2(x)))
        # x = torch.sigmoid(self.fc3(x))
        x = self.fc3(x)
        return x

### Regressor

In [None]:
class Regressor(nn.Module):
    '''
    Small MLP head (latent_dim -> 15 -> 1) used to predict the energy of a
    node from its latent embedding.
    '''

    def __init__(self, latent_dim):
        '''
        Args:
        ----
            - latent_dim: the dimension of the latent space (bottleneck layer)
        '''
        super().__init__()
        self.latent_dim = latent_dim

        # one hidden layer of width 15, then a single scalar output
        self.regfc1 = nn.Linear(latent_dim, 15)
        self.regfc2 = nn.Linear(15, 1)

    def forward(self, z):
        '''Return the predicted energy, one value per latent vector in z.'''
        hidden = F.relu(self.regfc1(z))
        return self.regfc2(hidden)

### VIDA model

In [None]:
class VIDA(nn.Module):
    '''
    Variational autoencoder with an extra regression head: the encoder
    produces (mu, logvar), a latent sample z is drawn from them, and z is fed
    to both the decoder and the regressor.
    '''

    def __init__(self, encoder, decoder, regressor):
        '''
        Args:
        ----
            - encoder: module returning (mu, logvar) for an input batch
            - decoder: module mapping a latent sample to reconstructed features
            - regressor: module mapping a latent sample to the predicted energy
        '''
        super().__init__()
        self.encoder, self.decoder, self.regressor = encoder, decoder, regressor

    def reparameterize(self, mu, logvar):
        '''Draw z = mu + eps * std with eps ~ N(0, I) and std = exp(0.5 * logvar).'''
        std = torch.exp(0.5*logvar)
        return mu + torch.randn_like(std)*std

    def forward(self, x):
        '''
        Returns:
            - x_recon: decoder output for the sampled z
            - y_pred: regressor output for the sampled z
            - z: the sampled latent vectors
            - mu, logvar: encoder outputs
        '''
        mu, logvar = self.encoder(x)
        z = self.reparameterize(mu, logvar)
        return self.decoder(z), self.regressor(z), z, mu, logvar

### Loss functions

In [None]:
def vae_loss(x_recon, x, mu, logvar):
    '''
    Compute the two VAE loss terms separately.

    Args:
        - x_recon: the reconstructed node feature
        - x: the original node feature
        - mu: the mean of the latent space
        - logvar: the log variance of the latent space

    Returns:
        - recon_term: mean squared error between x_recon and x (scalar tensor)
        - kl_term: KL divergence term, summed over all elements (scalar tensor)
    '''
    # reconstruction error, averaged over every element
    recon_term = F.mse_loss(torch.flatten(x_recon), torch.flatten(x))
    # KL term: -0.5 * sum(1 + logvar - mu^2 - exp(logvar))
    kl_term = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()
    return recon_term, kl_term


def pred_loss(y_pred, y):
    '''
    Mean squared error between predicted and true node energies.

    Args:
    ----
        - y_pred: the predicted energy of the node
        - y: the true energy of the node

    Returns:
        - loss: scalar PyTorch tensor with the prediction loss
    '''
    # both tensors are flattened so that (N, 1) predictions match (N,) targets
    predicted = torch.flatten(y_pred)
    target = torch.flatten(y)
    return F.mse_loss(predicted, target)


def distance_loss(zi, zj, dij, wij):
    '''
    Weighted, relative squared error between the Euclidean distance of two
    embeddings and a target distance:

        wij * (||zi - zj|| - dij)^2 / dij^2

    Args:
        - zi: the embedding of the node i
        - zj: the embedding of the node j
        - dij: the minimum expected holding time (distance) between i and j
        - wij: the importance weight of the edge (i,j) --> float

    Returns:
        - loss: PyTorch tensor with the embedding distance loss
    '''
    # L2 distance between the two embeddings
    embed_dis = torch.nn.PairwiseDistance(p=2)(zi, zj)
    mismatch = embed_dis - dij
    return wij * mismatch**2 / (dij**2)

### Train VIDA

In [None]:
def train(config, model, train_loader,
          optimizer, vae_loss, pred_loss, distance_loss,
          ):
    '''
    Train VIDA!

    The total loss per mini batch is
    alpha * reconstruction + beta * KL + gamma * prediction + delta * distance,
    where the distance loss is accumulated over every adjacent pair of samples
    in the (shuffled) batch.

    Besides its arguments, this function reads the notebook-level array
    ``P_tot`` and the per-node files ``./data/shortest_path/path_{id}.npy``.
    It writes TensorBoard logs and the final ``model.pt`` to
    ``./model_config/<MMDD-HHMM>/``.

    Args:
    ----
        - config: Experiment configurations (device, n_epochs, log_interval,
          alpha, beta, gamma, delta)
        - model: Pytorch VIDA model
        - train_loader: Pytorch DataLoader for training set, yielding
          (features, energy, node index) batches
        - optimizer: Pytorch optimizer
        - vae_loss: the VAE loss function
        - pred_loss: the energy prediction loss function
        - distance_loss: the distance loss function
    '''

    model.to(config.device)
    model.train()

    log_dir = f'./model_config/{time.strftime("%m%d-%H%M")}'
    writer = SummaryWriter(log_dir=log_dir)

    print('\n ------- Start Training -------')
    try:
        # Honour the configured number of epochs (this used to be a
        # hard-coded single epoch left over from debugging).
        for epoch in range(config.n_epochs):

            training_loss = []

            for batch_idx, (x, y, idx) in enumerate(train_loader):  # mini batch

                # Configure input
                x = x.to(config.device)
                y = y.to(config.device)
                # Node ids are only used for Python-side lookups, so convert
                # them once instead of calling .item() per element.
                node_ids = [int(v) for v in idx.tolist()]

                # ------------------------------------------
                #  Train VIDA
                # ------------------------------------------
                optimizer.zero_grad()

                # get the reconstructed nodes, predicted energy, and the embeddings
                x_recon, y_pred, z, mu, logvar = model(x)

                # compute the distance loss (kinetic loss)
                # Start from a tensor so that .item() below also works when the
                # batch has no pair to compare.
                dist_loss = torch.zeros((), device=z.device, dtype=z.dtype)

                # every adjacent pair (i, i+1) of the batch, including the last one
                for i in range(len(node_ids) - 1):
                    xi_id = node_ids[i]         # node i index
                    xj_id = node_ids[i + 1]     # node j index
                    zi = z[i]                   # node i embedding in the batch
                    zj = z[i + 1]               # node j embedding in the batch

                    # dij = nx.dijkstra_path_length(MUG, xi_id, xj_id) # embedding distance between i and j
                    # precomputed shortest-path lengths from node i (one file per source node)
                    dij = np.load(f'./data/shortest_path/path_{xi_id}.npy',allow_pickle=True)[0][xj_id]

                    wij = P_tot[xi_id] * P_tot[xj_id] # importance weight of nodes i and j

                    dist_loss = dist_loss + distance_loss(zi, zj, dij, wij) # loss between i and j

                # compute the total loss
                recon_loss, kl_loss = vae_loss(x_recon, x, mu, logvar)
                p_loss = pred_loss(y_pred, y)

                # scaling the loss
                recon_loss = config.alpha * recon_loss
                kl_loss = config.beta * kl_loss
                p_loss = config.gamma * p_loss
                dist_loss = config.delta * dist_loss

                loss = recon_loss + kl_loss + p_loss + dist_loss # total loss

                training_loss.append(loss.item())

                # backpropagation and optimization
                loss.backward()
                optimizer.step()

                # ------------------------------------------
                # Log Progress
                # ------------------------------------------
                if batch_idx % config.log_interval == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, batch_idx * len(x), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader),
                        loss.item()))

                    # global step shared by all scalars logged for this batch
                    global_step = epoch * len(train_loader) + batch_idx
                    writer.add_scalar('training loss', loss.item(), global_step)
                    writer.add_scalar('recon loss', recon_loss.item(), global_step)
                    writer.add_scalar('kl loss', kl_loss.item(), global_step)
                    writer.add_scalar('pred loss', p_loss.item(), global_step)
                    writer.add_scalar('dist loss', dist_loss.item(), global_step)

            print ('====> Epoch: {} Average loss: {:.4f}'.format(epoch, np.mean(training_loss)))
            writer.add_scalar('epoch training loss', np.mean(training_loss), epoch)
    finally:
        # flush and close the event file even if training is interrupted
        writer.close()
    print('\n ------- Finished Training -------')

    # save the model
    torch.save(model.state_dict(), f'{log_dir}/model.pt')
    

In [None]:
# define models: the three sub-networks are built from the shared config
encoder = Encoder(input_dim=config.input_dim, hidden_dim=config.hidden_dim, latent_dim=config.latent_dim)
decoder = Decoder(latent_dim=config.latent_dim, hidden_dim=config.hidden_dim, output_dim=config.output_dim)
regressor = Regressor(latent_dim=config.latent_dim)

# wrap the three sub-networks into the full VIDA model
vida = VIDA(encoder, decoder, regressor)

# define optimizer: Adam over all parameters of the wrapped model
optimizer = torch.optim.Adam(vida.parameters(), lr=config.learning_rate)

In [None]:
# train VIDA
train(config, vida, train_loader, optimizer, vae_loss, pred_loss, distance_loss)

In [None]:
%load_ext tensorboard
%tensorboard --logdir model_config/ --host localhost --port 8000
#  http://localhost:8000

### Load trained model

In [None]:
# Rebuild the model around the same sub-networks and put it on the configured
# device, so this cell also works when the training cell was skipped.
model = VIDA(encoder, decoder, regressor)
model.to(config.device)
# map_location lets a checkpoint saved on one device (e.g. mps / cuda) be
# loaded on a machine where that device is not available.
model.load_state_dict(torch.load('./model_config/0218-1818/model.pt', map_location=config.device))

### Get embeddings

In [None]:
# do inference
model.eval()

with torch.no_grad():
    # Cast to float32 explicitly: the training tensors were built with
    # torch.Tensor(...) (float32), while torch.tensor(...) keeps the NumPy
    # dtype and would fail against float32 weights for any other dtype.
    inputs = torch.as_tensor(SIMS_scar_uniq, dtype=torch.float32).to(config.device)
    # z is the sampled latent vector returned by VIDA.forward (third output)
    _, _, z, _, _ = model(inputs)
        # _, _, z, _, _ = model(torch.tensor(train_loader.dataset.tensors[0]).to(config.device))
        

In [None]:
# Move the embeddings to the CPU as a NumPy array. This replaces the
# data_embed variable that was loaded from the npz archive.
data_embed = z.to('cpu').numpy()
data_embed.shape

### 1. PCA

In [None]:
# Project the model embeddings (data_embed) onto the first 3 principal components
pca_coords = PCA(n_components=3).fit_transform(data_embed)

# Expand the per-unique-state coordinates to every simulated step via coord_id_S
pca_all_coords = pca_coords[coord_id_S]  # multiple trj

pca_coords.shape, pca_all_coords.shape

In [None]:
(np.unique(pca_coords,axis=0)).shape, (np.unique(pca_all_coords,axis=0)).shape

### 2. PHATE

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise each embedding dimension (zero mean, unit variance).
# NOTE(review): data_embed is overwritten here, so every later cell (PHATE,
# UMAP, t-SNE, scree plot) sees the scaled data, whereas pca_coords above was
# computed from the unscaled embeddings.
scaler = StandardScaler()
data_embed = scaler.fit_transform(data_embed)
data_embed

In [None]:
# Run PHATE on the model embeddings (data_embed)
phate_operator = phate.PHATE(n_jobs=-2)
phate_coords = phate_operator.fit_transform(data_embed)

# Expand the per-unique-state coordinates to every simulated step via coord_id_S
phate_all_coords = phate_coords[coord_id_S]

phate_coords.shape, phate_all_coords.shape

In [None]:
(np.unique(phate_coords,axis=0)).shape, (np.unique(phate_all_coords,axis=0)).shape

### 3. UMAP

In [None]:
# UMAP set
umap_2d = UMAP(n_components=2, init='random', random_state=0)
# umap_3d = UMAP(n_components=3, init='random', random_state=0)

# UMAP 2D fit transform
umap_coord_2d = umap_2d.fit_transform(data_embed)
# expand the per-unique-state coordinates to every simulated step
umap_all_coord_2d = umap_coord_2d[coord_id_S]

# UMAP 3D fit transform (disabled)
# umap_coord_3d = umap_3d.fit_transform(data_embed)
# umap_all_coord_3d = umap_coord_3d[coord_id_S]

# Only the 2D embedding is computed in this cell, so only it is reported.
# The previous print also referenced umap_coord_3d, which is not recomputed
# here and therefore did not describe the current embeddings.
print((np.unique(umap_coord_2d,axis=0)).shape)
print(umap_all_coord_2d.shape, (np.unique(umap_all_coord_2d,axis=0)).shape)
# print(umap_all_coord_3d.shape, (np.unique(umap_all_coord_3d,axis=0)).shape)

### 4. t-SNE

In [None]:
# tsne set
tsne_2d = TSNE(n_components=2, perplexity=1000.0, random_state=0)
# tsne_3d = TSNE(n_components=3, random_state=0)

# tsne 2D fit transform
tsne_coord_2d = tsne_2d.fit_transform(data_embed)
# expand the per-unique-state coordinates to every simulated step
tsne_all_coord_2d = tsne_coord_2d[coord_id_S]

# # tsne 3D fit transform (disabled)
# tsne_coord_3d = tsne_3d.fit_transform(data_embed)
# tsne_all_coord_3d = tsne_coord_3d[coord_id_S]


# Only the 2D embedding is computed in this cell, so only it is reported.
# The previous print also referenced tsne_coord_3d, which is not recomputed
# here and therefore did not describe the current embeddings.
print((np.unique(tsne_coord_2d,axis=0)).shape)
print(tsne_all_coord_2d.shape, (np.unique(tsne_all_coord_2d,axis=0)).shape)
# print(tsne_all_coord_3d.shape, (np.unique(tsne_all_coord_3d,axis=0)).shape)

# Visualize

In [None]:
SEQ

### 1. PCA Vis

In [None]:
%matplotlib inline
# PCA coordinates of every simulated step (all trajectories)
X = pca_all_coords[:,0]
Y = pca_all_coords[:,1]
Z = pca_all_coords[:,2]

# PCA: 2 components, coloured by SIMS_G
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c=SIMS_G,
          cmap='plasma',
          s=20
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]*0.95),fontsize=15,c="yellow", horizontalalignment='center')

In [None]:
%matplotlib inline
# PCA coordinates of the unique states only
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]

# PCA: 2 components, coloured by SIMS_G_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i]-0.3,y[i]-0.3),fontsize=15,c="yellow")

In [None]:
# PCA coordinates of the unique states
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]

# PCA: 3 components
# Create the figure with a single 3D axes. plt.subplots() followed by
# plt.axes(projection="3d") left an empty 2D axes underneath the 3D plot.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(projection="3d")

im = ax.scatter3D(X,Y,Z,
          c=SIMS_G_uniq,
          cmap='plasma')
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
z = [Z[0], Z[-1]]
ax.scatter(x,y,z,s=100,c="green",alpha=1)

In [None]:
# PCA coordinates of the unique states
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]


# PCA: 2 components, coloured by SIMS_pair_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y,
          c=SIMS_pair_uniq,
          cmap='plasma',
          s=15
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i]-0.3,y[i]-0.3),fontsize=15,c="yellow")

#### Try use PCA directly without AE

In [None]:
# Baseline: PCA applied directly to the input features (SIMS_scar_uniq),
# without passing them through the model
pca_coords1 = PCA(n_components=3).fit_transform(SIMS_scar_uniq)   # multiple trj

X = pca_coords1[:,0]
Y = pca_coords1[:,1]
Z = pca_coords1[:,2]

# PCA: 2 components, coloured by SIMS_G_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, 
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i]-0.3,y[i]-0.3),fontsize=15,c="black")

In [None]:
# Fit a 25-component PCA on data_embed to inspect how the variance is spread.
# NOTE(review): 25 matches config.latent_dim in this notebook; data_embed is
# the standardised version at this point if the StandardScaler cell was run.
cm = PCA(n_components=25)
cm.fit(data_embed)

# Cumulative explained variance against the number of components
PC_values = np.arange(cm.n_components_) + 1
plt.plot(PC_values, np.cumsum(cm.explained_variance_ratio_), 'ro-', linewidth=2)
plt.title('Scree Plot: PCA')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance');
# plt.xticks(np.arange(0, data_embed.shape[-1]+1, 1))

plt.show()

In [None]:
np.cumsum(cm.explained_variance_ratio_)

### 2. PHATE Vis

In [None]:
# PHATE coordinates of every simulated step (all trajectories)
X_phate = phate_all_coords[:,0]
Y_phate = phate_all_coords[:,1]

# 2D scatter coloured by SIMS_G
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X_phate,Y_phate,
                c=SIMS_G,   # multiple trj               
                cmap='plasma',
               )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X_phate[0],X_phate[-1]]
y = [Y_phate[0],Y_phate[-1]]
plt.scatter(x,y,s=50, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=30,c="black")

In [None]:
# PHATE coordinates of the unique states only
X_phate = phate_coords[:,0]
Y_phate = phate_coords[:,1]

# 2D scatter coloured by SIMS_G_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X_phate,Y_phate,
                c=SIMS_G_uniq,            
                cmap='plasma',
               )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X_phate[0],X_phate[-1]]
y = [Y_phate[0],Y_phate[-1]]
plt.scatter(x,y,s=50, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=30,c="black")

#### PHATE without AE

In [None]:
# Baseline: PHATE applied directly to the input features (SIMS_scar_uniq),
# without passing them through the model
phate_operator = phate.PHATE(n_jobs=-2)
phate1 = phate_operator.fit_transform(SIMS_scar_uniq)   # multiple trj

# 2D scatter coloured by SIMS_G_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(phate1[:,0],
          phate1[:,1],
          c=SIMS_G_uniq, 
          cmap='plasma',
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [phate1[:,0][0],phate1[:,0][-1]]
y = [phate1[:,1][0],phate1[:,1][-1]]
plt.scatter(x,y,s=50, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i],y[i]),fontsize=20,c="black")

### 3. UMAP Vis

In [None]:
# UMAP coordinates of the unique states
X = umap_coord_2d[:,0]
Y = umap_coord_2d[:,1]
cmap = plt.cm.plasma
# plt.get_cmap is the supported accessor; plt.cm.get_cmap was deprecated and
# later removed from matplotlib. cmap_r (reversed colormap) is kept as an
# alternative to cmap.
cmap_r = plt.get_cmap('plasma_r')

# UMAP: 2 components, coloured by SIMS_G_uniq
fig,ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y,
          c = SIMS_G_uniq,
          cmap=cmap,
          s=10
        )

plt.colorbar(im)

# Mark the first ("I") and last ("F") points of the coordinate array
annotations=["I","F"]
x = [X[0],X[-1]]
y = [Y[0],Y[-1]]
plt.scatter(x,y,s=150, c="green", alpha=1)
for i, label in enumerate(annotations):
    plt.annotate(label, (x[i]-0.3,y[i]-0.3),fontsize=15,c="yellow")

In [None]:
# Baseline: UMAP 2D applied directly to the input features (SIMS_scar_uniq).
# This refits the existing umap_2d estimator on the raw features.
umap_coord_2dscar = umap_2d.fit_transform(SIMS_scar_uniq)

# interactive scatter coloured by SIMS_G_uniq
fig_2d = px.scatter(
    umap_coord_2dscar, x=0, y=1,color=SIMS_G_uniq
)
fig_2d.update_traces(marker_size=3)
fig_2d.show()



In [None]:
# Interactive 2D scatter of the UMAP embedding, coloured by SIMS_G_uniq
fig_2d = px.scatter(
    umap_coord_2d, x=0, y=1,color=SIMS_G_uniq
)
fig_2d.update_traces(marker_size=3)


# Interactive 3D scatter.
# NOTE(review): the 3D UMAP fit is commented out in the UMAP cell, so
# umap_coord_3d here is presumably the array loaded from the npz archive
# rather than an embedding of the current model output — verify.
fig_3d = px.scatter_3d(
    umap_coord_3d, x=0, y=1, z=2,color=SIMS_G_uniq
)

fig_3d.update_traces(marker_size=2)

fig_2d.show()
fig_3d.show()



### 4. t-SNE Vis

In [None]:
# Interactive 2D scatter of the t-SNE embedding, coloured by SIMS_G_uniq;
# the hover box additionally shows SIMS_G_uniq and SIMS_HT_uniq per point.
fig_2d = px.scatter(
    tsne_coord_2d, x=0, y=1,color=SIMS_G_uniq,
    hover_data = {"SIMS_G_uniq":SIMS_G_uniq, 
                  "SIMS_HT_uniq":SIMS_HT_uniq,
                  }
)
fig_2d.update_traces(marker_size=3)
fig_2d.show()


# fig_3d = px.scatter_3d(
#     tsne_coord_3d, x=0, y=1, z=2,color=SIMS_G_uniq
# )
# fig_3d.update_traces(marker_size=2)

