In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import h5py
from misc import *

from sklearn.decomposition import PCA
import phate

## 1. Import/generate Data

### 1.2 Load single simulated trajectory from Multistrand

In [None]:
# Path of the simulated trajectory text file to load.
file_name = "data/helix_assoc_PT0_new_2/assoc_PT0_1sim_20C_0.txt"
# file_name = "data/helix_assoc_PT3_new_2/assoc_PT3_1sim_20C_0.txt"

# define absorbing (final) state structure
FINAL_STRUCTURE = "(((((((((((((((((((((((((+)))))))))))))))))))))))))"

# Parse the trajectory inside a context manager so the file handle is always
# closed, even when `loadtrj` raises (the handle used to stay open for the
# lifetime of the kernel).
with open(file_name, 'r') as f:  # PT0
    SIM = loadtrj(f,FINAL_STRUCTURE,type="Multiple")

# Array copy of the parsed states, used later to look structures up by index.
SIM_retrieve = np.array(SIM)
# NOTE(review): presumably joins the two strands of every state into a single
# dot-paren string -- confirm against `misc.concat_helix_structures`.
SIM_concat = concat_helix_structures(SIM)

print("SIM: ", len(SIM))
print("SIM_retrieve: ", SIM_retrieve.shape)
print("SIM_concat: ", len(SIM_concat))

### 2. Convert dot-paren to adjacency matrix

In [None]:
# Dimensions of the SIM_adj array
#   SIM_adj: N*m*m
#       N: number of states in the trajectory
#       m: number of nucleotides in the state (strand)
#
# Single trajectory: get the adjacency matrix, energy, and holding time for
# each state. (SIM_T and SIM_pair are also returned by `sim_adj`; their exact
# meaning is defined in `misc`.)
SIM_adj, SIM_G, SIM_T, SIM_HT, SIM_pair = sim_adj(SIM_concat)

# Report the shape of every returned array on one line.
print(*(arr.shape for arr in (SIM_adj, SIM_G, SIM_T, SIM_HT, SIM_pair)))

In [None]:
# Single trajectory: reduce the per-step arrays to the unique states.
# Returns the unique adjacency matrices together with their occupancy density,
# the matching unique energies / times, and the corresponding indices.
(indices, occ_density,
 SIM_adj_uniq, SIM_G_uniq, SIM_T_uniq, SIM_HT_uniq, SIM_pair_uniq) = get_unique(
    SIM_concat, SIM_adj, SIM_G, SIM_T, SIM_HT, SIM_pair)

print(*(arr.shape for arr in (indices, occ_density, SIM_adj_uniq, SIM_G_uniq,
                              SIM_T_uniq, SIM_HT_uniq, SIM_pair_uniq)))

# Sanity check: both shapes agree when every adjacency matrix is distinct.
print(SIM_adj_uniq.shape, np.unique(SIM_adj_uniq, axis=0).shape)

### 3. Get labeled trajectory data

In [None]:
# Single trajectory: attach a label to every state of the trajectory.
SIM_dict = label_structures(SIM_concat, indices)

# The last column of SIM_dict is used as the integer id of each trajectory step.
coord_id = SIM_dict[:, -1].astype(int)
print(SIM_dict.shape, coord_id.shape)

# Show the structure with the largest occupancy density.
print(SIM_retrieve[indices[np.argmax(occ_density)]])

### 4. Convert adjacency matrices to scattering coefficients

#### SIM_scar

In [None]:
# Single trajectory, all states: adjacency matrices -> scattering coefficients
# -> normalized moments (singleton axes dropped by `.squeeze()`).
scat_coeff_array = transform_dataset(SIM_adj)
SIM_scar = get_normalized_moments(scat_coeff_array)
SIM_scar = SIM_scar.squeeze()
print(SIM_scar.shape)

#### SIM_scar_uniq

In [None]:
# Unique states only: same transform as for the full trajectory, applied to
# the unique adjacency matrices to obtain the unique scattering features.
scat_coeff_array_S = transform_dataset(SIM_adj_uniq)
SIM_scar_uniq = get_normalized_moments(scat_coeff_array_S)
SIM_scar_uniq = SIM_scar_uniq.squeeze()
print(SIM_scar_uniq.shape)

In [None]:
# Compare the number of unique-state feature rows with the number of distinct rows.
print(SIM_scar_uniq.shape, np.unique(SIM_scar_uniq, axis=0).shape)

# True when indexing the unique features by `coord_id` reproduces the
# per-step features of the full trajectory.
np.array_equal(SIM_scar, SIM_scar_uniq[coord_id])

### 5. Split data into training and test sets

In [None]:
# Shape of split data
#     train_data: [tr_adjs, tr_coeffs, tr_energies]
#     test_data: [te_adjs, te_coeffs, te_energies]
#
# Single trajectory: split the unique adjacency matrices, scattering features
# and energies into train and test parts.
(train_data, test_data) = split_data(
    SIM_adj_uniq,
    SIM_scar_uniq,
    SIM_G_uniq,
)

### 6. Train and test dataloader

In [None]:
# Structure of train_tup when gnn=False
#     train_tup: [train_coeffs,train_energy]
#
# Build the loaders / tuples used for training and evaluation.
(train_loader, train_tup, test_tup,
 valid_loader, early_stop_callback) = load_trte(train_data, test_data, batch_size=64)

# Display the train / test feature shapes and the loader batch size.
train_tup[0].shape, test_tup[0].shape, train_loader.batch_size

## 2.1 Load Model

In [None]:
# Hyperparameters for the GSAE model.

# Feature width of the training inputs and number of batches per epoch.
input_dim = train_tup[0].shape[-1]
len_epoch = len(train_loader)

# Built directly as a Namespace (attribute access), field order unchanged.
hparams = argparse.Namespace(
    input_dim=input_dim,
    bottle_dim=25,
    hidden_dim=400,  # not used in model

    len_epoch=len_epoch,
    learning_rate=0.0001,
    max_epochs=100,  # PT0 --> 1985 epoch  # PT3 --> 60, 100, 150, 756 (overfit) epoch
    n_gpus=0,
    batch_size=64,  # not used in model

    alpha=1.0,
    beta=0.0001,
)

model = GSAE(hparams)
print(model)

## 2.2 Train Model

In [None]:
# Trainer settings taken from the hyperparameter namespace; the explicit
# keyword overrides pin the epoch count and the GPU count.
trainer_overrides = {
    "max_epochs": hparams.max_epochs,
    "gpus": hparams.n_gpus,
    # "callbacks": [early_stop_callback],
}
trainer = pl.Trainer.from_argparse_args(hparams, **trainer_overrides)

# Fit on the training loader, validating on the validation loader.
trainer.fit(
    model=model,
    train_dataloader=train_loader,
    val_dataloaders=valid_loader,
)

In [None]:
# Display the model's repr as the cell output.
model

In [None]:
# Load the TensorBoard notebook extension and start it on the
# `lightning_logs/` directory, bound to localhost port 8000.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/ --host localhost --port 8000
# The dashboard is then reachable at http://localhost:8000

## 3. Load Pretrained Models

In [None]:
# Path of the pickled, pretrained model to load.
fname_model = "models/helix_assoc_new_PT0_multrj_model_100epoch.pickle"
# fname_model = "models/helix_assoc_new_PT3_multrj_model_100epoch.pickle"

# SECURITY NOTE: unpickling executes arbitrary code embedded in the file, so
# only load model files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(fname_model, 'rb') as model_file:
    model = pickle.load(model_file)

# Display the loaded model's repr.
model

## 4. Get Embeddings

In [None]:
# Single trajectory: embed the unique-state scattering features with the model.
# Gradients are not needed for inference, so autograd is disabled.
with torch.no_grad():
    scat_tensor = torch.Tensor(SIM_scar_uniq)
    # `model.embed` returns a sequence; its first element is kept as the embedding.
    data_embed = model.embed(scat_tensor)[0]

In [None]:
# PCA (3 components) of the GSAE-embedded unique states.
pca_3d = PCA(n_components=3)
pca_coords = pca_3d.fit_transform(data_embed)

# Expand to one coordinate row per trajectory step (single trj).
pca_all_coords = pca_coords[coord_id]

# Display both shapes.
pca_coords.shape, pca_all_coords.shape

In [None]:
# PHATE of the GSAE-embedded unique states.
phate_operator = phate.PHATE(n_jobs=-2)
phate_coords = phate_operator.fit_transform(data_embed)

# Expand to one coordinate row per trajectory step (single trj).
phate_all_coords = phate_coords[coord_id]

# Display both shapes.
phate_coords.shape, phate_all_coords.shape

In [None]:
# For PCA and then PHATE: number of distinct coordinate rows among the unique
# states versus among all trajectory steps.
for coords_uniq, coords_all in ((pca_coords, pca_all_coords),
                                (phate_coords, phate_all_coords)):
    print(np.unique(coords_uniq, axis=0).shape, np.unique(coords_all, axis=0).shape)

## 5. Visualize

### 1. PCA Vis

In [None]:
# PCA coordinates of every trajectory step.
X, Y, Z = pca_all_coords[:, 0], pca_all_coords[:, 1], pca_all_coords[:, 2]

# PCA: 2 components, coloured by the per-step energy.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.scatter(X, Y, c=SIM_G, cmap='plasma')
fig.colorbar(im, ax=ax)

# Highlight the first ("I") and last ("F") step of the trajectory.
annotations = ["I", "F"]
x = [X[0], X[-1]]
y = [Y[0], Y[-1]]
ax.scatter(x, y, s=150, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px - 0.3, py - 0.3), fontsize=15, c="yellow")

In [None]:
# PCA coordinates of the unique states.
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]

# PCA: 2 components, coloured by the unique-state energy.
fig, ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, c=SIM_G_uniq, cmap='plasma')
fig.colorbar(im, ax=ax)

# Mark the initial ("I") and final ("F") states of the trajectory.
# `coord_id` maps every trajectory step to its row among the unique states,
# so its first / last entries locate the endpoints regardless of the order in
# which the unique states are stored (previously rows 0 and -1 were assumed).
annotations = ["I", "F"]
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [X[row] for row in endpoint_rows]
y = [Y[row] for row in endpoint_rows]
ax.scatter(x, y, s=150, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px - 0.3, py - 0.3), fontsize=15, c="yellow")

In [None]:
# PCA coordinates of the unique states.
X = pca_coords[:,0]
Y = pca_coords[:,1]
Z = pca_coords[:,2]

# PCA: 3 components.
# Create the 3D axes directly; calling `plt.subplots()` and then
# `plt.axes(projection="3d")` left an unused 2D axes underneath the 3D plot.
fig, ax = plt.subplots(figsize=(8,6), subplot_kw={"projection": "3d"})

im = ax.scatter3D(X, Y, Z, c=SIM_G_uniq, cmap='plasma')
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
fig.colorbar(im, ax=ax)

# Mark the initial and final states of the trajectory. `coord_id` maps every
# trajectory step to its row among the unique states, so its first / last
# entries locate the endpoints regardless of how the unique states are ordered.
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [X[row] for row in endpoint_rows]
y = [Y[row] for row in endpoint_rows]
z = [Z[row] for row in endpoint_rows]
ax.scatter(x, y, z, s=100, c="green", alpha=1);

In [None]:
## Scree Plot
# Fit a PCA with as many components as the embedding has dimensions
# (previously hard-coded to 25, which only matches a 25-d bottleneck).
cm = PCA(n_components=data_embed.shape[-1])
cm.fit(data_embed)

# 1-based component numbers for the x axis.
PC_values = np.arange(cm.n_components_) + 1

fig, ax = plt.subplots()
ax.plot(PC_values, cm.explained_variance_ratio_, 'ro-', linewidth=2)
ax.set_title('Scree Plot: GSAE+PCA')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Proportion of Variance Explained')
# Put ticks exactly on the plotted component numbers; the old ticks ran over
# 0..d-1 and were therefore shifted by one relative to the data.
ax.set_xticks(PC_values)

plt.show()

#### Try using PCA directly without the AE

In [None]:
# PCA applied directly to the unique-state scattering features (no autoencoder).
pca_coords1 = PCA(n_components=3).fit_transform(SIM_scar_uniq)  # single trj

X = pca_coords1[:,0]
Y = pca_coords1[:,1]
Z = pca_coords1[:,2]

# PCA: 2 components, coloured by the unique-state energy.
fig, ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X, Y, c=SIM_G_uniq, cmap='plasma')
fig.colorbar(im, ax=ax)

# Mark the initial ("I") and final ("F") states of the trajectory.
# `coord_id` maps every trajectory step to its row among the unique states,
# so its first / last entries locate the endpoints regardless of the order in
# which the unique states are stored (previously rows 0 and -1 were assumed).
annotations = ["I", "F"]
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [X[row] for row in endpoint_rows]
y = [Y[row] for row in endpoint_rows]
ax.scatter(x, y, s=150, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px - 0.3, py - 0.3), fontsize=15, c="black")

In [None]:
# PCA coordinates computed directly from the scattering features.
X = pca_coords1[:,0]
Y = pca_coords1[:,1]
Z = pca_coords1[:,2]

# PCA: 3 components.
# Create the 3D axes directly; calling `plt.subplots()` and then
# `plt.axes(projection="3d")` left an unused 2D axes underneath the 3D plot.
fig, ax = plt.subplots(figsize=(8,6), subplot_kw={"projection": "3d"})

im = ax.scatter3D(X, Y, Z, c=SIM_G_uniq, cmap='plasma')
# Axis labels added for consistency with the GSAE+PCA 3D figure.
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
fig.colorbar(im, ax=ax)

# Mark the initial and final states of the trajectory. `coord_id` maps every
# trajectory step to its row among the unique states, so its first / last
# entries locate the endpoints regardless of how the unique states are ordered.
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [X[row] for row in endpoint_rows]
y = [Y[row] for row in endpoint_rows]
z = [Z[row] for row in endpoint_rows]
ax.scatter(x, y, z, s=100, c="green", alpha=1);

### 2. PHATE Vis

In [None]:
# PHATE coordinates of every trajectory step.
X_phate, Y_phate = phate_all_coords[:, 0], phate_all_coords[:, 1]

# Scatter coloured by the per-step energy.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.scatter(X_phate, Y_phate, c=SIM_G, cmap='plasma')
fig.colorbar(im, ax=ax)

# Highlight the first ("I") and last ("F") step of the trajectory.
annotations = ["I", "F"]
x = [X_phate[0], X_phate[-1]]
y = [Y_phate[0], Y_phate[-1]]
ax.scatter(x, y, s=50, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px, py), fontsize=30, c="black")

In [None]:
# PHATE coordinates of the unique states.
X_phate = phate_coords[:,0]
Y_phate = phate_coords[:,1]

# Scatter coloured by the unique-state energy.
fig, ax = plt.subplots(figsize=(8,6))
im = ax.scatter(X_phate, Y_phate, c=SIM_G_uniq, cmap='plasma')
fig.colorbar(im, ax=ax)

# Mark the initial ("I") and final ("F") states of the trajectory.
# `coord_id` maps every trajectory step to its row among the unique states,
# so its first / last entries locate the endpoints regardless of the order in
# which the unique states are stored (previously rows 0 and -1 were assumed).
annotations = ["I", "F"]
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [X_phate[row] for row in endpoint_rows]
y = [Y_phate[row] for row in endpoint_rows]
ax.scatter(x, y, s=50, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px, py), fontsize=30, c="black")

#### PHATE without AE

In [None]:
# PHATE applied directly to the unique-state scattering features (no autoencoder).
phate_operator = phate.PHATE(n_jobs=-2)
phate1 = phate_operator.fit_transform(SIM_scar_uniq)   # single trj

# Scatter coloured by the unique-state energy.
fig, ax = plt.subplots(figsize=(8,6))
im = ax.scatter(phate1[:,0], phate1[:,1], c=SIM_G_uniq, cmap='plasma')
fig.colorbar(im, ax=ax)

# Mark the initial ("I") and final ("F") states of the trajectory.
# `coord_id` maps every trajectory step to its row among the unique states,
# so its first / last entries locate the endpoints regardless of the order in
# which the unique states are stored (previously rows 0 and -1 were assumed).
annotations = ["I", "F"]
endpoint_rows = [coord_id[0], coord_id[-1]]
x = [phate1[row, 0] for row in endpoint_rows]
y = [phate1[row, 1] for row in endpoint_rows]
ax.scatter(x, y, s=50, c="green", alpha=1)
for label, px, py in zip(annotations, x, y):
    ax.annotate(label, (px, py), fontsize=20, c="black")

## Draw helix structure

In [None]:
import networkx as nx
import numpy as np
from networkx.drawing.nx_pylab import draw_networkx
from networkx.drawing.layout import *

import matplotlib.pyplot as plt
from misc import *

# NOTE: this cell rebinds SIM, SIM_adj, SIM_G, ... and therefore replaces the
# trajectory loaded earlier in the notebook.
STRAND_NAME = "assos_PT0_1sim_20C_51"

# define absorbing (final) state structure
FINAL_STRUCTURE = "(((((((((((((((((((((((((+)))))))))))))))))))))))))"

# Parse the trajectory inside a context manager so the file handle is always
# closed (it previously stayed open for the lifetime of the kernel).
with open('./data/helix_assos/assos_PT0_1sim_20C_51.txt', 'r') as f:  # PT0
    SIM = loadtrj(f,FINAL_STRUCTURE,type="Multiple")
SIM_retrieve = np.array(SIM)
SIM_concat = concat_helix_structures(SIM)

print("SIM: ", len(SIM))
print("SIM_retrieve: ", SIM_retrieve.shape)
print("SIM_concat: ", len(SIM_concat))

# get single trajectory's data
# get adjacency matrix, energy, and holding time for each state
# `sim_adj` is unpacked into five values and `get_unique` takes / returns the
# extra pair array, matching how both helpers are used earlier in this
# notebook (the previous four-value unpacking disagreed with that usage).
SIM_adj,SIM_G,SIM_T,SIM_HT,SIM_pair = sim_adj(SIM_concat)
print(SIM_adj.shape,SIM_G.shape,SIM_T.shape,SIM_HT.shape,SIM_pair.shape)

# single trajectory
indices,occ_density,SIM_adj_uniq,SIM_G_uniq,SIM_T_uniq,SIM_HT_uniq,SIM_pair_uniq \
     = get_unique(SIM_concat,SIM_adj,SIM_G,SIM_T,SIM_HT,SIM_pair)
print(indices.shape, occ_density.shape, SIM_adj_uniq.shape,SIM_G_uniq.shape,
      SIM_T_uniq.shape,SIM_HT_uniq.shape,SIM_pair_uniq.shape)

# Sanity check: both shapes agree when every adjacency matrix is distinct.
print(np.unique(SIM_adj_uniq,axis=0).shape, SIM_adj_uniq.shape)

In [None]:
# Draw the first five states of the trajectory, one graph per panel.
# `nx.from_numpy_array` replaces `from_numpy_matrix`, which was removed in
# NetworkX 3.0; it builds the same graph from a 2D adjacency array.
ex_neighborhood_graphs = [nx.from_numpy_array(x) for x in SIM_adj[:5]]

fig, ax = plt.subplots(1,5, figsize=(20,5))

for i, g in enumerate(ex_neighborhood_graphs):
    ppos = kamada_kawai_layout(g)
    nx.draw(g, ax=ax[i], node_size=5, pos=ppos)

In [None]:
# Draw the last three states of the trajectory, one graph per panel.
# `nx.from_numpy_array` replaces `from_numpy_matrix`, which was removed in
# NetworkX 3.0; it builds the same graph from a 2D adjacency array.
ex_neighborhood_graphs = [nx.from_numpy_array(x) for x in SIM_adj[-3:]]

fig, ax = plt.subplots(1,3, figsize=(20,5))

for i, g in enumerate(ex_neighborhood_graphs):
    ppos = kamada_kawai_layout(g)
    nx.draw(g, ax=ax[i], node_size=5, pos=ppos)

In [None]:
# Draw the five states at positions -20 .. -16 of the trajectory.
# `nx.from_numpy_array` replaces `from_numpy_matrix`, which was removed in
# NetworkX 3.0; it builds the same graph from a 2D adjacency array.
ex_neighborhood_graphs = [nx.from_numpy_array(x) for x in SIM_adj[-20:-15]]

fig, ax = plt.subplots(1,5, figsize=(20,5))

for i, g in enumerate(ex_neighborhood_graphs):
    ppos = kamada_kawai_layout(g)
    nx.draw(g, ax=ax[i], node_size=5, pos=ppos)

In [None]:
# Draw the five states at positions -16 .. -12 of the trajectory.
# `nx.from_numpy_array` replaces `from_numpy_matrix`, which was removed in
# NetworkX 3.0; it builds the same graph from a 2D adjacency array.
ex_neighborhood_graphs = [nx.from_numpy_array(x) for x in SIM_adj[-16:-11]]

fig, ax = plt.subplots(1,5, figsize=(20,5))

for i, g in enumerate(ex_neighborhood_graphs):
    ppos = kamada_kawai_layout(g)
    nx.draw(g, ax=ax[i], node_size=5, pos=ppos)

In [None]:

# Draw the last 20 states of the trajectory in rows of five panels.
for j in range(-20,0,5):
    fig, ax = plt.subplots(1,5, figsize=(20,5))

    # A stop index of 0 would produce an empty slice for the final window, so
    # use an open-ended slice there. The graph list is now built only once.
    stop = j + 5 if j + 5 != 0 else None
    # `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
    ex_neighborhood_graphs = [nx.from_numpy_array(x) for x in SIM_adj[j:stop]]

    for i, g in enumerate(ex_neighborhood_graphs):
        ppos = kamada_kawai_layout(g)
        nx.draw(g, ax=ax[i], node_size=5, pos=ppos)



In [None]:
# Two 25-character strands in dot-paren notation; the first one carries two
# nested base pairs. (A one-pair variant, '...(..............)......', was
# assigned first and immediately overwritten; it is kept here only as a note.)
s1 = '...((............))......'
s2 = '.........................'

s12 = s1+s2
print(s12,len(s12))

# Adjacency matrix of the concatenated structure.
d_a2 = dot2adj(s12)
print(d_a2, d_a2.shape)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

nx.draw(g, node_size=5, pos=kamada_kawai_layout(g))

In [None]:
# Two strands in dot-paren notation, concatenated into one structure.
s1 = "....(((.....)))..((...))."
s2 = ".(.(..............).)...."
s12 = s1+s2

print(s12,len(s12))

# Adjacency matrix of the concatenated structure.
d_a2 = dot2adj(s12)
print(d_a2, d_a2.shape)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the Kamada-Kawai layout.
nx.draw(g, node_size=50, pos=kamada_kawai_layout(g),with_labels=True,font_size=10)

# Adjacency entries (23, 31) and (31, 23); the original note reads "24, 32",
# presumably the 1-based positions of the same two nucleotides.
print(d_a2[23,31], d_a2[31,23])

In [None]:
# Two strands in dot-paren notation, concatenated into one structure.
s1 = "....(((.....)))..((...))."
s2 = ".(.(..............).)...."
s12 = s1+s2

print(s12,len(s12))

# Adjacency matrix of the concatenated structure.
d_a2 = dot2adj(s12)
print(d_a2, d_a2.shape)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the circular layout.
nx.draw(g, node_size=50, pos=circular_layout(g),with_labels=True,font_size=10)

# Adjacency entries (23, 31) and (31, 23); the original note reads "24, 32",
# presumably the 1-based positions of the same two nucleotides.
print(d_a2[23,31], d_a2[31,23])

In [None]:
# Third-from-last state of the trajectory, drawn with the shell layout.
# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(SIM_adj[-3])
nx.draw(g, node_size=50, pos=shell_layout(g),with_labels=True,font_size=8)

In [None]:
# State at index 11 of the trajectory, drawn with the shell layout.
# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(SIM_adj[11])
nx.draw(g, node_size=50, pos=shell_layout(g),with_labels=True,font_size=8)

In [None]:
# Last state of the trajectory, drawn with the shell layout.
# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(SIM_adj[-1])
nx.draw(g, node_size=50, pos=shell_layout(g),with_labels=True,font_size=8)

In [None]:
# Two strands in dot-paren notation, each with its own base pairs.
s1 = '....(.....)..............'
s2 = '...((............))......'
s12 = s1+s2

print(s12,len(s12))

# Adjacency matrix of the concatenated structure.
d_a2 = dot2adj(s12)
print(d_a2, d_a2.shape)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

nx.draw(g, node_size=5, pos=kamada_kawai_layout(g))

In [None]:
# A single 50-character dot-paren structure.
s12 = "....(((.....))).(((...))..(.(..............).).).."

print(s12,len(s12))

# Adjacency matrix of the structure.
d_a2 = dot2adj(s12)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Unlabelled drawing with the Kamada-Kawai layout.
nx.draw(g, node_size=5, pos=kamada_kawai_layout(g))

In [None]:
# A single 50-character dot-paren structure.
s12 = "....(((.....))).(((...))..(.(..............).).).."

print(s12,len(s12))

# Adjacency matrix of the structure.
d_a2 = dot2adj(s12)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the shell layout.
nx.draw(g, node_size=50, pos=shell_layout(g),with_labels=True,font_size=8)

In [None]:
# A short single-strand structure, converted with the hairpin option of
# `dot2adj` enabled and the helix option disabled.
s12 = '.((..))..(..).'

print(s12,len(s12))

d_a2 = dot2adj(s12,hairpin=True,helix=False)

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the shell layout.
nx.draw(g, node_size=50, pos=shell_layout(g),with_labels=True,font_size=8)

In [None]:
# A 50-character structure converted with the helix option of `dot2adj`.
s12 = "....((((.....(.....).....))..))..(...........)...."
print(s12,len(s12))

d_a2 = dot2adj(s12,helix=True)
# Adjacency entry between positions 24 and 25, i.e. the two middle positions
# of the 50-character string.
print(s12,len(s12),d_a2[24,25])

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the Kamada-Kawai layout.
nx.draw(g, node_size=90, pos=kamada_kawai_layout(g),with_labels=True,font_size=10)

In [None]:
# Minimal 6-character example converted with the helix option of `dot2adj`.
s12 = "(.)..."

# d_a2 = dot2adj(s12,helix=False,hairpin=True)
d_a2 = dot2adj(s12,helix=True)

# Adjacency entry between the two middle positions of the string; integer
# division gives the same indices as the former int(len/2) arithmetic.
mid = len(s12) // 2
print(s12,len(s12),d_a2[mid - 1, mid])

# `nx.from_numpy_array` replaces `from_numpy_matrix` (removed in NetworkX 3.0).
g = nx.from_numpy_array(d_a2)

# Labelled drawing with the Kamada-Kawai layout.
nx.draw(g, node_size=90, pos=kamada_kawai_layout(g),with_labels=True,font_size=10)