In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd 

from vida.data_processing.load_data import *
from vida.data_processing.convertor import *
from vida.data_processing.misc import *
from vida.data_processing.split_data import *

from vida.model.scatter_transform import transform_dataset, get_normalized_moments

# Import and Pre-processing Data

### 1.1 Load multiple simulated trajectories from Multistrand

In [None]:
# Load multiple trajectories from multiple files.
# SEQ selects the dataset: it is substituted twice into the folder path below.
SEQ = "PT4_hairpin"
# SEQ = "PT4"

folder_name = "data/raw_data/helix_assoc/helix_assoc_{}/assoc_{}_1sim_20C".format(SEQ,SEQ)

# define absorbing (final) state structure
FINAL_STRUCTURE = "(((((((((((((((((((((((((+)))))))))))))))))))))))))"
# presumably the number of trajectory files to read from folder_name -- confirm in load_multitrj
num_files = 100

# NOTE: SIMS_concat is assigned again by the concat_all(...) call in section 1.2;
# if both loading sections are run, the value from this cell is replaced.
SIMS,SIMS_retrieve,SIMS_concat = load_multitrj(folder_name,FINAL_STRUCTURE,num_files)

# Quick size check of the three loaded collections
print("SIMS: ", len(SIMS))
print("SIMS_retrieve: ", SIMS_retrieve.shape)
print("SIMS_concat: ", len(SIMS_concat))

### 1.2 Load multiple simulated trajectories from CTMC

In [None]:
# 1k trajectories, each of which has multiple states
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickle files that come from a trusted source.
with open('./data/jordan/trajs_reaction28_1000_sims.pkl', 'rb') as fp:
    trajs = pickle.load(fp)
    print('loaded trajectories dictionary')

# Pull the three per-trajectory collections out of the loaded dictionary
trajs_states = trajs["trajs_states"]
trajs_times = trajs["trajs_times"]
trajs_energies = trajs["trajs_energies"]

# Shown as the cell output: the number of entries in trajs_states and trajs_times,
# and the length of the first entry of trajs_energies
len(trajs_states), len(trajs_times), len(trajs_energies[0])

#### Concatenate all structures

In [None]:
# Merge the two strands of each dot-paren structure into a single string
def concat_helix_structures(dp):
    """Remove strand separators from dot-paren structures and flag each one.

    Every "&" and "+" character is removed from each structure, and exactly
    one flag is recorded per structure so both outputs always line up
    index-for-index.

    Args:
        dp: sequence of dot-paren strings. It is not modified.
    Returns:
        (dp_concat, dp_pair) as numpy arrays of equal length:
            dp_concat: the structures with "&" and "+" removed.
            dp_pair: 1 where the structure contained "+", otherwise 0.
    """
    dp_concat = []
    dp_pair = []
    for structure in dp:
        # One flag per structure keeps dp_pair aligned with dp_concat even for
        # an entry that has no separator or has both.
        # NOTE(review): "+" is treated as the paired marker and everything
        # else as unpaired -- confirm this is right for separator-free input.
        dp_pair.append(1 if "+" in structure else 0)
        # Strings are immutable, so building a new list replaces the deep copy
        # and leaves the caller's data untouched.
        dp_concat.append(structure.replace("&", "").replace("+", ""))

    return np.array(dp_concat), np.array(dp_pair)

# Concatenate the structures of the selected trajectories into one flat list
def concat_all(states, times, energies, num_trajs=100):
    """Flatten several trajectories into a single list of per-state rows.

    For each selected trajectory the structures are passed through
    concat_helix_structures, and the results are stacked column-wise with that
    trajectory's times and energies.

    Args:
        states: per-trajectory sequences of dot-paren structures.
        times: per-trajectory sequences, indexed like `states`.
        energies: per-trajectory sequences, indexed like `states`.
        num_trajs: how many leading trajectories to use. Defaults to 100, the
            previously hard-coded value; pass None to use all of them. Values
            larger than len(states) are clamped rather than raising IndexError.
    Returns:
        list of rows [structure, time, energy, pair flag], as produced by
        np.stack(...).tolist(), for all selected trajectories in order.
    """
    if num_trajs is None:
        n_use = len(states)
    else:
        n_use = min(num_trajs, len(states))

    SIMS_concat = []
    for i in range(n_use):
        sims_dp, sims_pair = concat_helix_structures(states[i])
        # axis=1 makes one row per state with the four values side by side
        SIMS_stack = np.stack([sims_dp, times[i], energies[i], sims_pair], axis=1)
        SIMS_concat.extend(SIMS_stack.tolist())
    return SIMS_concat

In [None]:
# Build the flat list of per-state rows from the CTMC trajectories.
# NOTE: this rebinds SIMS_concat, replacing the value returned by load_multitrj
# in section 1.1 when that cell was also run.
SIMS_concat = concat_all(trajs_states, trajs_times, trajs_energies)
len(SIMS_concat)

### 2. Convert dot-paren to adjacency matrix

In [None]:
""" Dimenstions of SIM_adj list 
SIM_adj: N*m*m
    N: number of states in the trajectory
    m: number of nucleotides in the state (strand)
"""
# Get the data for multiple trajectories: sim_adj takes the flat SIMS_concat
# list and returns six arrays, whose shapes are printed as a sanity check.
SIMS_adj, SIMS_G, SIMS_T, SIMS_HT, SIMS_pair, trj_id = sim_adj(SIMS_concat)
print(SIMS_adj.shape,SIMS_G.shape,SIMS_T.shape,SIMS_HT.shape,SIMS_pair.shape,trj_id.shape)

### 3.1 Get unique data except holding time

In [None]:
# Get the unique states' adjacency matrices with their occupancy density,
# plus the unique energies and pair flags, and their corresponding indices.
# The holding time is handled separately in section 3.3.

# multiple trajectories
indices_S,occ_density_S,SIMS_adj_uniq,SIMS_G_uniq,SIMS_pair_uniq \
     = get_unique(SIMS_concat,SIMS_adj,SIMS_G,SIMS_pair) 
# Shapes of the five returned arrays, as a sanity check
print(indices_S.shape, occ_density_S.shape, SIMS_adj_uniq.shape,SIMS_G_uniq.shape,SIMS_pair_uniq.shape)

### 3.2. Get labeled trajectory data

In [None]:
# Get the trajectory data together with its corresponding labels
# (multiple trajectories).
# NOTE(review): indices_S comes from get_unique(SIMS_concat, ...), whereas SIMS
# and SIMS_retrieve are only assigned by load_multitrj in section 1.1. If
# SIMS_concat was rebuilt from the CTMC trajectories in section 1.2, these
# indices may not refer to the same records -- confirm which loader this cell
# is meant to follow.
SIMS_dict = label_structures(SIMS,indices_S)
# The last column of SIMS_dict is read as the integer id of each state
coord_id_S = SIMS_dict[:,-1].astype(int)
SIMS_dict_uniq = np.array(SIMS)[indices_S]
print(SIMS_dict.shape, coord_id_S.shape, SIMS_dict_uniq.shape)

# find the structure having the largest occupancy density
print(SIMS_retrieve[indices_S[occ_density_S.argmax()]])

### 3.3 Get unique holding time for each state

In [None]:
# Get one holding time per unique state.
# presumably the mean over all visits to that state, going by the helper's
# name -- confirm in mean_holdingtime
SIMS_HT_uniq = mean_holdingtime(SIMS_HT, indices_S, coord_id_S)
print(SIMS_HT_uniq.shape)

### 4. Convert adjacency matrix to scattering coefficients

#### SIMS_scar_uniq

In [None]:
# Multiple trajectories: compute scattering features for the unique states only
scat_coeff_array_S = transform_dataset(SIMS_adj_uniq)
# NOTE(review): .squeeze() removes every length-1 axis; if SIMS_adj_uniq ever
# holds a single state this would presumably drop the state axis as well and
# change what the indexing below selects -- confirm.
SIMS_scar_uniq = get_normalized_moments(scat_coeff_array_S).squeeze()

# get SIMS_scar based on SIMS_scar_uniq: coord_id_S indexes the unique rows,
# so every visited state receives the features of its unique representative
SIMS_scar = SIMS_scar_uniq[coord_id_S]

In [None]:
# print(SIMS_scar.shape, (np.unique(SIMS_scar,axis=0)).shape)
# Compare the shape of SIMS_scar_uniq with the shape of its distinct rows;
# fewer distinct rows means some unique states share identical features.
print(SIMS_scar_uniq.shape, (np.unique(SIMS_scar_uniq,axis=0)).shape)

In [None]:
# # For large trajectories states
# SIMS_scar_uniq1 = get_normalized_moments(transform_dataset(SIMS_adj_uniq[:60000])).squeeze()
# SIMS_scar_uniq2 = get_normalized_moments(transform_dataset(SIMS_adj_uniq[60000:])).squeeze()
# SIMS_scar_uniq = np.concatenate((SIMS_scar_uniq1,SIMS_scar_uniq2))

# # get SIMS_scar based on SIMS_scar_uniq
# SIMS_scar = SIMS_scar_uniq[coord_id_S]

# print(SIMS_scar.shape, (np.unique(SIMS_scar,axis=0)).shape)
# print(SIMS_scar_uniq.shape, (np.unique(SIMS_scar_uniq,axis=0)).shape)

In [None]:
""" Save/load all obtained data to npz file for python,
    Multiple trajectories
"""
# # save for python
# fnpz_data = "data/vida_data/helix_assoc/helix_assoc_{}_multrj_100epoch_py_temp.npz".format(SEQ)
# with open(fnpz_data, 'wb') as f:
#     np.savez(f,
#             # SIMS data
#             SIMS_T=SIMS_T, SIMS_HT=SIMS_HT, SIMS_HT_uniq=SIMS_HT_uniq,
#             SIMS_adj_uniq=SIMS_adj_uniq, SIMS_scar_uniq=SIMS_scar_uniq,
#             SIMS_G_uniq=SIMS_G_uniq, SIMS_pair_uniq=SIMS_pair_uniq,
#             SIMS_dict=SIMS_dict, SIMS_dict_uniq=SIMS_dict_uniq,
#             # Indices
#             coord_id_S=coord_id_S, indices_S=indices_S,trj_id=trj_id, occ_density_S=occ_density_S,
#             # # embed data and occupancy density
#             # data_embed=data_embed,
#             # # plotting data
#             # pca_coords=pca_coords, pca_all_coords=pca_all_coords,
#             # phate_coords=phate_coords, phate_all_coords=phate_all_coords,
#             # umap_coord_2d=umap_coord_2d, umap_all_coord_2d=umap_all_coord_2d,
#             # umap_coord_3d=umap_coord_3d, umap_all_coord_3d=umap_all_coord_3d,
#             # tsne_coord_2d=tsne_coord_2d, tsne_all_coord_2d=tsne_all_coord_2d,
#             # tsne_coord_3d=tsne_coord_3d, tsne_all_coord_3d=tsne_all_coord_3d,
#             )
    
# # multiple trajectories
# fnpz_data = "data/vida_data/helix_assoc/helix_assoc_PT4_multrj_100epoch_py_temp.npz"
# data_npz = np.load(fnpz_data)

# # assign data to variables
# for var in data_npz.files:
#      locals()[var] = data_npz[var]

# # recover full data based on coord_id, indices, and unique data
# SIMS_adj = SIMS_adj_uniq[coord_id_S]
# SIMS_scar = SIMS_scar_uniq[coord_id_S]
# SIMS_G = SIMS_G_uniq[coord_id_S]
# SIMS_pair = SIMS_pair_uniq[coord_id_S]


# print(SIMS_T.shape,SIMS_HT.shape,SIMS_HT_uniq.shape)
# print(SIMS_adj.shape,SIMS_scar.shape,SIMS_G.shape,SIMS_HT.shape,SIMS_pair.shape)
# print(SIMS_adj_uniq.shape,SIMS_scar_uniq.shape,SIMS_G_uniq.shape,SIMS_pair_uniq.shape) 
# print(SIMS_dict.shape,SIMS_dict_uniq.shape)
# print(coord_id_S.shape,indices_S.shape,trj_id.shape,occ_density_S.shape)

In [None]:
"""Shape of split data
    train_data: [tr_adjs, tr_coeffs, tr_energies]
    test_data: [te_adjs, te_coeffs, te_energies]
"""
# Split the unique-state arrays (adjacency, scattering features, energies)
# into train and test sets.
train_data,test_data = split_data(SIMS_adj_uniq,SIMS_scar_uniq,SIMS_G_uniq)  # multiple trj