In [None]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd 

from vida.data_processing.load_data import *
from vida.data_processing.convertor import *
from vida.data_processing.misc import *
from vida.data_processing.split_data import *

from vida.model.scatter_transform import transform_dataset, get_normalized_moments

# Import and Pre-processing Data

### 1.1 Load multiple simulated trajectories from Multistrand

In [None]:
import numpy as np

# load dataset generated by Multistrand
def read_1trj(f):
    """Parse one trajectory into parallel per-state lists.

    Every non-blank line is split on single spaces and is expected to look like

        ..((((....)))). t=0.000000103 seconds, dG=  0.26 kcal/mol 0

    i.e. field 0 is the dot-parenthesis structure, field 1 is ``t=<time>``,
    the energy is the number following the first "=" found from field 3
    onwards (and preceding "kcal"), and the last field is the paired flag.

    Args:
        f: iterable of text lines (an open text file or a list of strings).

    Returns:
        [list]: ``[S_dp, S_time, S_energy, S_pair]`` -- four lists of equal
            length holding, in input order, the structure strings, the times
            (float), the energies (float) and the paired flags (int).
            eg. [['...............'], [0.0], [-12.0], [1]]

    Raises:
        IndexError, ValueError: if a non-blank line does not follow the
            layout described above.
    """
    S_dp, S_time, S_energy, S_pair = [], [], [], []

    for line in f:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no state.
        if not line.strip():
            continue

        fields = line.split(" ")

        # The energy may be space-padded after "dG=" (as in the example above),
        # in which case fields[3] is just "dG=" and the number lives in a later
        # field.  Re-join the tail of the line and take the first token between
        # "=" and "kcal" so both the padded and the unpadded form parse.
        energy_text = " ".join(fields[3:]).split("=", 1)[1].split("kcal", 1)[0]

        s_dp = fields[0]                               # dp notation
        s_time = float(fields[1].split("=", 1)[1])     # simulation time
        s_energy = float(energy_text.split()[0])       # energy
        s_pair = int(fields[-1])                       # paired or not

        S_dp.append(s_dp)
        S_time.append(s_time)
        S_energy.append(s_energy)
        S_pair.append(s_pair)

    return [S_dp, S_time, S_energy, S_pair]


# load multiple trajectories from multiple files
def read_Gao(fpath,rxn,num_files=100):
    """Load ``num_files`` trajectory files of one reaction, grouped per quantity.

    Reads ``{fpath}/{rxn}/{rxn}-{i}.txt`` for ``i = 0 .. num_files - 1`` and
    parses each file with ``read_1trj``.

    Args:
        fpath: directory that contains one sub-directory per reaction.
        rxn: reaction name; used as sub-directory name and as file prefix.
        num_files: number of consecutively numbered files to read.

    Returns:
        tuple: ``(trajs_states, trajs_times, trajs_energies, trajs_pairs)``,
            object-dtype NumPy arrays whose first axis indexes the trajectory.
            NOTE: ``np.array(..., dtype=object)`` yields a 1-D array of lists
            when trajectories differ in length, but a 2-D array when they all
            happen to have the same length.

    Raises:
        OSError: if one of the expected files cannot be opened.
    """
    trajs_states, trajs_times, trajs_energies, trajs_pairs = [], [], [], []

    for i in range(num_files):
        STR_name = f"{fpath}/{rxn}/{rxn}-{i}.txt"
        # Context manager so the handle is released even if parsing fails;
        # previously every file stayed open until garbage collection.
        with open(STR_name, 'r') as f:
            states, times, energies, pairs = read_1trj(f)

        trajs_states.append(states)
        trajs_times.append(times)
        trajs_energies.append(energies)
        trajs_pairs.append(pairs)

    trajs_states = np.array(trajs_states, dtype=object)
    trajs_times = np.array(trajs_times, dtype=object)
    trajs_energies = np.array(trajs_energies, dtype=object)
    trajs_pairs = np.array(trajs_pairs, dtype=object)

    return trajs_states, trajs_times, trajs_energies, trajs_pairs

In [None]:
# Reaction to process and the directory that holds one sub-directory per reaction.
rxn = "Gao-P4T4"
# Plain string: the previous f-string had no placeholders.
fpath = "../raw_data/Gao-data"

# Each returned array is indexed by trajectory (one entry per input file).
trajs_states, trajs_times, trajs_energies, trajs_pairs = read_Gao(fpath,rxn)


In [None]:
# # load multiple trajectories from multiple files

# # rxn = "PT4_hairpin"
# rxn = "Gao-P4T4"
# folder_name = f"../raw_data/Gao-data/{rxn}/{rxn}"

# # define absorbing (final) state structure
# FINAL_STRUCTURE = "(((((((((((((((((((((((((+)))))))))))))))))))))))))"
# num_files = 100

# SIMS,SIMS_retrieve,SIMS_concat = load_multitrj(folder_name,FINAL_STRUCTURE,num_files)

# print("SIMS: ", len(SIMS))
# print("SIMS_retrieve: ", SIMS_retrieve.shape)
# print("SIMS_concat: ", len(SIMS_concat))

In [None]:
def process_gao(dp_og):
    """Return a copy of the structures with the "+" separator removed.

    Joins the two strands of each structure into one string by deleting "+".

    Args:
        dp_og: mutable sequence (list or NumPy array) of dot-parenthesis
            strings.  It is not modified.

    Returns:
        A deep copy of ``dp_og`` (same container type) in which every "+"
        has been removed from each string.
    """
    # Imported locally: the notebook never imports `copy` itself, so the
    # function previously worked only if a star import happened to expose it.
    import copy

    dp = copy.deepcopy(dp_og)
    for i, struct in enumerate(dp):
        dp[i] = struct.replace("+", "")

    return dp


def process_hata(dp_og):
    """Strip the strand separator from each structure and record which one it was.

    A structure containing "&" gets label 0 and a structure containing "+"
    gets label 1 (presumably "not paired" / "paired" -- confirm against the
    dataset description).  The separator is removed in both cases.

    Args:
        dp_og: mutable sequence (list or NumPy array) of dot-parenthesis
            strings, each containing exactly one kind of separator.  It is
            not modified.

    Returns:
        tuple: ``(dp, dp_pair)`` -- NumPy array of the separator-free strings
            and NumPy array of the 0/1 labels, aligned element by element.

    Raises:
        ValueError: if a structure contains neither or both separators.  The
            previous implementation then emitted zero or two labels for that
            structure, silently misaligning ``dp_pair`` with ``dp``.
    """
    # Imported locally: the notebook never imports `copy` itself, so the
    # function previously worked only if a star import happened to expose it.
    import copy

    dp = copy.deepcopy(dp_og)
    dp_pair = []
    for i, struct in enumerate(dp):
        has_amp = "&" in struct
        has_plus = "+" in struct
        if has_amp == has_plus:
            raise ValueError(
                f"structure {i} ({struct!r}) must contain exactly one of '&' or '+'"
            )

        if has_amp:
            dp[i] = struct.replace("&", "")
            dp_pair.append(0)
        else:
            dp[i] = struct.replace("+", "")
            dp_pair.append(1)

    return np.array(dp), np.array(dp_pair)


def concat_gao(states, times, energies, pairs):
    """Concatenate all trajectories of the Gao dataset into flat arrays.

    Args:
        states: per-trajectory sequences of dot-parenthesis strings.
        times: per-trajectory sequences of times, indexed like ``states``.
        energies: per-trajectory sequences of energies, indexed like ``states``.
        pairs: per-trajectory sequences of paired flags, indexed like ``states``.

    Returns:
        tuple: ``(SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G, SIMS_T)`` where
            ``SIMS_dp`` holds the structures after ``process_gao`` and
            ``SIMS_dp_og`` the structures as given; every array is the
            concatenation over trajectories in input order.
    """
    traj_ids = range(len(states))

    # Collect one chunk per trajectory for each quantity, indexing explicitly so
    # that a shorter companion sequence still fails loudly.
    dp_chunks = [process_gao(states[k]) for k in traj_ids]
    og_chunks = [states[k] for k in traj_ids]
    time_chunks = [times[k] for k in traj_ids]
    energy_chunks = [energies[k] for k in traj_ids]
    pair_chunks = [pairs[k] for k in traj_ids]

    SIMS_dp = np.concatenate(dp_chunks)
    SIMS_dp_og = np.concatenate(og_chunks)
    SIMS_pair = np.concatenate(pair_chunks)
    SIMS_G = np.concatenate(energy_chunks)
    SIMS_T = np.concatenate(time_chunks)

    return SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G, SIMS_T


def concat_hata(states, times, energies):
    """Concatenate all trajectories of the Hata dataset into flat arrays.

    Args:
        states: per-trajectory sequences of dot-parenthesis strings.
        times: per-trajectory sequences of times, indexed like ``states``.
        energies: per-trajectory sequences of energies, indexed like ``states``.

    Returns:
        tuple: ``(SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G, SIMS_T)`` where
            ``SIMS_dp`` and ``SIMS_pair`` come from ``process_hata`` and
            ``SIMS_dp_og`` holds the structures as given; every array is the
            concatenation over trajectories in input order.
    """
    dp_chunks, og_chunks, pair_chunks, energy_chunks, time_chunks = [], [], [], [], []

    for k in range(len(states)):
        # process_hata returns the cleaned structures and their 0/1 labels.
        cleaned, labels = process_hata(states[k])

        dp_chunks.append(cleaned)
        og_chunks.append(states[k])
        pair_chunks.append(labels)
        energy_chunks.append(energies[k])
        time_chunks.append(times[k])

    return (
        np.concatenate(dp_chunks),
        np.concatenate(og_chunks),
        np.concatenate(pair_chunks),
        np.concatenate(energy_chunks),
        np.concatenate(time_chunks),
    )


def get_uniq(SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G):
    """Reduce the concatenated data to unique structures.

    Args:
        SIMS_dp: 1-D NumPy array of structure strings; uniqueness is decided
            on this array.
        SIMS_dp_og: NumPy array aligned with ``SIMS_dp``.
        SIMS_pair: NumPy array aligned with ``SIMS_dp``.
        SIMS_G: NumPy array aligned with ``SIMS_dp``.

    Returns:
        tuple: ``(SIMS_dp_uniq, SIMS_dp_og_uniq, SIMS_pair_uniq, SIMS_G_uniq,
            indices_S, coord_id_S)`` where

            * ``indices_S`` holds, for each unique structure (in the sorted
              order produced by ``np.unique``), the index of its first
              occurrence in ``SIMS_dp``;
            * the ``*_uniq`` arrays are the inputs taken at ``indices_S``;
            * ``coord_id_S`` (int, one entry per input state) maps every state
              to its row in the unique arrays, so that
              ``SIMS_dp_uniq[coord_id_S]`` reproduces ``SIMS_dp``.
    """
    # One call yields both the first-occurrence indices and the inverse
    # mapping; this replaces a Python loop that rescanned the full array once
    # per unique structure.
    _, indices_S, inverse = np.unique(SIMS_dp, return_index=True, return_inverse=True)

    SIMS_dp_uniq = SIMS_dp[indices_S]
    SIMS_dp_og_uniq = SIMS_dp_og[indices_S]
    SIMS_pair_uniq = SIMS_pair[indices_S]
    SIMS_G_uniq = SIMS_G[indices_S]

    # index to recover all data from the unique data (flat, integer dtype)
    coord_id_S = np.asarray(inverse).reshape(-1).astype(int)

    return SIMS_dp_uniq, SIMS_dp_og_uniq, SIMS_pair_uniq, SIMS_G_uniq, indices_S, coord_id_S


def label_struc(trajs_types, SIMS_dp_og_uniq):
    """Look up the structural type of every unique structure.

    Args:
        trajs_types: mapping indexed by structure (``trajs_types[structure]``)
            that gives the type label of that structure.
        SIMS_dp_og_uniq: sequence of structures to look up.

    Returns:
        np.ndarray: the looked-up labels, in the order of ``SIMS_dp_og_uniq``.

    Raises:
        KeyError: if ``trajs_types`` is a dict and a structure is missing from it.
    """
    labels = [trajs_types[structure] for structure in SIMS_dp_og_uniq]
    return np.array(labels)
    


In [None]:
# Flatten the per-trajectory arrays loaded by read_Gao into one array per quantity.
SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G, SIMS_T = concat_gao(trajs_states, trajs_times, trajs_energies, trajs_pairs)

# Reduce to unique structures: indices_S points at the first occurrence of each unique
# structure in SIMS_dp, coord_id_S maps every state back to its unique-structure row.
SIMS_dp_uniq, SIMS_dp_og_uniq, SIMS_pair_uniq, SIMS_G_uniq, indices_S, coord_id_S = get_uniq(SIMS_dp, SIMS_dp_og, SIMS_pair, SIMS_G)

In [None]:
# Display the shapes of the index maps and the unique-structure arrays returned by get_uniq.
coord_id_S.shape, SIMS_dp_uniq.shape, SIMS_dp_og_uniq.shape, SIMS_pair_uniq.shape, SIMS_G_uniq.shape, indices_S.shape

In [None]:
import os

# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
# Within this cell only its basename is used -- consider building it from a configurable
# data directory instead.
inpath = "/Users/chenwei/Desktop/Github/ViDa/data/preprocess_data/preprocess_Gao-P4T4.pkl.gz"
# Keep just the final path component (the file name).
file_name = os.path.basename(inpath)

In [None]:
# Display the file name extracted from inpath.
file_name

### 2. Convert dot-paren to adjacency matrix

In [None]:
""" Dimenstions of SIM_adj list 
SIM_adj: N*m*m
    N: number of states in the trajectory
    m: number of nucleotides in the state (strand)
"""
# get multiple trajectories' data
# NOTE(review): SIMS_concat is only assigned in the commented-out load_multitrj cell
# earlier in this notebook, so on a fresh kernel this presumably raises NameError unless
# one of the star imports provides it -- confirm.
# NOTE(review): this rebinds SIMS_G, SIMS_T and SIMS_pair, replacing the arrays that
# concat_gao produced earlier.
SIMS_adj, SIMS_G, SIMS_T, SIMS_HT, SIMS_pair, trj_id = sim_adj(SIMS_concat)
print(SIMS_adj.shape,SIMS_G.shape,SIMS_T.shape,SIMS_HT.shape,SIMS_pair.shape,trj_id.shape)

### 3.1 Get unique data except holding time

In [None]:
# get unique states adjacency matrix with their occupancy density, get unique energy, time, if paired;
# and their corresponding indices
# NOTE(review): get_unique is not defined in this notebook (presumably supplied by one of
# the star imports) and is a different function from the local get_uniq.
# NOTE(review): this rebinds indices_S, SIMS_G_uniq and SIMS_pair_uniq, replacing the
# values returned by get_uniq earlier; it also depends on SIMS_concat (see the commented-out
# load_multitrj cell).

# multiple trajectories
indices_S,occ_density_S,SIMS_adj_uniq,SIMS_G_uniq,SIMS_pair_uniq \
     = get_unique(SIMS_concat,SIMS_adj,SIMS_G,SIMS_pair) 
print(indices_S.shape, occ_density_S.shape, SIMS_adj_uniq.shape,SIMS_G_uniq.shape,SIMS_pair_uniq.shape)

### 3.2. Get labeled trajectory data

In [None]:
# get trajectory data with its corresponding labels (multiple trajectories)
# NOTE(review): SIMS and SIMS_retrieve are only assigned in the commented-out
# load_multitrj cell earlier in this notebook -- confirm they exist on a fresh kernel.
SIMS_dict = label_structures(SIMS,indices_S)
# The last column of SIMS_dict is used as the state -> unique-structure index;
# this rebinds coord_id_S, replacing the value returned by get_uniq earlier.
coord_id_S = SIMS_dict[:,-1].astype(int)
# Rows of SIMS at the unique-structure indices.
SIMS_dict_uniq = np.array(SIMS)[indices_S]
print(SIMS_dict.shape, coord_id_S.shape, SIMS_dict_uniq.shape)

# find the structure having the largest occupancy density
print(SIMS_retrieve[indices_S[occ_density_S.argmax()]])

### 3.3 Get unique holding time for each state

In [None]:
# get unique holding time of unique states
# NOTE(review): mean_holdingtime is not defined in this notebook (presumably supplied by
# one of the star imports); SIMS_HT comes from the sim_adj cell above.
SIMS_HT_uniq = mean_holdingtime(SIMS_HT, indices_S, coord_id_S)
print(SIMS_HT_uniq.shape)

### 4. Convert adjacency matrix to scattering coefficients

#### SIMS_scar_uniq

In [None]:
# Multiple trajectories: scattering transform of the unique adjacency matrices,
# then normalized moments with singleton dimensions squeezed out.
scat_coeff_array_S = transform_dataset(SIMS_adj_uniq)
SIMS_scar_uniq = get_normalized_moments(scat_coeff_array_S).squeeze()

# get SIMS_scar based on SIMS_scar_uniq
# (coord_id_S expands the unique rows back to one row per state)
SIMS_scar = SIMS_scar_uniq[coord_id_S]

In [None]:
# print(SIMS_scar.shape, (np.unique(SIMS_scar,axis=0)).shape)
# Compare the number of rows with the number of distinct rows: a smaller second shape
# means different unique structures share identical scattering features.
print(SIMS_scar_uniq.shape, (np.unique(SIMS_scar_uniq,axis=0)).shape)

In [None]:
# # For large trajectories states
# SIMS_scar_uniq1 = get_normalized_moments(transform_dataset(SIMS_adj_uniq[:60000])).squeeze()
# SIMS_scar_uniq2 = get_normalized_moments(transform_dataset(SIMS_adj_uniq[60000:])).squeeze()
# SIMS_scar_uniq = np.concatenate((SIMS_scar_uniq1,SIMS_scar_uniq2))

# # get SIMS_scar based on SIMS_scar_uniq
# SIMS_scar = SIMS_scar_uniq[coord_id_S]

# print(SIMS_scar.shape, (np.unique(SIMS_scar,axis=0)).shape)
# print(SIMS_scar_uniq.shape, (np.unique(SIMS_scar_uniq,axis=0)).shape)

In [None]:
""" Save/load all obtained data to npz file for python,
    Multiple trajectories
"""
# # save for python
# fnpz_data = "data/vida_data/helix_assoc/helix_assoc_{}_multrj_100epoch_py_temp.npz".format(SEQ)
# with open(fnpz_data, 'wb') as f:
#     np.savez(f,
#             # SIMS data
#             SIMS_T=SIMS_T, SIMS_HT=SIMS_HT, SIMS_HT_uniq=SIMS_HT_uniq,
#             SIMS_adj_uniq=SIMS_adj_uniq, SIMS_scar_uniq=SIMS_scar_uniq,
#             SIMS_G_uniq=SIMS_G_uniq, SIMS_pair_uniq=SIMS_pair_uniq,
#             SIMS_dict=SIMS_dict, SIMS_dict_uniq=SIMS_dict_uniq,
#             # Indices
#             coord_id_S=coord_id_S, indices_S=indices_S,trj_id=trj_id, occ_density_S=occ_density_S,
#             # # embed data and occpancy density
#             # data_embed=data_embed,
#             # # plotting data
#             # pca_coords=pca_coords, pca_all_coords=pca_all_coords,
#             # phate_coords=phate_coords, phate_all_coords=phate_all_coords,
#             # umap_coord_2d=umap_coord_2d, umap_all_coord_2d=umap_all_coord_2d,
#             # umap_coord_3d=umap_coord_3d, umap_all_coord_3d=umap_all_coord_3d,
#             # tsne_coord_2d=tsne_coord_2d, tsne_all_coord_2d=tsne_all_coord_2d,
#             # tsne_coord_3d=tsne_coord_3d, tsne_all_coord_3d=tsne_all_coord_3d,
#             )
    
# # multiple trajectories
# fnpz_data = "data/vida_data/helix_assoc/helix_assoc_PT4_multrj_100epoch_py_temp.npz"
# data_npz = np.load(fnpz_data)

# # asssign data to variables
# for var in data_npz.files:
#      locals()[var] = data_npz[var]

# # recover full data based on coord_id, indices, and unique data
# SIMS_adj = SIMS_adj_uniq[coord_id_S]
# SIMS_scar = SIMS_scar_uniq[coord_id_S]
# SIMS_G = SIMS_G_uniq[coord_id_S]
# SIMS_pair = SIMS_pair_uniq[coord_id_S]


# print(SIMS_T.shape,SIMS_HT.shape,SIMS_HT_uniq.shape)
# print(SIMS_adj.shape,SIMS_scar.shape,SIMS_G.shape,SIMS_HT.shape,SIMS_pair.shape)
# print(SIMS_adj_uniq.shape,SIMS_scar_uniq.shape,SIMS_G_uniq.shape,SIMS_pair_uniq.shape) 
# print(SIMS_dict.shape,SIMS_dict_uniq.shape)
# print(coord_id_S.shape,indices_S.shape,trj_id.shape,occ_density_S.shape)

In [None]:
"""Shape of split data
    train_data: [tr_adjs, tr_coeffs, tr_energies]
    test_data: [te_adjs, te_coeffs, te_energies]
"""
# Split the unique adjacency matrices, scattering features and energies into train/test sets.
# NOTE(review): split_data is not defined in this notebook (presumably supplied by the
# vida.data_processing.split_data star import) -- split ratio and shuffling are not visible here.
train_data,test_data = split_data(SIMS_adj_uniq,SIMS_scar_uniq,SIMS_G_uniq)  # multiple trj