In [None]:
!pip install import-ipynb

import import_ipynb
from gensim.models import Word2Vec
import numpy as np
import gc
from sklearn.model_selection import KFold
from tqdm import tqdm
import os,logging,pickle,random,torch
from matplotlib import pyplot
import pandas as pd
from scipy import stats
import keras
import h5py
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, Activation, LSTM, GRU, SimpleRNN, Conv1D, TimeDistributed, MaxPooling1D, Flatten, Dropout, Input, AveragePooling1D, Add, Concatenate
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import drive

datadir = 'Dataset/embedded_data'
drive.mount('/content/drive', force_remount=True)

%cd "drive/MyDrive/Bionformatics_Project/Colab"

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1he9S1Es-XalZ9vgCRMbTTNd6cXeRkkFI/Bionformatics_Project/Colab


# Calculate the embedded dataset for sequences with half-life

Load the sequences

In [None]:
from Classes.DataManager import DataManager
print("> Loading DataManager...")
dm = DataManager(transformer=True, micro = False, tf = False)

print("> Loading train data...")
X_trainhalflife, X_trainpromoter, y_train, geneName_train, _ = dm.get_train(True, True)

print("> Loading validation data...")
X_validationhalflife, X_validationpromoter, y_validation, geneName_valid, _ = dm.get_validation(True, True)

print("> Loading test data...")
X_testhalflife, X_testpromoter, y_test, geneName_test, _ = dm.get_test(True, True)

# Keep only the samples whose promoter codes are all < 4, and apply the same
# row mask to every array aligned with the promoters.
# NOTE(review): presumably codes >= 4 mark non-ACGT bases - confirm against DataManager.
X_trainpromoter = np.array(X_trainpromoter)

mask_tr = np.all(X_trainpromoter < 4, axis = 1)
X_trainpromoter = X_trainpromoter[mask_tr]
X_trainhalflife = X_trainhalflife[mask_tr]
geneName_train = geneName_train[mask_tr]
y_train = y_train[mask_tr]

# The validation split gets the same filter as train/test: the Word2Vec
# vocabulary is built from the filtered training promoters only, so a 3-mer
# containing a code >= 4 has no vector to look up.
X_validationpromoter = np.array(X_validationpromoter)

mask_val = np.all(X_validationpromoter < 4, axis = 1)
X_validationpromoter = X_validationpromoter[mask_val]
X_validationhalflife = np.asarray(X_validationhalflife)[mask_val]
geneName_valid = np.asarray(geneName_valid)[mask_val]
y_validation = np.asarray(y_validation)[mask_val]

X_testpromoter = np.array(X_testpromoter)

mask_te = np.all(X_testpromoter < 4, axis = 1)
X_testpromoter = X_testpromoter[mask_te]
X_testhalflife = X_testhalflife[mask_te]
geneName_test = geneName_test[mask_te]
y_test = y_test[mask_te]

importing Jupyter notebook from /content/drive/.shortcut-targets-by-id/1he9S1Es-XalZ9vgCRMbTTNd6cXeRkkFI/Bionformatics_Project/Colab/Classes/DataManager.ipynb
> Loading DataManager...
> Loading train data...
> Loading validation data...
> Loading test data...


Basic algorithm functions

In [None]:
def chunks(l, n=50):
    """Split ``l`` into consecutive slices of at most ``n`` items.

    A chunk size below 1 is clamped to 1. The final slice is shorter when
    ``len(l)`` is not a multiple of the chunk size, and an empty ``l`` gives
    an empty list. Each piece is whatever slicing ``l`` produces.
    """
    step = max(1, n)
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

def segmentation_train(seq):
  """Tokenise ``seq`` into its overlapping 3-item windows (stride 1).

  Each window of three consecutive items becomes one string, built by
  concatenating ``str()`` of every item. A sequence with fewer than three
  items yields an empty list.
  """
  tokens = []
  for start in range(len(seq) - 2):
    window = seq[start:start + 3]
    tokens.append(''.join(map(str, window)))
  return tokens

def segmentation(seq):
  """Embed ``seq`` as the mean Word2Vec vector of its overlapping 3-item tokens.

  The sequence is cut into stride-1 windows of three items, each window is
  joined into a string token, every token is looked up in the module-level
  ``w2v`` model, and the vectors are averaged over the token axis.

  Returns the averaged vector as a NumPy array. A token missing from the
  model vocabulary makes the lookup raise.
  """
  tokens = [''.join(str(item) for item in seq[j:j+3]) for j in range(len(seq)-2)]
  # Go through `.wv`: indexing the model object itself is deprecated in
  # gensim 3.x and removed in gensim 4.x, while `.wv[...]` works in both.
  vectors = [w2v.wv[token] for token in tokens]
  # TODO (original author): work out whether plain mean pooling is good enough
  # or whether a dedicated layer is needed.
  y = np.array(vectors).mean(axis=0)
  return y

def full_map(x):
  """Embed one sequence as a stack of per-chunk vectors.

  Only positions 3000 to 13500 (end exclusive) of ``x`` are used. That window
  is cut into pieces by ``chunks`` (default chunk size), each piece is reduced
  to one vector by ``segmentation``, and the vectors are stacked along a new
  leading axis.
  """
  window = x[3000:13500]
  pooled = [segmentation(piece) for piece in chunks(window)]
  return np.stack(pooled)

def embeddings(x):
  """Apply ``full_map`` to every sequence in ``x`` and collect the results.

  Returns ``np.array`` built from the list of per-sequence results, in the
  same order as ``x``.
  """
  # Wrap `x` itself rather than a `map` object: tqdm can then read len(x) and
  # show a total and ETA; inputs without a length still work, just without a total.
  data = np.array([full_map(row) for row in tqdm(x)])
  return data

Create the Word2Vec model

In [None]:
class _KmerSentences:
  """Restartable, lazy iterable of tokenised promoter sequences.

  Word2Vec iterates over its corpus once to build the vocabulary and again
  for every training epoch. A bare `map` object is exhausted after the first
  pass, which leaves the training passes with no data. Each `iter()` call here
  starts a fresh pass, and the tokens are still produced on the fly instead of
  being held in memory all at once.
  """

  def __init__(self, sequences):
    self.sequences = sequences

  def __iter__(self):
    return map(segmentation_train, self.sequences)


# Skip-gram model (sg = 1) with 64-dimensional vectors, trained on the 3-item
# tokens of the training promoters; min_count=0 keeps every token seen.
w2v = Word2Vec(sentences=_KmerSentences(X_trainpromoter), size=64, window=5, min_count=0, workers=4, sg = 1, iter = 10)
w2v.save("Dataset/embedded_data/word2vec.model")

Calculate the new embedding

In [None]:
# Reload the persisted model so this cell does not depend on the training
# cell having been run in the same session.
w2v = Word2Vec.load("Dataset/embedded_data/word2vec.model")

# Embed the three splits in the order train, validation, test.
X_train, X_validation, X_test = [
    embeddings(split)
    for split in (X_trainpromoter, X_validationpromoter, X_testpromoter)
]

Save the new data

In [None]:
# Write each split to its own HDF5 file with the datasets 'promoter',
# 'halflife' and 'label'. The context manager closes the file even when a
# write fails, so a failed run leaves no open handle behind.
for h5_path, promoter, halflife, label in (
    ('Dataset/embedded_data/etrain.h5', X_train, X_trainhalflife, y_train),
    ('Dataset/embedded_data/etest.h5', X_test, X_testhalflife, y_test),
    ('Dataset/embedded_data/evalidation.h5', X_validation, X_validationhalflife, y_validation),
):
    with h5py.File(h5_path, 'w') as h5f:
        h5f.create_dataset('promoter', data=promoter)
        h5f.create_dataset('halflife', data=halflife)
        h5f.create_dataset('label',    data=label)

# Calculate the embedded dataset for sequences with half-life and transcription factors

Load the data and align it

In [None]:
import json

# NOTE(review): the first section imports this class from `Classes.DataManager`;
# confirm which module path is valid from the notebook's working directory.
from DataManager import DataManager
print("> Loading DataManager...")
dm = DataManager(transformer=True)

print("> Loading train data...")
X_trainhalflife, X_trainpromoter, y_train, geneName_train,_ = dm.get_train(True, True)

print("> Loading validation data...")
X_validationhalflife, X_validationpromoter, y_validation, geneName_valid,_ = dm.get_validation(True, True)

print("> Loading test data...")
X_testhalflife, X_testpromoter, y_test, geneName_test,_ = dm.get_test(True, True)

# Transcription-factor table; column 0 holds the gene identifier and column
# 'TF' a JSON-encoded array per gene (both are read below).
tf = pd.read_excel('transcription_factor2.xlsx')

# Pool the three original splits (train, validation, test order) so that the
# data can be re-split after matching against the TF table.
J_promoter = np.concatenate((X_trainpromoter, X_validationpromoter, X_testpromoter), axis=0)
J_halflife = np.concatenate((X_trainhalflife, X_validationhalflife, X_testhalflife), axis=0)
J_label = np.concatenate((y_train, y_validation, y_test), axis=0)
J_genes = np.concatenate((geneName_train, geneName_valid, geneName_test), axis=0)
J_genes = np.array([gene.decode('UTF-8') for gene in J_genes])

# Keep only the genes that have a row in the TF table.
mask_J = np.isin(J_genes, tf[0])
J_halflife = J_halflife[mask_J]
J_genes = J_genes[mask_J]
J_promoter = J_promoter[mask_J]
J_label =J_label[mask_J]

# Sort every pooled array by gene name using one shared permutation.
a = J_genes.argsort()
J_label = J_label.take(a,0)
J_promoter = J_promoter.take(a,0)
J_halflife = J_halflife.take(a,0)
J_genes = J_genes[a]

# Sort and filter the TF table the same way so its rows line up with J_genes.
tf = tf.sort_values(0)
tf = tf[tf[0].isin(J_genes)]

# The positional indexing below relies on row i of the TF table describing
# gene i of the pooled arrays. Duplicate gene names on either side would break
# that silently, so check it explicitly.
if len(tf) != len(J_genes) or not np.array_equal(tf[0].values, J_genes):
    raise ValueError("transcription-factor rows are not aligned one-to-one with the gene list")

J_tf = np.array([np.array(json.loads(cell)) for cell in tf['TF'].values])

# Fixed-seed shuffle of the row indices: the first 1000 go to test, the next
# 1000 to validation, the remainder to train.
list_shuffle = list(range(len(J_label)))
######################################
np.random.seed(42)
######################################
np.random.shuffle(list_shuffle)
idx_test = list_shuffle[:1000]
idx_val = list_shuffle[1000:2000]
idx_train = list_shuffle[2000:]

X_test_p = J_promoter[idx_test]
X_val_p = J_promoter[idx_val]
X_train_p = J_promoter[idx_train]

X_test_h = J_halflife[idx_test]
X_val_h = J_halflife[idx_val]
X_train_h = J_halflife[idx_train]

y_test = J_label[idx_test]
y_val = J_label[idx_val]
y_train = J_label[idx_train]

X_test_tf = J_tf[idx_test]
X_val_tf = J_tf[idx_val]
X_train_tf = J_tf[idx_train]

# Drop samples whose promoter contains any code >= 4, in every split.
# NOTE(review): presumably those codes mark non-ACGT bases - confirm against DataManager.
mask_tr = np.all(X_train_p < 4, axis = 1)
X_train_p = X_train_p[mask_tr]
X_train_h = X_train_h[mask_tr]
y_train = y_train[mask_tr]
X_train_tf = X_train_tf[mask_tr]

# Validation is filtered like train/test: the Word2Vec vocabulary only covers
# tokens made of codes < 4, so other tokens have no vector to look up.
mask_val = np.all(X_val_p < 4, axis = 1)
X_val_p = X_val_p[mask_val]
X_val_h = X_val_h[mask_val]
y_val = y_val[mask_val]
X_val_tf = X_val_tf[mask_val]

mask_te = np.all(X_test_p < 4, axis = 1)
X_test_p = X_test_p[mask_te]
X_test_h = X_test_h[mask_te]
X_test_tf = X_test_tf[mask_te]
y_test = y_test[mask_te]

Calculate new embedding

In [None]:
# Load the model from the location the training cell saves it to
# ("Dataset/embedded_data/"), which is also where this section writes its
# output files.
w2v = Word2Vec.load("Dataset/embedded_data/word2vec.model")

X_train = embeddings(X_train_p)
X_validation = embeddings(X_val_p)
X_test = embeddings(X_test_p)

Save the data

In [None]:
  # Write each split to its own HDF5 file with the datasets 'promoter',
  # 'halflife', 'tf' and 'label'. The context manager closes the file even
  # when a write fails, so a failed run leaves no open handle behind.
  for h5_path, promoter, halflife, tf_features, label in (
      ('Dataset/embedded_data/etrain_tf.h5', X_train, X_train_h, X_train_tf, y_train),
      ('Dataset/embedded_data/etest_tf.h5', X_test, X_test_h, X_test_tf, y_test),
      ('Dataset/embedded_data/evalidation_tf.h5', X_validation, X_val_h, X_val_tf, y_val),
  ):
    with h5py.File(h5_path, 'w') as h5f:
      h5f.create_dataset('promoter', data=promoter)
      h5f.create_dataset('halflife', data=halflife)
      h5f.create_dataset('tf',       data=tf_features)
      h5f.create_dataset('label',    data=label)