In [3]:
import tensorflow.keras
from tensorflow.keras.models import Sequential, Model, load_model

from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda, GlobalMaxPooling1D, concatenate, ReLU
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, LSTM, ConvLSTM2D, GRU, BatchNormalization, LocallyConnected2D, Permute
from tensorflow.keras.layers import Concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
import tensorflow.keras.losses

import tensorflow as tf
from tensorflow.python.framework import ops

from tensorflow.keras.utils import plot_model

# import isolearn.keras as iso

import numpy as np

import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import matplotlib.pyplot as plt

# import isolearn.io as isoio
# import isolearn.keras as isol

import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor

from scipy.stats import pearsonr, spearmanr

import seaborn as sns

from matplotlib import colors

# import editdistance
from sklearn.utils import resample

In [4]:
# Load the cached Sharpr-MPRA feature/label arrays and report their shapes

# directory holding the filtered Sharpr-MPRA data (R0-MPRA)
base_dir = '/content/drive/MyDrive/Colab Notebooks/'

promoter = "minp"
cell_line_1, cell_line_2 = "hepg2", "k562"

# All six cache files share this prefix; only the "<x|y>_<split>.npy" tail differs.
cache_prefix = "".join([
    base_dir,
    "sharpr_cached_", promoter, "_", cell_line_1, "_", cell_line_2,
    "minlogfold-2.4_minDNA200", "_",
])

x_train = np.load(cache_prefix + "x_train.npy")
x_valid = np.load(cache_prefix + "x_valid.npy")
x_test = np.load(cache_prefix + "x_test.npy")

y_train = np.load(cache_prefix + "y_train.npy")
y_valid = np.load(cache_prefix + "y_valid.npy")
y_test = np.load(cache_prefix + "y_test.npy")

# Keep only index 0 of the second axis, dropping that axis from each x array.
x_train, x_valid, x_test = (
    x_train[:, 0, :, :],
    x_valid[:, 0, :, :],
    x_test[:, 0, :, :],
)

# One "x / y / blank line" report per split, in train -> valid -> test order.
for split_name, x_split, y_split in (
    ("train", x_train, y_train),
    ("valid", x_valid, y_valid),
    ("test", x_test, y_test),
):
    print(f"x_{split_name}.shape = {x_split.shape}")
    print(f"y_{split_name}.shape = {y_split.shape}")
    print("")


x_train.shape = (40268, 145, 4)
y_train.shape = (40268, 2)

x_valid.shape = (10000, 145, 4)
y_valid.shape = (10000, 2)

x_test.shape = (10000, 145, 4)
y_test.shape = (10000, 2)



In [4]:
# define R1-MPRA model architecture
def get_pat_model(n_filters,filt_sizes,n_dense,dropout_rate,learning_rate=0.0002):
  """Build and compile the multi-branch 1D-convolutional regression model.

  The network takes a (145, 4) input. For every entry in ``filt_sizes`` it
  creates a parallel branch: Conv1D ('same' padding, linear activation) ->
  BatchNormalization -> ReLU -> GlobalMaxPooling1D -> Dropout. The branch
  outputs are concatenated, passed through one ReLU dense layer, and mapped
  to two linear outputs (index 0 - HepG2, index 1 - K562).

  Args:
    n_filters: number of filters in each convolutional branch.
    filt_sizes: sequence of kernel sizes, one convolutional branch per entry.
    n_dense: number of units in the hidden dense layer.
    dropout_rate: dropout rate applied to each pooled branch output.
    learning_rate: Adam learning rate; the default is the value that was
      previously hard-coded.

  Returns:
    A Keras ``Model`` compiled with the Adam optimizer and "mse" loss.
  """
  sequence_input = Input(shape=(145, 4),name="pat_input")

  branches = []

  for i, filt_size in enumerate(filt_sizes):
    suffix          = str(i)
    conv            = Conv1D(n_filters, filt_size, padding='same', activation='linear', name = "pat_conv_" + suffix)(sequence_input)
    # Batch norm is applied before the non-linearity, hence the linear conv above.
    normed          = BatchNormalization(axis=-1,name = "pat_batchnorm_" + suffix)(conv)
    activated       = Activation('relu',name = "pat_relu_" + suffix)(normed)
    pooled          = GlobalMaxPooling1D(name = "pat_pool_" + suffix)(activated)
    branches.append(Dropout(dropout_rate,name = "pat_dropout_" + suffix)(pooled))

  concat1           = concatenate(branches,name="pat_concat_layer")

  dense           = Dense(n_dense,activation='relu',name="pat_dense")(concat1)
  output          = Dense(2,activation='linear',name="pat_output")(dense)  # 0 - HepG2, 1 - K562

  model = Model(inputs=sequence_input,outputs=output)
  # Use `learning_rate`: the old `lr` alias is deprecated and is rejected as an
  # unrecognized argument by newer Keras releases.
  model.compile(optimizer=tensorflow.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999),
                loss="mse")

  return model

In [None]:
# Train n_models freshly built conv models and save each one to disk

n_models = 10
n_epochs = 60
batch_size = 64

# Create the output directory once, before any training starts, so that an
# unwritable location fails immediately instead of after the first model has
# already been trained.
save_dir = 'saved_models'
os.makedirs(save_dir, exist_ok=True)

for model_ix in range(n_models) :

    # keep this line otherwise training models in loop slows down
    K.clear_session()

    model_name = "wide_" + str(model_ix)

    print("Training model '" + model_name + "'")

    conv_model = get_pat_model(600,[25,11,7],64,0.075)

    # Stop once val_loss has not improved by at least min_delta for `patience`
    # epochs, and roll back to the best weights seen.
    callbacks =[
        EarlyStopping(
            monitor='val_loss',
            min_delta=1e-6,
            patience=8,
            verbose=True,
            restore_best_weights=True
        )
    ]

    train_history = conv_model.fit(
        [x_train],
        [y_train],
        shuffle=True,
        epochs=n_epochs,
        batch_size=batch_size,
        validation_data=(
            [x_valid],
            [y_valid]
        ),
        callbacks=callbacks
    )

    # Save model and weights
    model_path = os.path.join(save_dir, model_name + '.h5')
    conv_model.save(model_path)
    print('Saved trained model at %s ' % model_path)
