In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "15"

In [2]:
from keras.utils import Sequence
from keras.layers import Dense, Activation, Input, Conv1D, MaxPooling1D, Reshape
from keras.optimizers import Adam
from keras import Model
import numpy as np
from seqdataloader.batchproducers.coordbased.core import Coordinates
from seqdataloader.batchproducers.coordbased.coordstovals.fasta import PyfaidxCoordsToVals
from pyfaidx import Fasta
import random

In [17]:
class TrainGenerator(Sequence):
    def __init__(self):
        self.trainingfile = training_set
        self.converter = PyfaidxCoordsToVals('GRCh38.p13.genome.fa')
        self.batchsize = batchsize
        self.steps_per_epoch = train_steps
        self.total_epochs = num_epochs  # how many epochs to train for (you're using enough epochs when the loss stops going down)
        self.get_coords()
        self.on_epoch_end()
    def __len__(self):  # required by Keras -- returns # of batches to expect
        print('running __len__')
        return self.steps_per_epoch
    def get_coords(self):
        print("running get_coords")
        with open(self.trainingfile) as trainf:
            #self.labels = [int(line.split()[-6:]) for line in trainf]  # assumes last columns in file are the labels, and assumes 6 label columns
            #coords_tmp = [line.split()[:3] for line in trainf]
            #self.coords = [Coordinates(coord[0], int(coord[1]), int(coord[2])) for coord in coords_tmp]
            self.labels = []
            self.coords = []
            for line in trainf:
                temp = line.split()
                tempList = [temp[0], int(temp[1]), int(temp[2]), float(temp[3]), float(temp[4]), 
                            float(temp[5]), float(temp[6]), int(temp[7]), int(temp[8])]
                for ind in range(len(tempList)):
                    if (ind < 3 or ind > 6):
                        continue
                    if (tempList[ind] > threshold):
                        tempList[ind] = 1
                    else:
                        tempList[ind] = 0
                    #tempList[ind] = int(tempList[ind])
                self.labels.append(tempList[-6:-2]) #4 label columns
                self.coords.append(Coordinates(tempList[0], tempList[1], tempList[2]))
            #should I cast them as numpy arrays?
    def __getitem__(self, batch_index):
        print("running __getitem__")
        # get one-hot encoded sequences, using coordinates, for the next batch
        seqs_onehot = self.converter(self.coords[batch_index * self.batchsize : (batch_index + 1) * self.batchsize])
        # get labels for the next batch
        labels = self.labels[batch_index * self.batchsize : (batch_index + 1) * self.batchsize]
        
        if (seqs_onehot.shape[0] == 0):
            print("DEBUG: seqs_onehot.shape[0] == 0")
            print("batch_index: " + str(batch_index))
            print("self.batchsize: " + str(self.batchsize))
            print("self.coords[batch_index * self.batchsize : (batch_index + 1) * self.batchsize]")
            print(self.coords[batch_index * self.batchsize : (batch_index + 1) * self.batchsize])
        
        # sanity checks
        assert seqs_onehot.shape[0] == np.array(labels).shape[0], (seqs_onehot.shape[0], np.array(labels).shape[0])
        assert seqs_onehot.shape[0] == self.batchsize, (seqs_onehot.shape[0], self.batchsize)
        return seqs_onehot, np.array(labels) #return all_seqs, np.array(self.labels)
    def on_epoch_end(self):
        # shuffle labels and coordinates (together) between epochs
        print("running on_epoch_end")
        zipped_coords_and_labels = list(zip(self.coords, self.labels))
        random.shuffle(zipped_coords_and_labels)
        self.coords = [pair[0] for pair in zipped_coords_and_labels]
        self.labels = [pair[1] for pair in zipped_coords_and_labels]

In [4]:
def default_model(sequence_length = 200, num_filters = 240, filter_size = 20, dense1_nodes = 1024, dense2_nodes = 512, dense3_nodes = 128, num_outputs = 4, stride = 10, pool_len = 10):
    # start by defining input layer, which will read in one-hot encoded sequences
    seq_input = Input(shape = (sequence_length, 4, ), name = 'seq')
    # the first layer is convolutional
    seq = Conv1D(num_filters, filter_size, padding = "same")(seq_input)
    seq2 = Activation("relu")(seq)
    # pool to decrease size (keep strides <= pool_size)
    seq3 = MaxPooling1D(padding = "same", strides = stride, pool_size = pool_len)(seq2)
    # pyramid of dense layers with decreasing #s of nodes
    reshaped = Reshape((int(num_filters * sequence_length / stride), ))(seq3)
    dense1 = Dense(dense1_nodes, activation = "relu")(reshaped)
    dense2 = Dense(dense2_nodes, activation = "relu")(dense1)
    dense3 = Dense(dense3_nodes, activation = "relu")(dense2)
    # softmax layer output (since we are predicting multiple categories)
    # use with categorical_crossentropy loss
    output = Dense(num_outputs, activation = "softmax")(dense3)
    # return model
    model = Model(seq_input, output)
    return model

# Training Set Modification

In [None]:
with open('6col-training-set.bed', 'r') as f:
    training = f.readlines()
for i in range(len(training)):
    training[i] = training[i].split()
    for j in range(len(training[i])):
        if (j == 1 or j == 2 or j > 6):
            training[i][j] = int(training[i][j])
        elif (j > 2):
            training[i][j] = float(training[i][j])
training[:100]

In [5]:
threshold = 0.75

In [None]:
def fewerIntrons(file, interval): #after opening and saving as list
    #print(file[:100])
    print(len(file))
    counter = 0
    newList = []
    print("creating newList")
    for window in file:
        if (window[interval] > threshold):
            counter += 1
        if (counter % interval == 0):
            newList.append(window)
    #print(newList[:100])
    print("removing items")
    print(len(newList))
    #newList[:100]
    return newList

In [None]:
training_modified = fewerIntrons(training, 2)

In [None]:
training_modified

In [None]:
if os.path.isfile('6col-training-set-mod.bed'):
    os.remove('6col-training-set-mod.bed')

myFile = open('6col-training-set-mod.bed','w')

for window in training_modified:
    val = [str(item) for item in window]
    myFile.write('\t'.join(val) + '\n')
myFile.close()

In [6]:
training_set = '6col-training-set-mod.bed'
#training_set = '6col-training-set.bed'

# Training!

In [7]:
if (not (os.path.isfile('GRCh38.p13.genome.fa.gz') or os.path.isfile('GRCh38.p13.genome.fa'))):
    !wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/GRCh38.p13.genome.fa.gz
if (not os.path.isfile('GRCh38.p13.genome.fa') and os.path.isfile('GRCh38.p13.genome.fa.gz')):
    !gunzip GRCh38.p13.genome.fa.gz

In [8]:
training_examples = !wc -l < "6col-training-set.bed"
training_examples = int(training_examples[0])

In [9]:
#parameters
batchsize = 200 #for now
train_steps = training_examples // batchsize
num_epochs = 10 #for now #1000
#threshold defined above

In [18]:
model = default_model()
print(model)
model.compile(loss = "categorical_crossentropy", optimizer = Adam(learning_rate=0.000001), metrics = ["accuracy"])
print(model.summary())
hist = model.fit_generator(epochs = num_epochs,  # choose how many epochs to train for (watch the loss to see when to stop)
                           steps_per_epoch = train_steps,
                           generator = TrainGenerator(),  # see above
                           use_multiprocessing = True,
                           workers = 8)  # use this to split batch processing into multiple CPUs (but don't take over the VM completely!)

<tensorflow.python.keras.engine.functional.Functional object at 0x7fc6d5adce10>
Model: "functional_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seq (InputLayer)             [(None, 200, 4)]          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 200, 240)          19440     
_________________________________________________________________
activation_4 (Activation)    (None, 200, 240)          0         
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 20, 240)           0         
_________________________________________________________________
reshape_4 (Reshape)          (None, 4800)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 1024)              4916224   
________________________________________

UnknownError:  AssertionError: (0, 200)
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/opt/miniconda3/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py", line 679, in get_index
    return _SHARED_SEQUENCES[uid][i]
  File "<ipython-input-17-c10936ee6704>", line 52, in __getitem__
    assert seqs_onehot.shape[0] == self.batchsize, (seqs_onehot.shape[0], self.batchsize)
AssertionError: (0, 200)
"""


The above exception was the direct cause of the following exception:


Traceback (most recent call last):

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 244, in __call__
    ret = func(*args)

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 302, in wrapper
    return func(*args, **kwargs)

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 827, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 814, in wrapped_generator
    for data in generator_fn():

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py", line 900, in get
    six.reraise(*sys.exc_info())

  File "/home/chloe/.local/lib/python3.7/site-packages/six.py", line 703, in reraise
    raise value

  File "/home/chloe/.local/lib/python3.7/site-packages/tensorflow/python/keras/utils/data_utils.py", line 891, in get
    inputs = self.queue.get(block=True, timeout=5).get()

  File "/opt/miniconda3/lib/python3.7/multiprocessing/pool.py", line 657, in get
    raise self._value

AssertionError: (0, 200)


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_4761]

Function call stack:
train_function


In [None]:
def plot_metrics(hist, cb = None):
    epochs = range(1, 100) #1, num_epochs + 1
    plt.figure(figsize = (12,8))
    plt.subplot(211)
    plt.plot(epochs, hist.history["acc"], '.-', color = '#31E080', label = "Training Accuracy")
    plt.legend()
    plt.subplot(212)
    plt.plot(epochs, hist.history["loss"], '.-', color = '#31E080', label = "Training Loss")
    plt.legend()
    plt.legend()
    plt.savefig(params.figures_path + "_metrics", dpi = 300)

In [None]:
plot_metrics(hist)

# Test Generator

In [None]:
test_set = '6col-test-set.bed'

In [None]:
class TestGenerator(Sequence):
    def __init__(self):
        self.testingfile = test_set
        self.converter = PyfaidxCoordsToVals('GRCh38.p13.genome.fa')
        self.batchsize = batchsize
        self.steps_per_epoch = train_steps
        self.total_epochs = num_epochs  # how many epochs to train for (you're using enough epochs when the loss stops going down)
        self.get_coords()
        self.on_epoch_end()
    def __len__(self):  # required by Keras -- returns # of batches to expect
        print('running __len__')
        return self.steps_per_epoch
    def get_coords(self):
        print("running get_coords")
        with open(self.testingfile) as testf:
            #self.labels = [int(line.split()[-6:]) for line in testf]  # assumes last columns in file are the labels, and assumes 6 label columns
            #coords_tmp = [line.split()[:3] for line in testf]
            #self.coords = [Coordinates(coord[0], int(coord[1]), int(coord[2])) for coord in coords_tmp]
            self.coords = []
            for line in testf:
                temp = line.split()
                tempList = [temp[0], int(temp[1]), int(temp[2])]
                self.coords.append(Coordinates(tempList[0], tempList[1], tempList[2]))
            #should I cast them as numpy arrays?
    def __getitem__(self, batch_index):
        print("running __getitem__")
        # get one-hot encoded sequences, using coordinates, for the next batch
        seqs_onehot = self.converter(self.coords[batch_index * self.batchsize : (batch_index + 1) * self.batchsize])
        
        # sanity checks
        assert seqs_onehot.shape[0] == np.array(labels).shape[0], (seqs_onehot.shape[0], np.array(labels).shape[0])
        assert seqs_onehot.shape[0] == self.batchsize
        return seqs_onehot
    def on_epoch_end(self):
        # shuffle labels and coordinates (together) between epochs
        print("running on_epoch_end")
        self.coords = random.shuffle(self.coords)

In [None]:
hist2 = model.fit_generator(epochs = num_epochs,  # choose how many epochs to train for (watch the loss to see when to stop)
                           steps_per_epoch = train_steps,
                           generator = TestGenerator(),  # see above
                           use_multiprocessing = True,
                           workers = 8)  # use this to split batch processing into multiple CPUs (but don't take over the VM completely!)