In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "15"

In [2]:
from keras.utils import Sequence
from keras.layers import Dense, Activation, Input, Conv1D, MaxPooling1D
import numpy as np
from seqdataloader.batchproducers.coordbased.core import Coordinates
from seqdataloader.batchproducers.coordbased.coordstovals.fasta import PyfaidxCoordsToVals
from keras import Model
from pyfaidx import Fasta

In [3]:
class TrainGenerator():
    def __init__(self):
        self.trainingfile = '6col-training-set.bed' #training
        self.converter = PyfaidxCoordsToVals('gencode.v34.transcripts.fa')
        self.batchsize = batchsize
        self.steps_per_epoch = train_steps
        self.total_epochs = num_epochs  # how many epochs to train for (you're using enough epochs when the loss stops going down)
        self.get_coords()
        self.on_epoch_end()
    def __len__(self):  # required by Keras -- returns # of batches to expect
        print('running __len__')
        return self.steps_per_epoch
    def get_coords(self):
        print("running get_coords")
        with open(self.trainingfile) as trainf:
            #self.labels = [int(line.split()[-6:]) for line in trainf]  # assumes last columns in file are the labels, and assumes 6 label columns
            #coords_tmp = [line.split()[:3] for line in trainf]
            #self.coords = [Coordinates(coord[0], int(coord[1]), int(coord[2])) for coord in coords_tmp]
            self.labels = []
            self.coords = []
            for line in trainf:
                temp = line.split()
                tempList = [temp[0], int(temp[1]), int(temp[2]), float(temp[3]), float(temp[4]), 
                            float(temp[5]), float(temp[6]), int(temp[7]), int(temp[8])]
                for ind in range(len(tempList)):
                    if (ind < 3 or ind > 6):
                        continue
                    if (tempList[ind] > threshold):
                        tempList[ind] = 1
                    tempList[ind] = int(tempList[ind])
                self.labels.append(tempList[-6:]) #6 label columns
                self.coords.append([Coordinates(tempList[0], tempList[1], tempList[2])])
            #should I cast them as numpy arrays?
    def __getitem__(self, batch_index):
        print("running __getitem__")
        # get one-hot encoded sequences, using coordinates, for the next batch
        seqs_onehot = self.converter(self.coords[batch_index * self.batchsize : (batch_index + 1) * self.batchsize])
        # get labels for the next batch
        labels = self.labels[batch_index * self.batchsize : (batch_index + 1) * self.batchsize]
        
        #all_seqs = seqs_onehot
        #print("hi")
        #print(all_seqs)
        # sanity checks
        assert all_seqs.shape[0] == np.array(self.labels).shape[0], (seqs_onehot.shape[0], np.array(self.labels).shape[0])
        assert all_seqs.shape[0] == self.batchsize
        return all_seqs, np.array(labels) #return all_seqs, np.array(self.labels)
    def on_epoch_end(self):
        # shuffle labels and coordinates (together) between epochs
        print("running on_epoch_end")
        indexes = np.arange(np.array(self.coords).shape[0])
        np.random.shuffle(indexes)
        self.coords = np.array(self.coords)[indexes]
        self.labels = np.array(self.labels)[indexes]

In [4]:
def default_model(sequence_length = 200, num_filters = 240, filter_size = 20, dense1_nodes = 1024, dense2_nodes = 512, dense3_nodes = 128, num_outputs = 4, stride = 10, pool_len = 10):
    # start by defining input layer, which will read in one-hot encoded sequences
    seq_input = Input(shape = (sequence_length, 4, ), name = 'seq')
    # the first layer is convolutional
    seq = Conv1D(num_filters, filter_size, padding = "same")(seq_input)
    seq2 = Activation("relu")(seq)
    # pool to decrease size (keep strides <= pool_size)
    seq3 = MaxPooling1D(padding = "same", strides = stride, pool_size = pool_len)(seq2)
    # pyramid of dense layers with decreasing #s of nodes
    dense1 = Dense(dense1_nodes, activation = "relu")(seq3)
    dense2 = Dense(dense2_nodes, activation = "relu")(dense1)
    dense3 = Dense(dense3_nodes, activation = "relu")(dense2)
    # softmax layer output (since we are predicting multiple categories)
    # use with categorical_crossentropy loss
    output = Dense(num_outputs, activation = "softmax")(dense3)
    # return model
    model = Model(seq_input, output)
    return model

# File Reading

In [5]:
with open('6col-test-set.bed', 'r') as f:
    test = f.readlines()
for i in range(len(test)):
    test[i] = test[i].split()
    for j in range(len(test[i])):
        if (j == 1 or j == 2 or j > 6):
            test[i][j] = int(test[i][j])
        elif (j > 2):
            test[i][j] = float(test[i][j])
test[:100]

[['chr5', 107859034, 107859234, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859134, 107859334, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859234, 107859434, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859334, 107859534, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859434, 107859634, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859534, 107859734, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859634, 107859834, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859734, 107859934, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859834, 107860034, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107859934, 107860134, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860034, 107860234, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860134, 107860334, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860234, 107860434, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860334, 107860534, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860434, 107860634, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860534, 107860734, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr5', 107860634, 107860834, 0.0, 0.0, 0.0, 1.0, 0, 0

In [6]:
with open('6col-validation-set.bed', 'r') as f:
    validation = f.readlines()
for i in range(len(validation)):
    validation[i] = validation[i].split()
    for j in range(len(validation[i])):
        if (j == 1 or j == 2 or j > 6):
            validation[i][j] = int(validation[i][j])
        elif (j > 2):
            validation[i][j] = float(validation[i][j])
validation[:100]

[['chr6', 136922300, 136922500, 0.0, 0.36, 0.64, 0.0, 0, 1],
 ['chr6', 136922400, 136922600, 0.0, 0.86, 0.14, 0.0, 0, 1],
 ['chr6', 136922500, 136922700, 0.0, 1.0, 0.0, 0.0, 0, 0],
 ['chr6', 136922600, 136922800, 0.0, 1.0, 0.0, 0.0, 0, 0],
 ['chr6', 136922700, 136922900, 0.165, 0.835, 0.0, 0.0, 1, 0],
 ['chr6', 136922800, 136923000, 0.665, 0.335, 0.0, 0.0, 1, 0],
 ['chr6', 136922900, 136923100, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923000, 136923200, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923100, 136923300, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923200, 136923400, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923300, 136923500, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923400, 136923600, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923500, 136923700, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923600, 136923800, 1.0, 0.0, 0.0, 0.0, 0, 0],
 ['chr6', 136923700, 136923900, 0.92, 0.08, 0.0, 0.0, 1, 0],
 ['chr6', 136923800, 136924000, 0.42, 0.58, 0.0, 0.0, 1, 0],
 ['chr6', 136923900, 136924100, 0.0, 1.0

In [7]:
with open('6col-training-set.bed', 'r') as f:
    training = f.readlines()
for i in range(len(training)):
    training[i] = training[i].split()
    for j in range(len(training[i])):
        if (j == 1 or j == 2 or j > 6):
            training[i][j] = int(training[i][j])
        elif (j > 2):
            training[i][j] = float(training[i][j])
training[:100]

[['chr6', 41026894, 41027094, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41026994, 41027194, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027094, 41027294, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027194, 41027394, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027294, 41027494, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027394, 41027594, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027494, 41027694, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027594, 41027794, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027694, 41027894, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027794, 41027994, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027894, 41028094, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41027994, 41028194, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6', 41028094, 41028294, 0.0, 0.0, 0.0, 1.0, 0, 0],
 ['chr6',
  41028194,
  41028394,
  0.0,
  0.16964285714285715,
  0.0,
  0.8303571428571429,
  0,
  1],
 ['chr6',
  41028294,
  41028494,
  0.0,
  0.7345679012345679,
  0.0,
  0.2654320987654321,
  0,
  1],
 ['chr6', 41028394, 41028594, 0.0, 1.0, 0.0, 0.0, 0,

In [8]:
if (not (os.path.isfile('gencode.v34.transcripts.fa.gz') or os.path.isfile('gencode.v34.transcripts.fa'))):
    !wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.transcripts.fa.gz
if (not (os.path.isfile('gencode.v34.transcripts.fa') or os.path.isfile('gencode.v34.transcripts.fa.gz'))):
    !gunzip gencode.v34.transcripts.fa.gz

In [9]:
#parameters
batchsize = 2 #for now #200
train_steps = len(training) / batchsize
num_epochs = 1 #for now #1000
threshold = 0.75

In [10]:
if __name__ == "__main__":
    # insert the code you might need to read in files, set hyperparameters, etc.
    model = default_model()
    print(model)
    model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
    print(model.summary())
    hist = model.fit_generator(epochs = num_epochs,  # choose how many epochs to train for (watch the loss to see when to stop)
                               steps_per_epoch = train_steps,
                               generator = TrainGenerator(),  # see above
                               use_multiprocessing = True,
                               workers = 8)  # use this to split batch processing into multiple CPUs (but don't take over the VM completely!)

<tensorflow.python.keras.engine.functional.Functional object at 0x7f68d0342910>
Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
seq (InputLayer)             [(None, 200, 4)]          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 200, 240)          19440     
_________________________________________________________________
activation (Activation)      (None, 200, 240)          0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 20, 240)           0         
_________________________________________________________________
dense (Dense)                (None, 20, 1024)          246784    
_________________________________________________________________
dense_1 (Dense)              (None, 20, 512)           524800    
________________________________________

ValueError: Failed to find data adapter that can handle input: <class '__main__.TrainGenerator'>, <class 'NoneType'>

In [None]:
genes = Fasta('gencode.v34.transcripts.fa')
genes

In [None]:
genes.keys()