In [None]:
import datetime
import os
import pandas as pd
import h5py
import numpy as np
from multiprocessing import Pool
from multiprocessing.pool import ThreadPool
from functools import partial, reduce
from collections import deque
from IPython.core.debugger import set_trace
from tensorflow.keras.utils import Sequence

# Integer label -> nucleotide letter.
labelBaseMap = dict(enumerate("ACGT"))

# Candidate locations of the mapped-signal HDF5 file; the first one that exists wins.
possible_filenames = [
    "/mnt/sdb/taiyaki_mapped/mapped_umi16to9.hdf5",
    "/mnt/nvme/taiyaki_aligned/mapped_umi16to9.hdf5",
    "/Users/felix/MsC/DNA/mapped_umi16to9.hdf5",
]

# Default to "" so the name is always bound, even when no candidate exists.
the_filename = ""
for filename in possible_filenames:
    if not os.path.isfile(filename):
        continue
    the_filename = filename
    print(f"Using {filename}")
    break
else:
    # Only reached when the loop finished without hitting `break`.
    print("Error, no filename valid!")

# Length (in signal samples) of the sliding window built by PrepData.processRead.
RNN_LEN = 300

In [None]:
class PrepData(Sequence):
    """Keras ``Sequence`` that turns reads of a mapped-signal HDF5 file into CTC batches.

    Every entry under the file's ``Reads`` group yields one batch. The read's
    normalised ``Dacs`` signal is cut into overlapping windows of ``RNN_LEN``
    samples (the window advances 5 samples at a time) and each window is paired
    with the ``Reference`` labels whose ``Ref_to_signal`` position lies inside it.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file. It is opened once here to list the read IDs and
        again for every processed read.
    train_validate_split : float, default 0.8
        Fraction of each read used for training; the tail of the read is kept
        as test data.
    min_labels : int, default 5
        A window is only kept when it carries more than this many labels.
    """

    def __init__(self, filename, train_validate_split=0.8, min_labels=5):
        self.filename = filename
        self.train_validate_split = train_validate_split
        self.min_labels = min_labels
        # Index of the next read handed out by train_gen().
        self.pos = 0
        # Test windows/labels of the most recently built batch (see test_gen()).
        self.test_gen_data = ([], [])
        # Label rows are padded to this width; windows with more labels are dropped.
        self.max_label_len = 50
        with h5py.File(filename, 'r') as h5file:
            self.readIDs = list(h5file['Reads'].keys())

    def get_len(self):
        """Return the number of reads (same value as ``len(self)``)."""
        return len(self.readIDs)

    def get_max_label_len(self):
        """Return the width label rows are padded to."""
        return self.max_label_len

    def normalise(self, dac):
        """Min-max scale ``dac`` to the range [0, 1] and return it as a list of floats.

        The values are converted to float64 before subtracting so narrow integer
        dtypes cannot overflow. An empty input gives ``[]`` and a constant signal
        gives all zeros instead of dividing by zero.
        """
        values = np.asarray(dac, dtype="float64")
        if values.size == 0:
            return []
        dmin = values.min()
        span = values.max() - dmin
        if span == 0:
            return [0.0] * values.size
        return ((values - dmin) / span).tolist()

    @staticmethod
    def _take_labels(RTS, REF, labels, labelts, upto):
        """Move labels whose signal position is below ``upto`` from RTS/REF into the window."""
        # The emptiness checks keep short reads from raising IndexError.
        while RTS and REF and RTS[0] < upto:
            labels.append(REF.popleft())
            labelts.append(RTS.popleft())

    @staticmethod
    def _ragged(rows):
        """Pack variable-length rows into a 1-D object array (one row per element)."""
        packed = np.empty(len(rows), dtype=object)
        for i, row in enumerate(rows):
            packed[i] = row
        return packed

    def processRead(self, readID):
        """Slice one read into windows and split them into train and test sets.

        Returns
        -------
        tuple
            ``(train_X, train_y, test_X, test_y)``. The ``*_X`` entries are lists
            of windows (each a list of one-element ``[sample]`` lists) and the
            ``*_y`` entries are lists of label lists. All four are empty when the
            read has no ``Ref_to_signal`` entries.
        """
        train_X, train_y, test_X, test_y = [], [], [], []
        with h5py.File(self.filename, 'r') as h5file:
            read = h5file['Reads'][readID]
            DAC = list(self.normalise(read['Dacs'][()]))
            RTS = deque(list(read['Ref_to_signal'][()]))
            REF = deque(read['Reference'][()])

        if not RTS:
            return train_X, train_y, test_X, test_y

        # Windows produced while more than this many signal positions are still
        # unconsumed go to the training set, the remainder to the test set.
        train_validate_split = round(len(REF)*(1-self.train_validate_split))
        # Pre-fill the window 5 samples short; the first loop step completes it.
        curdacs = deque([[x] for x in DAC[RTS[0]:RTS[0]+RNN_LEN-5]], RNN_LEN)
        curdacts = RTS[0]+RNN_LEN-5
        labels = deque([])
        labelts = deque([])

        self._take_labels(RTS, REF, labels, labelts, curdacts)

        while RTS and curdacts+5 < RTS[-1]-RNN_LEN:
            # Slide the window forward by 5 samples (the deque drops the oldest 5).
            curdacs.extend([[x] for x in DAC[curdacts:curdacts+5]])
            curdacts += 5

            self._take_labels(RTS, REF, labels, labelts, curdacts)

            # Forget labels that have slid out of the left edge of the window.
            while len(labelts) > 0 and labelts[0] < curdacts - RNN_LEN:
                labels.popleft()
                labelts.popleft()

            # Windows with more labels than the padding width cannot be padded
            # into a rectangular array, so they are skipped as well.
            if self.min_labels < len(labels) <= self.max_label_len:
                if len(RTS) > train_validate_split:
                    train_X.append(list(curdacs))
                    train_y.append(list(labels))
                else:
                    test_X.append(list(curdacs))
                    test_y.append(list(labels))

        return train_X, train_y, test_X, test_y

    def _make_batch(self, readID):
        """Build the ``(X, y)`` training batch for one read.

        Side effect: stores the read's test windows and labels in
        ``self.test_gen_data`` so that ``test_gen`` can hand them out.
        """
        train_X, train_y, test_X, test_y = self.processRead(readID)

        # Label lists have different lengths, so they are kept as an object array
        # rather than being forced into a rectangular ndarray.
        self.test_gen_data = (np.array(test_X), self._ragged(test_y))

        train_X = np.array(train_X)
        # NOTE(review): 95 is hard-coded as the per-window input length given to the
        # CTC loss - presumably the time dimension of the training network's output;
        # confirm against the training model.
        train_X_lens = np.array([[95] for _ in train_X], dtype="float32")
        train_y_lens = np.array([[len(r)] for r in train_y], dtype="float32")
        # Pad from the plain Python lists; 5 is the filler value for unused slots.
        pad_to = self.get_max_label_len()
        train_y_padded = np.array([list(r) + [5]*(pad_to-len(r)) for r in train_y],
                                  dtype='float32')
        X = {'the_input': train_X,
             'the_labels': train_y_padded,
             'input_length': train_X_lens,
             'label_length': train_y_lens
             }
        y = {'ctc': np.zeros([len(train_X)])}
        return (X, y)

    def train_gen(self):
        """Yield one ``(X, y)`` batch per read, continuing from ``self.pos``."""
        while self.pos < len(self.readIDs):
            print(f"Processing {self.pos}")
            batch = self._make_batch(self.readIDs[self.pos])
            self.pos += 1
            yield batch

    def test_gen(self):
        """Endlessly yield the stored test data, resetting it to ``([], [])`` each time."""
        while True:
            tgd, self.test_gen_data = self.test_gen_data, ([],[])
            yield tgd

    def __len__(self):
        """Number of batches, i.e. one per read."""
        return len(self.readIDs)

    def __getitem__(self, idx):
        """Return the batch for read number ``idx``.

        The batch is derived from ``idx`` itself rather than from the generator
        position, so the same index always maps to the same read and the sequence
        can be iterated for more than one epoch.
        """
        return self._make_batch(self.readIDs[idx])
    
# Use the path resolved by the file-discovery loop rather than its leaked loop variable.
prepData = PrepData(the_filename)

# HERE COME DAT ML

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as kb
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Activation, Add, Lambda
from tensorflow.keras.layers import Dense, MaxPooling1D, Conv1D, LSTM, GRU
from tensorflow.keras.backend import ctc_batch_cost
from tensorflow.keras.callbacks import Callback
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
import editdistance

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Cap what TensorFlow may allocate on the first GPU (memory_limit=5000).
    try:
        first_gpu_limit = [
            tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5000)
        ]
        tf.config.experimental.set_virtual_device_configuration(gpus[0], first_gpu_limit)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Raised when the GPUs were already initialised before this cell ran.
        print(e)

In [None]:
def make_res_block(upper, block):
    """Append one residual block of 1-D convolutions to ``upper``.

    The shortcut branch is a single 1x1 convolution; the main branch is a
    1x1 -> 3 -> 1x1 convolution stack. Both branches are summed and passed
    through a ReLU.

    Parameters
    ----------
    upper : tensor
        Output of the previous layer.
    block : int or str
        Block identifier, used only to build the layer names ``res{block}-*``.

    Returns
    -------
    tensor
        Output of the block's final ReLU activation.
    """
    # The main-branch layers used to be declared with use_bias="false". A non-empty
    # string is truthy, so these layers have always been built WITH a bias term and
    # the saved checkpoints contain those biases. The flag is written as True to
    # state what actually happens; switching to False changes the layer weights
    # and requires retraining (existing weight files would no longer load).
    res = Conv1D(256, 1,
                  padding="same",
                  name=f"res{block}-r")(upper)
    upper = Conv1D(256, 1,
                  padding="same",
                  activation="relu",
                  use_bias=True,
                  name=f"res{block}-c1")(upper)
    upper = Conv1D(256, 3,
                  padding="same",
                  activation="relu",
                  use_bias=True,
                  name=f"res{block}-c2")(upper)
    upper = Conv1D(256, 1,
                  padding="same",
                  use_bias=True,
                  name=f"res{block}-c3")(upper)
    added = Add(name=f"res{block}-add")([res, upper])
    return Activation('relu', name=f"res{block}-relu")(added)

def make_bdlstm(upper, block):
    """Run ``upper`` through a forward and a backward LSTM and sum their outputs.

    ``block`` is only used to build the layer names ``blstm{block}-*``.
    """
    # NOTE(review): go_backwards=True makes the second LSTM process the sequence in
    # reverse; confirm whether its output should be flipped back before the Add.
    # Left unchanged because the saved weights were trained with this wiring.
    directions = [
        LSTM(200, return_sequences=True, name=f"blstm{block}-fwd")(upper),
        LSTM(200, return_sequences=True, go_backwards=True, name=f"blstm{block}-rev")(upper),
    ]
    merge = Add(name=f"blstm{block}-add")
    return merge(directions)


# Network input: windows of 300 samples with a single channel.
input_data = Input(name="the_input", shape=(300,1), dtype="float32")

# Five residual convolution blocks followed by three summed forward/backward LSTM pairs.
inner = input_data
for res_index in range(1, 6):
    inner = make_res_block(inner, res_index)
for lstm_index in range(1, 4):
    inner = make_bdlstm(inner, lstm_index)

# Per-timestep classifier head with 5 softmax outputs.
inner = Dense(64, name="dense", activation="relu")(inner)
inner = Dense(5, name="dense_output")(inner)
y_pred = Activation("softmax", name="softmax")(inner)

model = Model(inputs=input_data, outputs=y_pred)
model.load_weights("weights/2020-01-07_13:49:58_e14_dis5427.h5")

In [None]:
# Pull one (inputs, targets) batch from a fresh training generator. The generator
# advances prepData.pos, so re-running this cell moves on to the next read.
a = next(prepData.train_gen())

In [None]:
# Pick a single window and its padded label row out of the batch for inspection.
# Slicing with idx:idx+1 keeps the leading batch axis; the slices are empty when
# the batch holds 200 or fewer windows.
idx = 200
ipt = a[0]['the_input'][idx:idx+1]
lbs = a[0]['the_labels'][idx:idx+1]
print(a[0].keys())

In [None]:
# Run the currently loaded weights on the selected window.
prediction = model.predict(ipt)

In [None]:
# Plot the model output for the selected window once per saved checkpoint and
# store each plot as images/<checkpoint>.png.
os.makedirs("images", exist_ok=True)  # savefig fails when the target folder is missing
# Sorted so the checkpoints are visited in a deterministic order.
for file in sorted(os.listdir("weights")):
    if not os.path.isfile(os.path.join("weights", file)):
        # Skip sub-directories such as notebook checkpoint folders.
        continue
    model.load_weights(f"weights/{file}")
    prediction = model.predict(ipt)
    fig = plt.figure(figsize=(30,10))
    for pred, raw, label in zip(prediction, ipt, lbs):
        # One curve per output class: turn (timesteps, classes) into (classes, timesteps).
        transposed = list(map(list, zip(*pred)))
        for i in range(len(transposed)):
            plt.plot(transposed[i], label=i)
        # Raw input signal in black for reference.
        plt.plot(raw, "k")
        plt.legend()
        plt.savefig(f"images/{file}.png")
    # Render the figure, then release it so figures do not pile up in memory
    # when there are many checkpoints.
    plt.show()
    plt.close(fig)

In [None]:
# Show the label-length entry stored in the batch for the selected window.
a[0]['label_length'][idx]