In [None]:
import time
import pandas as pd
import h5py
import numpy as np
from multiprocessing import Pool
from functools import partial, reduce

# labelBaseMap = {
#     0: "A",
#     1: "C",
#     2: "G",
#     3: "T"
# }

# Path of the HDF5 file holding the mapped reads; it is opened read-only with
# h5py wherever read data is needed.
filename = "/mnt/nvme/taiyaki_aligned/mapped_umi16to9.hdf5"

# Number of consecutive DAC samples in each signal window cut out by processRead.
RNN_LEN = 200

In [None]:
# Gather every read ID stored under the 'Reads' group and show which datasets
# the first read carries, as a quick sanity check of the file layout.
with h5py.File(filename, 'r') as h5file:
    readIDs = [read_id for read_id in h5file['Reads'].keys()]
    print(f"{len(readIDs)} reads, keys: {list(h5file['Reads'][readIDs[0]].keys())}")

In [None]:
def processRead(readID, filename, train_validate_split=0.8):
    """Cut one read into fixed-length signal windows paired with their labels.

    For every reference position ``rtsidx`` the signal is sampled in windows of
    ``RNN_LEN`` DAC values, starting at ``Ref_to_signal[rtsidx]`` and advancing
    by a fixed stride until the start comes within one stride of the next
    reference position or fewer than ``RNN_LEN`` samples remain.  Each window
    is labelled with the reference entries whose signal start lies before the
    end of the window.

    Args:
        readID: Key of the read inside the ``Reads`` group of the HDF5 file.
        filename: Path of the HDF5 file to open (read-only).
        train_validate_split: Fraction of reference positions, counted from the
            start of the read, whose windows go to the training list; the
            remainder goes to the test list.

    Returns:
        A ``(train, test)`` tuple.  Each element is a list of
        ``[window, labels]`` pairs where ``window`` is a list of ``RNN_LEN``
        DAC values and ``labels`` is the list of reference entries for it.
    """
    step = 5  # stride, in DAC samples, between consecutive windows

    with h5py.File(filename, 'r') as h5file:
        read = h5file['Reads'][readID]
        DAC = list(read['Dacs'][()])
        RTS = list(read['Ref_to_signal'][()])
        REF = list(read['Reference'][()])

    train = []
    test = []
    # A label index is usable only if it is valid for both REF and RTS.
    label_limit = min(len(REF), len(RTS))

    for rtsidx in range(len(RTS) - 1):
        # The leading part of the read trains, the tail validates.
        if rtsidx < len(RTS) * train_validate_split:
            target = train
        else:
            target = test

        i = RTS[rtsidx]
        labels = []
        l = rtsidx
        while i < (RTS[rtsidx + 1] - step) and (i + RNN_LEN) < len(DAC):
            # Extend the labels with every reference entry that starts inside
            # the current window [i, i + RNN_LEN).  The bound is checked before
            # indexing RTS, and an entry starting exactly at the window end is
            # excluded because it contributes no samples to the window.
            while l < label_limit and RTS[l] < i + RNN_LEN:
                labels.append(REF[l])
                l += 1
            # Store a snapshot of the labels: the running list keeps growing
            # for later windows and must not alter samples already emitted.
            target.append([DAC[i:(i + RNN_LEN)], list(labels)])
            i += step
    return train, test

# Bind the HDF5 path once, then run a single read in-process as a smoke test
# before handing the work to a process pool.
pp = partial(processRead, filename=filename)
ppp = processRead(readIDs[0], filename=filename)

In [None]:
%%time
# Fan the per-read preprocessing out over 16 worker processes.  Only the first
# read ID is submitted here (readIDs[:1]).
pool = Pool(16)
try:
    results_prim = pool.map(partial(processRead, filename=filename), readIDs[:1])
finally:
    # Shut the workers down even when a task raises, so no processes leak.
    pool.close()
    pool.join()

In [None]:
# Flatten the per-read results into two sample lists.  Only the FIRST window of
# each read's train list and test list is kept; any further windows are
# dropped.
train_results = []
test_results = []
for read_train, read_test in results_prim:
    train_results.extend(read_train[:1])
    test_results.extend(read_test[:1])

In [None]:
def normalise_dacs(dac):
    """Min-max scale a signal window to the range [0, 1].

    Args:
        dac: Sequence of numeric DAC samples.

    Returns:
        A list with one single-element list per sample, i.e. ``[[v0], [v1], ...]``,
        where each value is ``(d - min) / (max - min)``.  An empty input yields
        an empty list, and a constant signal (max == min) yields all zeros
        instead of dividing by zero.
    """
    if len(dac) == 0:
        return []
    # Work in floats so fixed-width integer samples cannot overflow on subtraction.
    dmin = float(min(dac))
    span = float(max(dac)) - dmin
    if span == 0:
        return [[0.0] for _ in dac]
    return [[(float(d) - dmin) / span] for d in dac]

def ohe(v):
    """Return a 4-element one-hot list with a 1 at index ``v``.

    ``v`` is used directly as a list index, so an index outside the list
    raises ``IndexError`` and a negative index counts from the end.
    """
    encoded = [0] * 4
    encoded[v] = 1
    return encoded

In [None]:
# Build the model inputs from the flattened training samples.  Each sample is a
# [window, labels] pair: the window is min-max normalised and every label is
# one-hot encoded.
# NOTE(review): samples whose label lists differ in length would make `y`
# ragged - confirm how variable-length labels should be padded before scaling
# beyond a single sample.
X = np.array([normalise_dacs(r[0]) for r in train_results])
y = np.array([[ohe(i) for i in r[1]] for r in train_results])

In [None]:
# Quick look at the first normalised signal value, the first label encoding,
# and the overall array shapes.
print(X[0][0], y[0], sep="\n")
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Model definition and training follow below.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, MaxPooling1D, Flatten, Conv1D, LSTM, Softmax
from tensorflow.nn import ctc_loss
from tensorflow.keras.callbacks import TensorBoard
import numpy as np

# Create a session whose GPU options request half of the GPU memory
# (per_process_gpu_memory_fraction=0.5).
# NOTE(review): the session is never explicitly registered with Keras -
# confirm that the memory limit actually applies to the training below.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
session_config = tf.ConfigProto(gpu_options=gpu_options)
sess = tf.Session(config=session_config)

In [None]:
# Three stacked 1-D convolutions over the signal window, flattened into two
# dense layers of 64 units each.
# NOTE(review): the final Dense(64) has no activation and its width does not
# obviously match the one-hot label layout of `y` - confirm the intended
# output shape.
model = Sequential([
    Conv1D(32, 3, padding="valid", activation="relu", input_shape=X[0].shape),
    Conv1D(32, 10, padding="valid", activation="relu"),
    Conv1D(32, 5, padding="valid", activation="relu"),
    Flatten(),
    Dense(64, activation="relu"),
    Dense(64),
])
# NOTE(review): ctc_loss is handed to Keras as-is; verify that its signature is
# compatible with the (y_true, y_pred) call Keras makes for a loss function.
model.compile(optimizer="adam", loss=ctc_loss, metrics=['accuracy'])

model.fit(x=X, y=y, epochs=2, validation_split=0.1)