In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.utils import to_categorical
from tensorflow.python.keras import Sequential, Input, Model, regularizers
from tensorflow.python.keras.layers import Conv1D, MaxPooling1D, GlobalAveragePooling1D, Flatten, Dropout, Dense, LSTM, Reshape, BatchNormalization, UpSampling1D
from tensorflow.train import AdamOptimizer
import tensorflow as tf
import keras.backend as K
from keras.engine.topology import Layer
from keras.layers import Lambda, Conv2DTranspose


# String labels found in the CAT column of the input TSV.
# LOW_QUALITY rows are filtered out by the dataset loaders; REGULAR and REPEAT
# are mapped to class ids 0 and 1 by category_to_int, everything else to 2.
REGULAR = "reg"
CHIMERIC = "chi"
REPEAT = "rep"
LOW_QUALITY = "loq"

# Number of points per sample; fixes the model's input/output width.
INPUT_LENGTH = 5000
# NOTE(review): not referenced in the visible part of the notebook — confirm use.
INPUT_THRESHOLD = 1000

# Width of the one-hot label encoding produced by create_datasets.
NUM_CLASSES = 3

Using TensorFlow backend.


In [2]:
def create_convolutional_encoder(input_shape):
    """Build the encoder half of the convolutional autoencoder.

    Args:
        input_shape: Keras input tensor holding INPUT_LENGTH values per sample
            (despite the name, this is a tensor, not a shape tuple).

    Returns:
        The encoded tensor after two Conv1D + MaxPooling1D stages
        (pool sizes 10 and 5, i.e. a 50x reduction along the sequence axis).
    """
    # Conv1D needs an explicit channel axis: (INPUT_LENGTH,) -> (INPUT_LENGTH, 1).
    encoded = Reshape(target_shape=(INPUT_LENGTH, 1), input_shape=(INPUT_LENGTH,))(input_shape)

    # (filters, pool_size) for each successive convolution/pooling stage.
    for num_filters, pool in ((64, 10), (32, 5)):
        encoded = Conv1D(filters=num_filters, kernel_size=31, activation='relu', padding='same')(encoded)
        encoded = MaxPooling1D(pool_size=pool)(encoded)

    print("shape of encoded {}".format(K.int_shape(encoded)))
    return encoded


def create_convolutional_decoder(encoder):
    """Build the decoder half of the convolutional autoencoder.

    Mirrors the encoder's two pooling stages with UpSampling1D (x5, then x10)
    so the sequence axis is restored to INPUT_LENGTH.

    Args:
        encoder: Encoded tensor produced by create_convolutional_encoder.

    Returns:
        A tensor of INPUT_LENGTH values per sample, each in [0, 1].
    """
    conv3 = UpSampling1D(5)(encoder)
    conv3 = Conv1D(filters=64, kernel_size=31, activation='relu', padding='same')(conv3)

    conv4 = UpSampling1D(10)(conv3)
    conv4 = Conv1D(filters=64, kernel_size=31, activation='relu', padding='same')(conv4)

    # Sigmoid, not softmax: softmax normalises over the channel axis, and with a
    # single output filter that makes every output exactly 1.0. Sigmoid gives an
    # independent [0, 1] value per position, matching the binary_crossentropy
    # loss and the [0, 1]-normalised inputs.
    out = Conv1D(filters=1, kernel_size=31, activation='sigmoid', padding='same')(conv4)
    # Drop the channel axis so the output lines up with the flat input.
    out = Reshape((INPUT_LENGTH,))(out)

    return out


def create_conv_autoencoder():
    """Assemble and compile the convolutional autoencoder.

    The model maps a flat vector of INPUT_LENGTH values through the encoder and
    decoder back to a vector of the same length. It is compiled with Adam
    (learning rate 1e-4) and binary cross-entropy loss.

    Returns:
        The compiled Keras Model.
    """
    input_shape = Input(shape=(INPUT_LENGTH, ))
    model = Model(input_shape, create_convolutional_decoder(create_convolutional_encoder(input_shape)))

    model.compile(optimizer=AdamOptimizer(learning_rate=0.0001), loss='binary_crossentropy')

    # summary() prints the table itself and returns None; wrapping it in print()
    # would emit a stray "None" line after the table.
    model.summary()

    return model

In [6]:

def create_datasets(tsv_input, test_size=0.15, random_state=None):
    """Load a labelled TSV and split it into train/test arrays.

    The file must provide a CAT column (category label) and a PTS column
    (comma-separated numeric values). Rows labelled LOW_QUALITY are dropped.

    Args:
        tsv_input: Path (or file-like object) of the tab-separated input.
        test_size: Fraction of samples held out for testing.
        random_state: Optional seed forwarded to train_test_split to make the
            split reproducible; None keeps the split random.

    Returns:
        Tuple (train_x, train_y, test_x, test_y); the y arrays are one-hot
        encoded with NUM_CLASSES columns.
    """
    data = pd.read_csv(tsv_input, delimiter="\t")

    # Filter out all low quality reads
    data = data.loc[data.CAT != LOW_QUALITY]

    # Convert sequence strings to float arrays. The result is kept in its own
    # Series instead of being assigned back onto the filtered frame, which would
    # trigger pandas' SettingWithCopyWarning.
    points = data["PTS"].apply(string_to_array)

    # Convert labels from strings to ints
    ys = data["CAT"].apply(category_to_int).to_numpy()
    xs = np.stack(points.tolist())

    encoded_ys = to_categorical(ys, num_classes=NUM_CLASSES)

    print("XS shape: {}".format(xs.shape))
    print("One-hot encoded YS shape: {}".format(encoded_ys.shape))

    train_x, test_x, train_y, test_y = train_test_split(
        xs, encoded_ys, test_size=test_size, random_state=random_state
    )

    return train_x, train_y, test_x, test_y


def create_unlabeled_datasets(tsv_input, test_size=0.2, random_state=None):
    """Load a TSV and split its sequences into train/test arrays (no labels).

    The CAT column is still required: it is used only to drop rows labelled
    LOW_QUALITY. The PTS column holds comma-separated numeric values.

    Args:
        tsv_input: Path (or file-like object) of the tab-separated input.
        test_size: Fraction of samples held out for testing.
        random_state: Optional seed forwarded to train_test_split to make the
            split reproducible; None keeps the split random.

    Returns:
        Tuple (train_x, test_x) of stacked float arrays.
    """
    data = pd.read_csv(tsv_input, delimiter="\t")

    # Filter out all low quality reads
    data = data.loc[data.CAT != LOW_QUALITY]

    # Convert sequence strings to float arrays. The result is kept in its own
    # Series instead of being assigned back onto the filtered frame, which would
    # trigger pandas' SettingWithCopyWarning.
    points = data["PTS"].apply(string_to_array)
    xs = np.stack(points.tolist())
    print("Unlabeled XS shape: {}".format(xs.shape))

    train_x, test_x = train_test_split(xs, test_size=test_size, random_state=random_state)

    return train_x, test_x


def string_to_array(data):
    """Parse a comma-separated list of numbers into a float NumPy array.

    Args:
        data: Value whose string form is a comma-separated list of numbers.

    Returns:
        1-D numpy array of floats, one entry per comma-separated token.

    Raises:
        ValueError: If any token cannot be parsed as a float.
    """
    tokens = str(data).split(',')
    values = list(map(float, tokens))
    return np.array(values)


def category_to_int(data):
    """Map a category label to its integer class id.

    Args:
        data: Category label; compared by its string form.

    Returns:
        0 for REGULAR, 1 for REPEAT, and 2 for any other label.
    """
    class_ids = {REGULAR: 0, REPEAT: 1}
    # Every label other than REGULAR/REPEAT falls into class 2.
    return class_ids.get(str(data), 2)
    

def evaluate_autoencoder(model, train_x, test_x, epochs_num):
    """Train the autoencoder, then plot reconstructions and the loss curves.

    Args:
        model: Compiled Keras autoencoder.
        train_x: Training samples; used as both input and reconstruction target.
        test_x: Held-out samples; used for validation and for the
            reconstruction plot.
        epochs_num: Number of training epochs.
    """
    # An autoencoder learns to reproduce its input, so the targets are the
    # inputs themselves. Omitting y (or passing None as the validation target)
    # leaves the loss with nothing to compare against and makes fit() fail.
    history = model.fit(
        x=train_x,
        y=train_x,
        batch_size=64,
        epochs=epochs_num,
        validation_data=(test_x, test_x)
    )

    decoded_x = model.predict(test_x)
    print(decoded_x.shape)
    print(decoded_x[0].tolist())

    plot_decoded(test_x, decoded_x)

    plot_loss(history, epochs_num)


def plot_decoded(original_x, decoded_x):
    """Plot original samples (top row) above their reconstructions (bottom row).

    Shows up to 20 samples. The count and the x-axis length are taken from the
    data, so smaller sets or other sample lengths do not raise.

    Args:
        original_x: Indexable collection of original samples (arrays).
        decoded_x: Indexable collection of reconstructions, aligned with
            original_x by position.
    """
    # how many overlaps we will display (capped by what is actually available)
    n = min(20, len(original_x), len(decoded_x))
    plt.figure(figsize=(20, 4))
    for i in range(n):
        original = original_x[i].tolist()
        decoded = decoded_x[i].tolist()

        # display original
        plt.subplot(2, n, i + 1)
        plt.plot(range(len(original)), original)
        plt.gray()
        # display reconstruction
        plt.subplot(2, n, i + 1 + n)
        plt.plot(range(len(decoded)), decoded)
        plt.gray()
    plt.show()


def plot_loss(model, epochs_num):
    """Plot training and validation loss per epoch.

    Args:
        model: Object whose ``history`` attribute is a mapping with 'loss' and
            'val_loss' sequences (e.g. the value returned by ``fit``).
        epochs_num: Number of epochs; the x-axis runs from 1 to epochs_num.
    """
    recorded = model.history
    # Look both series up before touching matplotlib, so a missing key fails
    # before any figure is created.
    curves = (
        (recorded['loss'], 'r-', 'Training loss'),
        (recorded['val_loss'], 'b-', 'Validation loss'),
    )
    epoch_axis = range(1, epochs_num + 1)

    plt.figure()
    for values, line_format, caption in curves:
        plt.plot(epoch_axis, values, line_format, label=caption)
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()


def plot_confusion_matrix(matrix):
    """Render a 3x3 confusion matrix as an annotated heatmap.

    Args:
        matrix: 3x3 array-like; rows and columns are labelled
            regular / repeat / chimeric, in that order.
    """
    class_names = ["regular", "repeat", "chimeric"]
    labelled = pd.DataFrame(matrix, index=list(class_names), columns=list(class_names))

    plt.figure(figsize=(10, 7))
    sb.heatmap(labelled, annot=True)
    plt.show()
    

def test_autoencoder(dataset):
    """Build the autoencoder, load the data and run training plus evaluation.

    Args:
        dataset: Path to the tab-separated input passed to
            create_unlabeled_datasets.
    """
    model = create_conv_autoencoder()

    print("Creating dataset...")
    train_x, test_x = create_unlabeled_datasets(dataset)
    print("Dataset created!")

    # Normalize inputs using ONE scale derived from the training split.
    # Dividing each split by its own maximum would map the same raw value to
    # different normalized values in train and test, making the validation loss
    # and the reconstructions incomparable. Training data lands in [0, 1]
    # (assuming non-negative values); test values can marginally exceed 1 if
    # the test split contains a larger maximum.
    scale = np.max(train_x)
    train_x = train_x / scale
    test_x = test_x / scale

    evaluate_autoencoder(model, train_x, test_x, epochs_num=70)

In [7]:
# Train and evaluate the convolutional autoencoder on the classified overlaps TSV.
# NOTE(review): absolute, environment-specific path — consider a configurable data directory.
test_autoencoder("/floyd/input/overlaps/classified_7000.tsv")

shape of encoded (None, 100, 32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 5000)              0         
_________________________________________________________________
reshape_2 (Reshape)          (None, 5000, 1)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 5000, 64)          2048      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 500, 64)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 500, 32)           63520     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
up_sampling1d_2 (UpSampling1 (None, 500, 32

IndexError: list index out of range