In [17]:
import tensorflow as tf
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import pandas as pd
import numpy as np

import time

In [18]:
# Shared preprocessing objects used by the cells below: one encoder for the
# class labels and one scaler for the feature columns.
label_encoder, min_max_scaler = LabelEncoder(), MinMaxScaler()

In [19]:
# Load the expression matrix and the labels. Both files carry a leftover index
# column ('Unnamed: 0') and a sample identifier column ('seqLibID').
expr_df = pd.read_csv(r'../data/X_expr.csv')
cog_df = pd.read_csv(r'../data/y_cog.csv')

# Features and labels are paired purely by row position from here on, so make
# sure both files list the same samples in the same order before the
# identifier column is thrown away.
if not np.array_equal(expr_df['seqLibID'].to_numpy(), cog_df['seqLibID'].to_numpy()):
    raise ValueError(
        "X_expr.csv and y_cog.csv do not contain the same seqLibID values in the same order"
    )

X = expr_df.drop(['Unnamed: 0', 'seqLibID'], axis=1).values
y = cog_df.drop(['Unnamed: 0', 'seqLibID'], axis=1).values
# Flatten the label column to 1-D and map the labels to integer class ids.
y = label_encoder.fit_transform(y.ravel())

In [20]:
# Hold out 10% of the samples for testing; the split is stratified on the
# labels and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Learn the scaling on the training part only, then apply it to both parts so
# no information from the test samples enters the preprocessing.
min_max_scaler.fit(X_train)
X_train = min_max_scaler.transform(X_train)
X_test = min_max_scaler.transform(X_test)

# Batched tf.data pipelines; only the training data is shuffled.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(10000).batch(100)

test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.batch(32)

In [21]:
# Sanity check: smallest value of the first feature column after scaling.
X_train[:, 0].min()

0.0

##### Encoders

In [22]:
def get_nn_encoder(input_shape, layer_sizes, activation='relu'):
    """
    Build a fully connected encoder as a Keras functional model.

    Args:
        input_shape: Shape of one input sample (without the batch axis); it is
            passed straight to ``layers.Input``.
        layer_sizes: Number of units of each ``Dense`` layer, in order. The
            last entry is the width of the encoder output.
        activation: Activation used by every ``Dense`` layer.

    Returns:
        A ``tf.keras.Model`` mapping the input to the output of the last
        ``Dense`` layer.

    Raises:
        ValueError: If ``layer_sizes`` is empty (there would be no layer to
            produce an output).
    """
    layer_sizes = list(layer_sizes)
    if not layer_sizes:
        raise ValueError("layer_sizes must contain at least one layer size")

    input_layer = layers.Input(shape=input_shape)

    # Thread the tensor through the stack; starting from the input tensor
    # removes the need to special-case the first layer.
    x = input_layer
    for n_nodes in layer_sizes:
        x = layers.Dense(n_nodes, activation=activation)(x)

    return tf.keras.Model(inputs=input_layer, outputs=x)



def get_cnn_encoder(input_shape, filters, kernel_sizes, dnn_layer_sizes, strides=None, paddings="valid"):
    """
    Build a convolutional encoder followed by a stack of dense layers.

    Every entry of ``filters`` adds one block of ``Conv2D`` ->
    ``BatchNormalization`` -> ``LeakyReLU``. The output of the last block is
    flattened and passed through one ``Dense`` layer per entry of
    ``dnn_layer_sizes``.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        filters: Number of filters of each convolution, in order.
        kernel_sizes: Kernel size of each convolution; indexed like ``filters``.
        dnn_layer_sizes: Number of units of each dense layer after flattening.
        strides: Stride of each convolution; defaults to ``(1, 1)`` everywhere.
        paddings: Padding mode of the convolutions. Either a single string
            applied to every convolution or a sequence indexed like
            ``filters``.

    Returns:
        A tuple ``(model, pre_flatten_dim, n_flatten_nodes)`` where
        ``pre_flatten_dim`` is the static shape (batch axis included) of the
        tensor entering ``Flatten`` and ``n_flatten_nodes`` is the static shape
        of the tensor leaving it.
    """
    n_conv = len(filters)
    if strides is None:
        strides = [(1, 1)] * n_conv
    # A single padding mode is broadcast to every convolution.
    if isinstance(paddings, str):
        paddings = [paddings] * n_conv

    input_layer = layers.Input(shape=input_shape)

    x = input_layer
    for i in range(n_conv):
        x = layers.Conv2D(filters[i], kernel_sizes[i], strides=strides[i], padding=paddings[i])(x)
        x = layers.BatchNormalization()(x)
        x = layers.LeakyReLU()(x)

    # Read the shapes from the tensors on either side of Flatten. Looking them
    # up through model.layers[len(filters) - 1] / [len(filters)] picks the
    # wrong layers, because the layer list also holds the input layer and
    # three layers per convolution block.
    pre_flatten_dim = tf.keras.backend.int_shape(x)
    x = layers.Flatten()(x)
    n_flatten_nodes = tf.keras.backend.int_shape(x)

    # NOTE(review): these dense layers have no activation, so consecutive ones
    # compose into a single linear map - confirm this is intended.
    for n_nodes in dnn_layer_sizes:
        x = layers.Dense(n_nodes)(x)

    model = tf.keras.Model(inputs=input_layer, outputs=x)

    return model, pre_flatten_dim, n_flatten_nodes


def get_rf_encoder(X_train, y_train, n_estimators=1000, random_state=None):
    """
    Fit a random forest classifier on the given training data.

    Args:
        X_train: Training features, forwarded to ``fit``.
        y_train: Training labels, forwarded to ``fit``.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to the forest; when ``None`` the argument
            is not passed at all.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest_kwargs = {"n_estimators": n_estimators}
    if random_state is not None:
        forest_kwargs["random_state"] = random_state

    forest = RandomForestClassifier(**forest_kwargs)
    forest.fit(X_train, y_train)

    return forest
    

##### Decoders

In [23]:
def get_nn_decoder(latent_shape, layer_sizes, activation='relu', output_activation='softmax'):
    """
    Build a fully connected network on top of a latent representation.

    Not strictly for decoding; can also be used for classification.

    Args:
        latent_shape: Shape of one latent sample (without the batch axis); it
            is passed straight to ``layers.Input``.
        layer_sizes: Number of units of each ``Dense`` layer, in order. The
            last entry is the output layer.
        activation: Activation of every layer except the last one.
        output_activation: Activation of the last layer. Defaults to softmax
            (class probabilities); pass something else for non-classification
            outputs.

    Returns:
        A ``tf.keras.Model`` mapping the latent input to the output layer.

    Raises:
        ValueError: If ``layer_sizes`` is empty.
    """
    layer_sizes = list(layer_sizes)
    if not layer_sizes:
        raise ValueError("layer_sizes must contain at least one layer size")

    input_layer = layers.Input(shape=latent_shape)

    x = input_layer
    for n_nodes in layer_sizes[:-1]:
        x = layers.Dense(n_nodes, activation=activation)(x)

    # The last entry is always the output layer, also when it is the only
    # layer (previously a single-layer network got the hidden activation).
    x = layers.Dense(layer_sizes[-1], activation=output_activation)(x)

    return tf.keras.Model(inputs=input_layer, outputs=x)


def get_cnn_decoder(latent_shape):
    """Placeholder for a convolutional decoder: not implemented, returns None."""
    return None

def get_svm_decoder(latent_shape):
    """Placeholder for an SVM decoder: not implemented, returns None."""
    return None

def get_logistic_decoder(latent_shape):
    """Placeholder for a logistic-regression decoder: not implemented, returns None."""
    return None

In [24]:
# Classification loss on integer class labels; from_logits=False means the
# model output is treated as probabilities rather than raw logits.
loss_object_sparse = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
# Reconstruction loss used by the autoencoder training step.
loss_object_autoencoder = tf.keras.losses.MeanSquaredError()
# `lr` is a deprecated alias that triggers a warning; `learning_rate` is the
# supported argument name.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00001)

  super(Adam, self).__init__(name, **kwargs)


In [25]:
# Running aggregates that the training loops reset at the start of each epoch
# and report at its end.
train_loss, test_loss = (
    tf.keras.metrics.Mean(name=metric_name)
    for metric_name in ('train_loss', 'test_loss')
)

train_accuracy_sparse, test_accuracy_sparse = (
    tf.keras.metrics.SparseCategoricalAccuracy(name=metric_name)
    for metric_name in ('train_accuracy', 'test_accuracy')
)

In [26]:
@tf.function
def train_step(model, data, labels):
    """
    Run one optimisation step of ``model`` on a single batch.

    Uses the notebook-level ``loss_object_sparse`` and ``optimizer`` and
    records the batch loss and accuracy in ``train_loss`` and
    ``train_accuracy_sparse``.
    """
    with tf.GradientTape() as tape:
        # training=True matters only for layers that behave differently while
        # training (Dropout, for example).
        batch_predictions = model(data, training=True)
        batch_loss = loss_object_sparse(labels, batch_predictions)

    weights = model.trainable_variables
    weight_gradients = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(weight_gradients, weights))

    train_loss(batch_loss)
    train_accuracy_sparse(labels, batch_predictions)

@tf.function
def test_step(model, data, labels):
    """
    Evaluate ``model`` on a single batch without updating its weights.

    Records the batch loss and accuracy in ``test_loss`` and
    ``test_accuracy_sparse``.
    """
    # training=False matters only for layers that behave differently at
    # inference time (Dropout, for example).
    batch_predictions = model(data, training=False)
    batch_loss = loss_object_sparse(labels, batch_predictions)

    test_loss(batch_loss)
    test_accuracy_sparse(labels, batch_predictions)

In [27]:
@tf.function
def train_step_autoencoder(encoder, decoder, data, labels):
    """
    Run one joint optimisation step of an encoder/decoder pair on one batch.

    The batch is encoded, decoded again, and the reconstruction is compared
    with the input using the notebook-level ``loss_object_autoencoder``.

    Args:
        encoder: Model mapping ``data`` to a latent representation.
        decoder: Model mapping the latent representation back to data space.
        data: Batch of input samples; also the reconstruction target.
        labels: Unused; accepted so the call signature matches the batches
            yielded by the dataset.

    Returns:
        The reconstruction loss of this batch.
    """
    with tf.GradientTape() as tape:
        latent = encoder(data, training=True)
        generated_data = decoder(latent, training=True)
        loss = loss_object_autoencoder(data, generated_data)

    # Both networks are trained on the same loss, so one tape and a single
    # optimizer update over all variables is enough. Two separate
    # apply_gradients calls would advance the shared optimizer's step counter
    # twice per batch.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

##### Autoencoder training

In [None]:
EPOCHS_AE = 50

# The input width follows the data instead of being hard-coded; the remaining
# entries are the hidden widths down to the 400-dimensional latent space.
n_features = X_train.shape[1]
layer_sizes = [n_features, 10000, 2500, 2500, 400]
layer_sizes_reversed = list(reversed(layer_sizes))

# Shapes are passed as one-element tuples: `(n)` is just an int, `(n,)` is a shape.
encoder = get_nn_encoder((layer_sizes[0],), layer_sizes=layer_sizes[1:])
# NOTE(review): get_nn_decoder puts a softmax on its last layer, so every
# reconstructed sample sums to 1 while the targets are min-max scaled
# features - confirm this is intended for an MSE reconstruction.
decoder = get_nn_decoder((layer_sizes_reversed[0],), layer_sizes=layer_sizes_reversed[1:])

for epoch in range(EPOCHS_AE):
    start_time = time.time()
    # Average the per-batch losses so the reported number describes the whole
    # epoch rather than only its last batch.
    epoch_loss = tf.keras.metrics.Mean()
    for data, labels in train_ds:
        loss = train_step_autoencoder(encoder, decoder, data, labels)
        epoch_loss.update_state(loss)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss.result()}, Time for epoch: {time.time() - start_time}")

In [None]:
# Print the layer-by-layer summary of the encoder.
encoder.summary()

In [None]:
# Print the layer-by-layer summary of the decoder.
decoder.summary()

##### Decoder training

In [None]:
EPOCHS = 5

# Classifier trained on top of the encoder output. It has to be created here:
# with the definition commented out the cell only worked when `model` was left
# over from an earlier kernel state. The input shape is read from the encoder
# so the two always match; the layer widths are those of the original
# (commented-out) definition.
model = get_nn_decoder(encoder.output_shape[1:], layer_sizes=[300, 150, 150, 50, 3])

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy_sparse.reset_states()
    test_loss.reset_states()
    test_accuracy_sparse.reset_states()

    for data, labels in train_ds:
        # Call the encoder directly: `predict` is meant for large inputs and
        # adds per-call overhead when used batch by batch inside a loop.
        encoded_data = encoder(data, training=False)
        train_step(model, encoded_data, labels)

    for test_data, test_labels in test_ds:
        encoded_test_data = encoder(test_data, training=False)
        test_step(model, encoded_test_data, test_labels)

    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Accuracy: {train_accuracy_sparse.result() * 100}, '
        f'Test Loss: {test_loss.result()}, '
        f'Test Accuracy: {test_accuracy_sparse.result() * 100}'
    )


In [None]:
# Peek at the first training batch: the type of the feature tensor and the labels.
for data, labels in train_ds.take(1):
    print(type(data))
    print(labels)

In [28]:
n_estimators = 1000
# Seed the forest (same seed as the train/test split) so the encoded features,
# and therefore the results below, are reproducible across runs.
rf_encoder = get_rf_encoder(X_train, y_train, n_estimators=n_estimators, random_state=42)

In [29]:
# 620, -620  (original note; its meaning is not documented)
EPOCHS = 25

tf.keras.backend.clear_session()
# The encoding has one feature per tree, so the input width is taken from the
# fitted forest instead of repeating the number of estimators by hand.
decoder = get_nn_decoder((len(rf_encoder.estimators_),), [500, 200, 100, 10, 3])


def rf_encode(encoder, data):
    """
    Encode a batch of samples as the per-tree predictions of a fitted forest.

    Args:
        encoder: Fitted ensemble exposing its trees as ``estimators_``; every
            tree must provide ``predict``.
        data: Batch of samples with the sample axis first. Eager tensors and
            NumPy arrays are both accepted.

    Returns:
        A float64 tensor of shape ``(n_samples, n_trees)`` whose entry
        ``[i, j]`` is the prediction of tree ``j`` for sample ``i``.
    """
    trees = encoder.estimators_
    n_trees = len(trees)

    # np.asarray accepts eager tensors as well as plain arrays.
    samples = np.asarray(data)
    n_samples = samples.shape[0]

    if n_samples == 0 or n_trees == 0:
        # Nothing to predict; keep the (n_samples, n_trees) contract.
        return tf.convert_to_tensor(np.zeros(shape=(n_samples, n_trees)))

    # Each sample is presented to the trees as one flat feature row.
    samples = samples.reshape(n_samples, -1)

    # One predict call per tree over the whole batch, instead of one call per
    # (sample, tree) pair: same values, far fewer Python-level calls.
    output = np.column_stack([tree.predict(samples) for tree in trees]).astype(np.float64)

    return tf.convert_to_tensor(output)


# All running metrics, so they can be cleared together at the start of an epoch.
epoch_metrics = (train_loss, train_accuracy_sparse, test_loss, test_accuracy_sparse)

for epoch in range(EPOCHS):
    for metric in epoch_metrics:
        metric.reset_states()

    # One pass over the training batches: encode with the forest, then update
    # the decoder.
    for data, labels in train_ds:
        encoded_data = rf_encode(rf_encoder, data)
        train_step(decoder, encoded_data, labels)

    # One pass over the held-out batches for evaluation only.
    for test_data, test_labels in test_ds:
        encoded_test_data = rf_encode(rf_encoder, test_data)
        test_step(decoder, encoded_test_data, test_labels)

    epoch_report = (
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Accuracy: {train_accuracy_sparse.result() * 100}, '
        f'Test Loss: {test_loss.result()}, '
        f'Test Accuracy: {test_accuracy_sparse.result() * 100}'
    )
    print(epoch_report)


Epoch 1, Loss: 1.0200287103652954, Accuracy: 43.1297721862793, Test Loss: 1.1670171022415161, Test Accuracy: 36.66666793823242
Epoch 2, Loss: 0.9879631400108337, Accuracy: 43.511451721191406, Test Loss: 1.168146014213562, Test Accuracy: 40.0
Epoch 3, Loss: 0.9739248156547546, Accuracy: 43.511451721191406, Test Loss: 1.1680554151535034, Test Accuracy: 40.0
Epoch 4, Loss: 0.9544284343719482, Accuracy: 43.89312744140625, Test Loss: 1.1672734022140503, Test Accuracy: 40.0
Epoch 5, Loss: 0.9476329684257507, Accuracy: 44.274810791015625, Test Loss: 1.1661981344223022, Test Accuracy: 40.0
Epoch 6, Loss: 0.9412970542907715, Accuracy: 44.274810791015625, Test Loss: 1.1646977663040161, Test Accuracy: 40.0
Epoch 7, Loss: 0.9238945841789246, Accuracy: 45.03816604614258, Test Loss: 1.1619668006896973, Test Accuracy: 40.0
Epoch 8, Loss: 0.9109888076782227, Accuracy: 45.41984939575195, Test Loss: 1.1606254577636719, Test Accuracy: 40.0
Epoch 9, Loss: 0.9015046954154968, Accuracy: 45.8015251159668, Te

In [None]:
# Shape of the most recently encoded training batch (left over from the loop above).
print(encoded_data.shape)