In [1]:
import tensorflow as tf
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np

import time

In [2]:
# Shared preprocessing objects, both created with library defaults:
# one encodes the raw class labels, the other rescales the feature columns.
label_encoder, min_max_scaler = LabelEncoder(), MinMaxScaler()

In [3]:
# Read the feature table and the label table, dropping the saved index column
# and the library identifier column from each before taking the raw arrays.
# NOTE(review): rows of the two files are paired purely by position -- confirm
# that both CSVs list the same seqLibID values in the same order.
X = (
    pd.read_csv(r'../data/X_expr.csv')
    .drop(columns=['Unnamed: 0', 'seqLibID'])
    .values
)
y = (
    pd.read_csv(r'../data/y_cog.csv')
    .drop(columns=['Unnamed: 0', 'seqLibID'])
    .values
)

# Flatten the label column to 1-D and replace the labels with encoded class ids.
y = label_encoder.fit_transform(y.ravel())

In [4]:
# Hold out 10% of the samples for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training partition only, then apply the already
# fitted scaler to the test partition.
X_train = min_max_scaler.fit_transform(X_train, y_train)
X_test = min_max_scaler.transform(X_test)

# Training batches: shuffled with a buffer of 10000, 100 samples per batch.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .shuffle(10000)
    .batch(100)
)

# Test batches: unshuffled, 32 samples per batch.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((X_test, y_test))
    .batch(32)
)

In [5]:
def get_rf_encoder(X_train, y_train, n_estimators=1000, random_state=None):
    """
    Fit a random forest whose individual trees can later be used as an encoder.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``RandomForestClassifier.fit``.
    n_estimators : int, default 1000
        Number of trees in the forest.
    random_state : int or None, default None
        Seed forwarded to the forest. ``None`` leaves the fit unseeded.

    Returns
    -------
    RandomForestClassifier
        The fitted forest; its trees are available as ``model.estimators_``.
    """
    # None is already the estimator's default for random_state, so the value
    # can be forwarded unconditionally instead of branching on it.
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    return model

def get_nn_decoder(latent_shape, layer_sizes, dropout_rates=None, activation='relu'):
    """
    Build a fully connected Keras model that ends in a softmax layer.

    Not strictly for decoding; can also be used for classification.

    Parameters
    ----------
    latent_shape : int or tuple of int
        Shape of one input sample, without the batch axis. A bare integer
        ``n`` is treated as ``(n,)``.
    layer_sizes : sequence of int
        Width of every Dense layer, in order. The last entry is the output
        layer and always uses softmax; all earlier entries are hidden layers.
    dropout_rates : sequence of float, optional
        ``dropout_rates[i]`` is the rate of the Dropout layer placed after
        hidden layer ``i``. At least one entry per hidden layer is required;
        extra entries are ignored. ``None`` (default) adds no Dropout layers.
    activation : str or callable, default 'relu'
        Activation used by the hidden layers.

    Returns
    -------
    tf.keras.Model
        Model mapping the input layer to the softmax output.

    Raises
    ------
    ValueError
        If ``layer_sizes`` is empty, or ``dropout_rates`` has fewer entries
        than there are hidden layers.
    """
    layer_sizes = list(layer_sizes)
    if not layer_sizes:
        raise ValueError("layer_sizes must contain at least one entry (the output layer)")

    # Splitting off the output layer first guarantees it gets softmax even
    # when it is the only layer (index 0).
    *hidden_sizes, output_size = layer_sizes

    if dropout_rates is not None and len(dropout_rates) < len(hidden_sizes):
        raise ValueError(
            f"dropout_rates has {len(dropout_rates)} entries but there are "
            f"{len(hidden_sizes)} hidden layers"
        )

    # ``(1000)`` is just the int 1000; normalise it to a real shape tuple.
    if isinstance(latent_shape, int):
        latent_shape = (latent_shape,)

    input_layer = layers.Input(shape=latent_shape)
    x = input_layer
    for i, n_nodes in enumerate(hidden_sizes):
        x = layers.Dense(n_nodes, activation=activation)(x)
        if dropout_rates is not None:
            x = layers.Dropout(dropout_rates[i])(x)

    x = layers.Dense(output_size, activation=tf.keras.activations.softmax)(x)

    return tf.keras.Model(inputs=input_layer, outputs=x)

In [6]:
# Cross-entropy for integer class labels; from_logits=False because the
# model outputs are already probabilities.
loss_object_sparse = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# Mean-squared-error loss (not referenced by the cells visible here).
loss_object_autoencoder = tf.keras.losses.MeanSquaredError()

# Adam with a fixed learning rate of 1e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

In [7]:
# Running statistics accumulated over the training batches of one epoch.
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy_sparse = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

# The same pair of statistics, accumulated over the test batches.
test_loss = tf.keras.metrics.Mean(name="test_loss")
test_accuracy_sparse = tf.keras.metrics.SparseCategoricalAccuracy(name="test_accuracy")

In [8]:
@tf.function
def train_step(model, data, labels):
    """
    Run one optimisation step on a single batch.

    Computes the sparse cross-entropy of ``model(data)`` against ``labels``,
    applies the resulting gradients to the model's trainable variables with
    the module-level ``optimizer``, and feeds the batch into the
    ``train_loss`` / ``train_accuracy_sparse`` metrics. Returns nothing.
    """
    with tf.GradientTape() as tape:
        # training=True puts layers such as Dropout into their training
        # behaviour; it has no effect on layers that act the same in both modes.
        outputs = model(data, training=True)
        batch_loss = loss_object_sparse(labels, outputs)

    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Accumulate this batch into the running training metrics.
    train_loss(batch_loss)
    train_accuracy_sparse(labels, outputs)

@tf.function
def test_step(model, data, labels):
    """
    Evaluate a single batch without updating any weights.

    Computes the sparse cross-entropy of ``model(data)`` against ``labels``
    and feeds the batch into the ``test_loss`` / ``test_accuracy_sparse``
    metrics. Returns nothing.
    """
    # training=False puts layers such as Dropout into their inference
    # behaviour; it has no effect on layers that act the same in both modes.
    outputs = model(data, training=False)
    batch_loss = loss_object_sparse(labels, outputs)

    # Accumulate this batch into the running test metrics.
    test_loss(batch_loss)
    test_accuracy_sparse(labels, outputs)

In [9]:
# Number of trees in the forest.
n_estimators = 1000

# Seed the forest with the same value the train/test split uses, so the
# fitted encoder is reproducible across kernel restarts.
rf_encoder = get_rf_encoder(X_train, y_train, n_estimators=n_estimators, random_state=42)

In [10]:
# 620, -620
# NOTE(review): the meaning of the two numbers above is not evident from this
# notebook -- confirm or remove.
EPOCHS = 250

tf.keras.backend.clear_session()

# The decoder receives one input feature per tree of the fitted forest, so
# take the input width from the encoder instead of repeating the literal.
# (The previous ``(1000)`` was a plain int, not a one-element tuple.)
decoder = get_nn_decoder((len(rf_encoder.estimators_),), [500, 200, 100, 10, 3])

def rf_encode(encoder, data):
    """
    Encode samples as the vector of per-tree predictions of a fitted ensemble.

    Parameters
    ----------
    encoder : fitted ensemble
        Object exposing its fitted trees as ``estimators_``; every tree must
        provide ``predict``.
    data : tf.Tensor or array-like, shape (n_samples, n_features)
        Samples to encode. Eager tensors are converted with ``.numpy()``;
        anything else goes through ``np.asarray``.

    Returns
    -------
    tf.Tensor
        float64 tensor of shape (n_samples, n_trees); column ``j`` holds the
        prediction of tree ``j`` for every sample.
    """
    trees = encoder.estimators_
    samples = data.numpy() if hasattr(data, 'numpy') else np.asarray(data)

    if samples.shape[0] == 0:
        # Nothing to predict; keep the (0, n_trees) result shape.
        output = np.zeros(shape=(0, len(trees)))
    else:
        # One batched predict per tree gives the same values as predicting
        # every (sample, tree) pair separately, with far fewer calls.
        output = np.stack([tree.predict(samples) for tree in trees], axis=1)
        output = output.astype(np.float64)

    return tf.convert_to_tensor(output)


# The confusion matrices are square in the number of decoder outputs. Passing
# the full label set to confusion_matrix keeps every per-batch matrix at that
# size even when a batch happens to contain (and predict) only some classes;
# without it a smaller matrix would fail to add or be silently broadcast.
n_classes = decoder.output_shape[-1]
class_ids = np.arange(n_classes)

for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_accuracy_sparse.reset_states()
    test_loss.reset_states()
    test_accuracy_sparse.reset_states()

    # Training pass. The forest encoding of each batch is kept so the
    # confusion-matrix pass below does not have to encode the data again.
    encoded_train_batches = []
    for data, labels in train_ds:
        encoded_data = rf_encode(rf_encoder, data)
        train_step(decoder, encoded_data, labels)
        encoded_train_batches.append((encoded_data, labels))

    train_cnf = np.zeros(shape=(n_classes, n_classes))
    test_cnf = np.zeros(shape=(n_classes, n_classes))

    # Training confusion matrix, computed with the end-of-epoch weights.
    for encoded_data, labels in encoded_train_batches:
        predictions = decoder(encoded_data, training=False).numpy().argmax(axis=1)
        train_cnf += confusion_matrix(labels, predictions, labels=class_ids)

    # Test pass: the weights no longer change, so the metrics and the
    # confusion matrix can share one encoding of each batch.
    for test_data, test_labels in test_ds:
        encoded_test_data = rf_encode(rf_encoder, test_data)
        test_step(decoder, encoded_test_data, test_labels)
        predictions_test = decoder(encoded_test_data, training=False).numpy().argmax(axis=1)
        test_cnf += confusion_matrix(test_labels, predictions_test, labels=class_ids)

    print(train_cnf)
    print(test_cnf)

    print(
        f'Epoch {epoch + 1}, '
        f'Loss: {train_loss.result()}, '
        f'Accuracy: {train_accuracy_sparse.result() * 100}, '
        f'Test Loss: {test_loss.result()}, '
        f'Test Accuracy: {test_accuracy_sparse.result() * 100}'
    )


[[ 92.   0.   1.]
 [ 63.   0.   3.]
 [100.   0.   3.]]
[[11.  0.  0.]
 [ 7.  0.  0.]
 [11.  0.  1.]]
Epoch 1, Loss: 1.6438946723937988, Accuracy: 35.496185302734375, Test Loss: 1.461470365524292, Test Accuracy: 40.0
[[92.  0.  1.]
 [57.  0.  9.]
 [82.  0. 21.]]
[[10.  0.  1.]
 [ 7.  0.  0.]
 [11.  0.  1.]]
Epoch 2, Loss: 1.4226804971694946, Accuracy: 38.931297302246094, Test Loss: 1.3736521005630493, Test Accuracy: 36.66666793823242
[[91.  0.  2.]
 [50.  0. 16.]
 [62.  0. 41.]]
[[10.  0.  1.]
 [ 7.  0.  0.]
 [11.  0.  1.]]
Epoch 3, Loss: 1.2763837575912476, Accuracy: 45.8015251159668, Test Loss: 1.2977794408798218, Test Accuracy: 36.66666793823242
[[88.  2.  3.]
 [43.  0. 23.]
 [48.  0. 55.]]
[[ 9.  0.  2.]
 [ 7.  0.  0.]
 [11.  0.  1.]]
Epoch 4, Loss: 1.166983962059021, Accuracy: 51.90839767456055, Test Loss: 1.2324671745300293, Test Accuracy: 33.333335876464844
[[88.  2.  3.]
 [36.  0. 30.]
 [34.  1. 68.]]
[[9. 0. 2.]
 [7. 0. 0.]
 [7. 0. 5.]]
Epoch 5, Loss: 1.0959889888763428, Accura

KeyboardInterrupt: 