In [1]:
# Imports
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from nn.nn import NeuralNetwork
from nn.io import read_text_file, read_fasta_file
from nn.preprocess import one_hot_encode_seqs, sample_seqs
import random
import plotly.graph_objects as go

In [2]:
# Read in the 137 positive Rap1 motif examples
positive_seqs = read_text_file('./data/rap1-lieb-positives.txt')

# Read in all the negative examples
negative_seqs = read_fasta_file('./data/yeast-upstream-1k-negative.fa')

# Total seqs
all_seqs = positive_seqs + negative_seqs

# Get minimum length
min_len = min(len(seq) for seq in all_seqs)

# Shorten the sequences for same lengths
shortened_all_seqs=[]

for seq in all_seqs:
    if len(seq) > min_len:
        start = random.randint(0, len(seq) - min_len)
        shortened_all_seqs.append(seq[start:start + min_len])
    else:
        shortened_all_seqs.append(seq)
    
# Creating labels
labels=(['positive'] * len(positive_seqs)) + (['negative'] * len(negative_seqs))   

# Explanation of sampling scheme
shortened_all_seq contains trimmed sequences which are all of the same length, the length of the shortest sequence in our dataset. 

In our sampling process, we randomly sample sequences with replacement from the minority class so that there are same number of sequences in the positive and negative class. A weakness of this method is the possibility of overrepresenting certain sequences and biasing our model. 

In [3]:
# Use sample seqs from preprocess
sampled_seqs, sampled_labels = sample_seqs(shortened_all_seqs, labels)

# One hot encode
encode_OH = one_hot_encode_seqs(sampled_seqs)

In [4]:
# Labels to binary
labels_binary = [0 if val == 'negative' else 1 for val in sampled_labels]

#split train and test 
X_train, X_val, y_train, y_val= train_test_split(encode_OH, np.expand_dims(labels_binary, 1), 
                                                  test_size=0.2, random_state=42)


In [5]:
# Defining the architecture for autoencoder

classifier_architecture = [
    {'input_dim': 68, 'output_dim': 34, 'activation': 'sigmoid'},
    {'input_dim': 34, 'output_dim': 17, 'activation': 'sigmoid'},
    {'input_dim': 17, 'output_dim': 1, 'activation': 'sigmoid'}
]

# Define hyperparameters
learning_rate = 0.01
random_seed = 42
epochs = 50
batch_size = 50
#loss_function = 'mean_squared_error'
loss_function = 'binary_cross_entropy'

# Create an instance of NeuralNetwork for the autoencoder
classifier = NeuralNetwork(
    nn_arch=classifier_architecture,
    lr=learning_rate,
    seed=random_seed,
    batch_size=batch_size,
    epochs=epochs,
    loss_function=loss_function
)


In [6]:
per_epoch_loss_train, per_epoch_loss_val= classifier.fit(X_train, y_train, X_val, y_val)

In [7]:

# Plot your training and validation loss by epoch.
fig = go.Figure()

# Add training loss trace
fig.add_trace(go.Scatter(x=np.arange(1, len(per_epoch_loss_train)+1), y=per_epoch_loss_train, mode='lines', name='Training Loss'))

# Add validation loss trace
fig.add_trace(go.Scatter(x=np.arange(1, len(per_epoch_loss_val)+1), y=per_epoch_loss_val, mode='lines', name='Validation Loss'))

# Update layout
fig.update_layout(title='Training and Validation Loss',
                   xaxis_title='Epoch',
                   yaxis_title='Loss',
                   legend=dict(x=0, y=1),
                   margin=dict(l=0, r=0, t=40, b=0))

# Show plot
fig.show()

In [8]:
Pred = classifier.predict(X_val)
pred_binary = np.where(Pred > 0.5, 1, 0)

print('Validation set accuracy:', np.sum(pred_binary == y_val) / len(y_val)) 

Validation set accuracy: 1.0


# Troubleshoot

Accuracy of a 100% seems suspicious (but I'll take it!) 

# Define hyperparameters
Hyperparameters were tuned manually to find the best setting.

# learning_rate = 0.01
Based on minimized loss over smaller epochs

# epochs = 50
Loss is very close to zero without any signs of overfitting. 

# batch_size = 50
loss_function = 'binary_cross_entropy'
Since the goal is binary classification. 

# Sigmoid
Since it is appropriate in classification problems due to its output range of [0, 1]. For example, it's use in Logistic Regression. 