# Classifier

## Import dependencies

In [35]:
from nn.nn import NeuralNetwork
import nn
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Read in and process data

In [36]:
from nn import io

# Read in positive sequences
pos_seqs = io.read_text_file('data/rap1-lieb-positives.txt')

# Read in negative sequences
neg_seqs = io.read_fasta_file('data/yeast-upstream-1k-negative.fa')

# Process data so that negative and positive sequences are the same length

# Initialize a list to store processed sequences
neg_seqs_processed = []

# Positive sequences are shorter than negative sequences
# so set shorter length variable to the length of the positive sequences
pos_seq_len = len(pos_seqs[0])

# For each of the negative sequences, iterate through the sequence and
# return a subsequence with the same length as the positive sequences.
# Then, store that subsequence in the seqs_processed list
for seq in neg_seqs:
    for i in range(len(seq) - pos_seq_len + 1):
        sub_seq = seq[i:i + pos_seq_len]
        neg_seqs_processed.append(sub_seq)

## Combine positive and negative sequences and generate labels

In [37]:
from nn import preprocess
seqs = pos_seqs + neg_seqs_processed
pos_labels = [1] * len(pos_seqs)
neg_labels = [0] * len(neg_seqs_processed)
labels = pos_labels + neg_labels

samples, sample_labels = preprocess.sample_seqs(seqs, labels)

## Generate a one-hot encoding of sequences

In [38]:
X = preprocess.one_hot_encode_seqs(samples)
y = np.array(sample_labels, dtype=int)

## Split data into training and validation sets

In [39]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Create an instance of the NeuralNetwork class

In [43]:
# Creatе an instance of the NeuralNetwork class with defined hyperparameters

nn_arch = [
    {"input_dim": 68, "output_dim": 32, "activation": "relu"},
    {"input_dim": 32, "output_dim": 16, "activation": "sigmoid"},
    {"input_dim": 16, "output_dim": 4, "activation": "relu"},
    {"input_dim": 4, "output_dim": 1, "activation": "sigmoid"}
]

# Create NeuralNetwork instance
nn = NeuralNetwork(nn_arch, lr=0.01, seed=42, batch_size=16, epochs=1000, loss_function="binary cross entropy")

In [44]:
# Train the neural network
training_loss, validation_loss = nn.fit(X_train, y_train, X_val, y_val)

IndexError: tuple index out of range

## Plot training and validation loss

In [42]:
# Plot training and validation loss by epoch
plt.plot(training_loss, label="Training Loss")
plt.plot(validation_loss, label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.show()

print(f'Average Training Error: {np.mean(training_loss)}')
print(f'Average Validation Error: {np.mean(validation_loss)}')

NameError: name 'training_loss' is not defined

# Hyperparameter choice rationale:

### Dimensions
The input dimension was selected to be 68 since each nucleotide's encoding is represented by a 1x4 vector
and each sequence of nucleotides is 17-nt in length (68 = 4 x 17). The output dimension was selected to be 1 since we are looking to do binary classification.

### Learning rate
The learning rate lr is set to 0.01, since that seems to be a common choice for gradient descent optimization. Decreasing the learning rate would make training go slower, but also risks getting stuck at a local minimum. Conversely, increasing the learning rate would allow model training to proceed more quickly but risks divergent behavior.

### Seed value
The random seed is set to 42 because, according to the supercomputer Deep Thought, the number 42 is the "Answer to the Ultimate Question of Life, The Universe, and Everything."

### Batch size
The batch size is set to 16, because it seemed to be a reasonable size for this dataset.

### Epoch number
The number of epochs is set to 1000, because I was planning to see how the model performs at 1000 epochs and see how the error changes as I decrease the number of epochs the model runs through.

### Choice of loss function
The loss function was selected to be the binary cross entropy function because it is well suited for binary classification problems such as this as it compares each of the predicted values to the ground truth values and returns a value of 0 or 1.

## Comments

My autoencoder runs using either loss function, but I think there is an issue in how I read in or segment the data I train on for the classifier, since now I run into issues when I try to use either loss function. Specifically, I run into an issue where y.shape[1] is out of range for this dataset but not for the digits dataset.