# Transcription Factor Classifier

In [1]:
from nn import NeuralNetwork
import numpy as np
from typing import List, Dict, Tuple, Union
from numpy.typing import ArrayLike
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import pandas as pd
from TFC_io import read_text_file, read_fasta_file
from preprocess import sample_seqs, one_hot_encode_seqs

### Read in Rap1 motif examples

In [2]:
# Positive class: the Rap1 motif examples, with one True label per sequence.
rap1_motifs = read_text_file("../data/rap1-lieb-positives.txt")
labels = [True] * len(rap1_motifs)  # True = positive

### Read in negative examples from yeast

In [3]:
# Negative class: sequences read from the yeast upstream FASTA file.
yeast_neg = read_fasta_file("../data/yeast-upstream-1k-negative.fa")
# Append one False label per negative sequence, after the positive labels.
labels = labels + [False] * len(yeast_neg)

### Explain Sampling Scheme
I chose oversampling to correct the class imbalance in the data: the positive class is resampled with replacement until it has the same number of observations as the negative class. While this can make the classification model prone to overfitting, it prevents information loss and avoids the extremely small sample size that would result from downsampling the yeast data to the size of `rap1_motifs`.

### Implement sampling scheme

In [4]:
# Positives first, then negatives -- the same order in which `labels` was built,
# so seqs[i] pairs with labels[i].
seqs = rap1_motifs + yeast_neg #combine the two lists of seqs
# NOTE(review): the resampling runs before the train/validation split further
# down; with oversampling-with-replacement (as described in the notes above),
# copies of the same positive sequence may land in both splits -- confirm this
# is intended.
new_seqs, new_labels = sample_seqs(seqs, labels) #fix class imbalance

### Generate training and validation sets

In [5]:
# Hold out 30% of the resampled data for validation (70/30 split); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    new_seqs,
    new_labels,
    test_size=0.3,
    random_state=42,
)

### One-hot encode the sequences in the training and validation sets

In [6]:
# Encode the training split first, then the validation split, with the same encoder.
X_train_encoded, X_val_encoded = [
    one_hot_encode_seqs(split) for split in (X_train, X_val)
]

### Define plotting function so loss history of neural network can be visualized

In [None]:
def plot_loss_history(self):
    """
    Draw the recorded training and validation losses on two stacked panels.

    Reads ``self.loss_history_train`` and ``self.loss_history_val`` and plots
    each against its index (0, 1, 2, ...). The top panel shows the training
    loss and the bottom panel the validation loss.

    Raises:
        AssertionError: if ``self.loss_history_train`` is empty, i.e. there
            is nothing to plot yet.

    NOTE(review): `plt` is not imported in the visible cells -- confirm that
    `matplotlib.pyplot` is imported as `plt` before this function is called.
    """
    train_losses = self.loss_history_train
    val_losses = self.loss_history_val
    assert len(train_losses) > 0, "Need to run training before plotting loss history"

    figure, (train_ax, val_ax) = plt.subplots(2, figsize=(8, 8))
    figure.suptitle('Loss History')

    # Top panel: training loss per recorded step.
    train_ax.plot(np.arange(len(train_losses)), train_losses)
    train_ax.set_title('Training Loss')
    train_ax.set_ylabel('Train Loss')

    # Bottom panel: validation loss per recorded step.
    val_ax.plot(np.arange(len(val_losses)), val_losses)
    val_ax.set_title('Validation Loss')
    val_ax.set_ylabel('Val Loss')

    # Applied through the pyplot interface, so it lands on whichever axes
    # pyplot treats as current (presumably the bottom panel -- confirm).
    plt.xlabel('Steps')
    figure.tight_layout()

### Train a neural network
I will be exploring some hyperparameters in the next few code chunks.

Number of hidden layers = 1, hidden layer activation function = relu 

In [None]:
# Baseline architecture: a single hidden layer with 5 nodes (ReLU), followed by
# a one-node output layer whose activation is a sigmoid.
# Every layer dict uses the same three keys: 'input_dim', 'output_dim' and
# 'activation'. The output layer's key previously had a stray colon
# ('activation:'), so it did not match the key used by the first layer.
# NOTE(review): 'input_dim': 1 on the first layer looks like a placeholder --
# confirm it matches the width of one one-hot-encoded input sequence.
nn_arch = [
    {'input_dim': 1, 'output_dim': 5, 'activation': 'relu'},
    {'input_dim': 5, 'output_dim': 1, 'activation': 'sigmoid'},
]
