# Transcription Factor Classifier

In [1]:
from nn import NeuralNetwork
import numpy as np
from typing import List, Dict, Tuple, Union
from numpy.typing import ArrayLike
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import pandas as pd
from TFC_io import read_text_file, read_fasta_file
from preprocess import sample_seqs, one_hot_encode_seqs

### Read in Rap1 motif examples

In [2]:
# Load the positive Rap1 motif sequences from the text file.
rap1_motifs = read_text_file("../data/rap1-lieb-positives.txt")
# One label per motif; True marks a positive example.
labels = [True] * len(rap1_motifs)

### Read in negative examples from yeast

In [3]:
# Load the negative sequences from the yeast upstream FASTA file.
yeast_neg = read_fasta_file("../data/yeast-upstream-1k-negative.fa")
# Rebuild the complete label list (positives first, then negatives) instead of
# appending to the existing `labels`: appending is not idempotent, so re-running
# this cell would add the negative labels a second time and leave the labels
# out of step with the sequences.
labels = [True] * len(rap1_motifs) + [False] * len(yeast_neg)

### Explain Sampling Scheme
I chose oversampling to correct the class imbalance in the data: the positive class is resampled with replacement until it has the same number of observations as the negative class. While this can make the classification model prone to overfitting, it prevents information loss and avoids the extremely small sample size that would result from downsampling the yeast data to the size of `rap1_motifs`.

### Implement sampling scheme

In [4]:
# Combine the two lists of sequences: positives first, then negatives, matching
# the order in which `labels` was built.
seqs = rap1_motifs + yeast_neg
# Guard against stale kernel state (e.g. a label cell that was run twice):
# every sequence must have exactly one label before resampling.
assert len(seqs) == len(labels), (
    f"{len(seqs)} sequences but {len(labels)} labels; "
    "restart the kernel and run the cells above in order"
)
# Resample to fix the class imbalance.
new_seqs, new_labels = sample_seqs(seqs, labels)

In [10]:
# Number of positive (True) labels returned by sample_seqs.
new_labels.count(True)

3163

In [11]:
# Number of negative (False) labels returned by sample_seqs; compare with the positive count.
new_labels.count(False)

3163