# Application 2: Transcription Factor Classifier 

In [1]:
import random
from random import randrange

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

from nn import NeuralNetwork
from preprocess import one_hot_encode_seqs, sample_seqs
from seq_io import read_fasta_file, read_text_file

## 1. Read in data & downsample the sequence length in neg samples

In [2]:
# Positive examples: one sequence per line of the rap1 positives file.
# The context manager closes the handle, and blank lines are skipped so a
# trailing newline cannot add an empty "sequence" to the positive set.
with open('../data/rap1-lieb-positives.txt') as pos_file:
    pos = [line.strip() for line in pos_file if line.strip()]

# Negative examples: sequences parsed from the FASTA file by the project helper.
neg = read_fasta_file('../data/yeast-upstream-1k-negative.fa')

In [3]:
# Length of the first positive sequence; used as the window size for negatives.
pos_seq_length = len(pos[0])
pos_seq_length

17

In [4]:
# Length of the first negative sequence, shown for comparison with the positives.
neg_seq_length = len(neg[0])
neg_seq_length

1000

In [5]:
# Cut one random window of pos_seq_length bases out of every negative sequence
# so that both classes have the same input length.
NEG_WINDOW_SEED = 0  # fixed seed: a fresh Restart & Run All draws the same windows
window_rng = random.Random(NEG_WINDOW_SEED)

short_neg = []
for seq in neg:
    # Use this sequence's own length; not every record has to be as long as
    # the first one.
    last_start = len(seq) - pos_seq_length
    if last_start < 0:
        # Shorter than a single window: no full-length sample can be cut.
        continue
    # randrange excludes its stop value, so +1 keeps the final window reachable.
    start = window_rng.randrange(last_start + 1)
    short_neg.append(seq[start:start + pos_seq_length])

In [6]:
# Positives first, then the windowed negatives; labels follow the same order.
seqs = pos + short_neg
pos_labels = [1] * len(pos)
# Size the negative labels by the windows actually kept (short_neg), not by
# the raw records in neg: any record dropped during windowing would otherwise
# leave labels longer than seqs.
neg_labels = [0] * len(short_neg)
labels = pos_labels + neg_labels

assert len(seqs) == len(labels), "every sequence needs exactly one label"

## 2. Sampling Scheme & one hot encoding 

To correct for class imbalance, I upsampled the positive samples (sampling with replacement) until there were as many positive samples as negative samples. I chose upsampling over downsampling because there are only a small number of positive samples.

In [7]:
# Resample the combined data with the project's sampling helper; it returns
# the resampled sequences together with their matching labels.
sampled_seqs, sampled_labels = sample_seqs(seqs, labels)

In [8]:
# One-hot encode each sampled sequence on its own, then show the length of the
# first encoding as a quick sanity check.
encoded_seqs = list(map(one_hot_encode_seqs, sampled_seqs))
len(encoded_seqs[0])

68

## 3. Training and validation set 

In [9]:
# Hold out 20% of the sampled data for validation; the fixed random_state
# makes the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    encoded_seqs,
    sampled_labels,
    test_size=0.2,
    random_state=0,
)

In [10]:
#data shape = samples x encoded seq 
X_train=np.vstack(X_train)
X_val=np.vstack(X_val)
y_train=np.array(y_train)
y_val=np.array(y_val)

## 4. Train!

In [11]:
# Training hyperparameters.
lr_ = 0.1
epochs_ = 100
loss_ = 'bce'

# Two sigmoid layers: 68 encoded inputs -> 34 hidden units -> 1 output.
arch = [
    {'input_dim': 68, 'output_dim': 34, 'activation': 'sigmoid'},
    {'input_dim': 34, 'output_dim': 1, 'activation': 'sigmoid'},
]

# Build the model from the architecture and hyperparameters above.
nn = NeuralNetwork(
    arch,
    lr=lr_,
    seed=36,
    batch_size=1,
    epochs=epochs_,
    loss_function=loss_,
)

In [12]:
# Check the target array's shape before training.
np.shape(y_train)

(5057,)

In [13]:
# Fit on the training split, passing the validation split alongside it; the
# two returned values are kept as train_loss and val_loss.
train_loss, val_loss = nn.fit(
    X_train, y_train,
    X_val, y_val,
)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 34 is different from 5057)