In [1]:
import numpy as np
from nn import nn, io, preprocess

from sklearn import datasets
from sklearn.model_selection import train_test_split

import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
# Layer widths from the network input to its output; consecutive pairs give
# each layer's (input_dim, output_dim).
# NOTE(review): the first width (1000) has to equal the per-sample feature
# length produced by preprocess.one_hot_encode_seqs -- confirm against that helper.
layer_widths = [1000, 500, 500, 250, 2]
layer_activations = ['relu', 'relu', 'relu', 'sigmoid']

architecture = [
    {'input_dim': n_in, 'output_dim': n_out, 'activation': act}
    for n_in, n_out, act in zip(layer_widths[:-1], layer_widths[1:], layer_activations)
]

# Network configured with an MSE loss, fixed seed, batches of 500 and 1000 epochs.
my_nn = nn.NeuralNetwork(
    nn_arch=architecture,
    lr=1,
    seed=42,
    batch_size=500,
    epochs=1000,
    loss_function='mse',
)

In [3]:
# Read the two sequence classes: negatives from a FASTA file, positives from a
# plain-text file.
negatives = io.read_fasta_file('data/yeast-upstream-1k-negative.fa')
positives = io.read_text_file('data/rap1-lieb-positives.txt')

# Pool the sequences (negatives first) and build a parallel label list:
# False for every negative, True for every positive.
# NOTE(review): the saved output of this cell ends in
# 'TypeError: can only concatenate list (not "str") to list' with no traceback
# preserved -- presumably raised inside preprocess.sample_seqs; confirm there.
all_seqs = negatives + positives
all_labels = [False] * len(negatives) + [True] * len(positives)
seqs, labels = preprocess.sample_seqs(all_seqs, all_labels)

seqs_1hot = preprocess.one_hot_encode_seqs(seqs)

GCGCCCATACATCACAT
['T', 'T', 'G', 'T', 'C', 'G', 'C', 'C', 'G', 'G', 'G', 'C', 'G', 'C', 'C', 'T', 'A', 'T', 'A', 'G', 'C', 'A', 'G', 'A', 'G', 'T', 'G', 'A', 'C', 'G', 'A', 'T', 'A', 'A', 'A', 'G', 'G', 'T', 'C', 'C', 'T', 'T', 'A', 'A', 'T', 'C', 'C', 'T', 'G', 'C', 'G', 'C', 'T', 'A', 'A', 'G', 'T', 'C', 'G', 'A', 'G', 'C', 'C', 'A', 'T', 'G', 'C', 'A', 'T', 'G', 'C', 'G', 'G', 'A', 'G', 'G', 'G', 'T', 'A', 'A', 'T', 'C', 'A', 'C', 'G', 'A', 'C', 'A', 'T', 'A', 'T', 'A', 'A', 'C', 'C', 'A', 'T', 'T', 'A', 'G', 'G', 'C', 'G', 'A', 'T', 'C', 'T', 'G', 'C', 'A', 'G', 'C', 'A', 'A', 'G', 'T', 'A', 'T', 'A', 'C', 'C', 'T', 'G', 'C', 'C', 'G', 'G', 'A', 'G', 'C', 'G', 'A', 'G', 'G', 'T', 'C', 'G', 'G', 'C', 'A', 'G', 'G', 'C', 'T', 'C', 'T', 'G', 'C', 'G', 'G', 'G', 'G', 'C', 'T', 'C', 'T', 'C', 'G', 'A', 'C', 'T', 'A', 'C', 'T', 'T', 'G', 'C', 'C', 'G', 'C', 'C', 'A', 'G', 'T', 'G', 'C', 'C', 'G', 'G', 'G', 'A', 'G', 'C', 'G', 'G', 'T', 'C', 'A', 'G', 'C', 'G', 'G', 'G', 'T', 'C', 'T', '

TypeError: can only concatenate list (not "str") to list

In [None]:
# Count how many negative sequences have each of the listed lengths.
# The per-sequence lengths are computed once rather than once per key, and the
# keys are iterated from an ordered tuple so the displayed dict has a stable order.
# NOTE(review): where these six lengths come from is not shown in the notebook.
negative_lengths = [len(seq) for seq in negatives]
lengths_of_interest = (52, 334, 490, 629, 792, 1000)
{length: negative_lengths.count(length) for length in lengths_of_interest}

In [None]:
# Histogram of the lengths of the negative sequences (default binning).
negative_seq_lengths = [len(seq) for seq in negatives]

fig, ax = plt.subplots()
ax.hist(negative_seq_lengths)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(
    xlabel='Sequence length',
    ylabel='Number of sequences',
    title='Length distribution of negative sequences',
)
fig.tight_layout();

In [None]:

# Hold out 30% of the encoded sequences with a fixed random_state so the split
# is repeatable.
# NOTE(review): stratify=None does not preserve class proportions across the
# two splits -- confirm that is intended for this label distribution.
X_train, X_test, y_train, y_test = train_test_split(
    seqs_1hot,
    labels,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=None,
)

# Fit on the training split; the held-out split is passed as the second
# (X, y) pair. fit() hands back two per-epoch loss sequences.
loss_history = my_nn.fit(X_train, y_train, X_test, y_test)
per_epoch_loss_train, per_epoch_loss_val = loss_history

In [None]:
# Plot training and test losses across epochs.
fig, ax = plt.subplots()
ax.plot(per_epoch_loss_train, label='Training')
ax.plot(per_epoch_loss_val, label='Test')

ax.set_xlabel('Epoch')
ax.set_ylabel('Average MSE')
ax.set_title('Training and test loss')
# Without a legend the two curves cannot be told apart.
ax.legend()
fig.tight_layout();

In [None]:
# Draw a random nucleotide sequence of length `max_seq_len`.
# Integer codes 0-3 are translated to letters through `lbl_map`.
lbl_map = {0: 'A', 1: 'T', 2: 'C', 3: 'G'}
mapper = np.vectorize(pyfunc=lambda code: lbl_map[code])
max_seq_len = 10

# Use a seeded generator so a fresh-kernel "Run All" reproduces the same
# sequence; the previous unseeded np.random.randint call gave a different
# result on every execution.
seq_rng = np.random.default_rng(42)
padded_seq = mapper(seq_rng.integers(0, high=4, size=max_seq_len))
padded_seq.tolist()