In [5]:
import os
import sys

In [6]:
# Set before TensorFlow is imported below: level "2" filters out INFO and
# WARNING messages from TensorFlow's C++ backend.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [7]:
# Add sibling checkouts to the import path.
# NOTE(review): presumably these provide the `common` and `settransformer`
# packages imported below; the relative paths depend on the notebook's
# working directory - confirm.
sys.path.append("../../deep-learning-dna")
sys.path.append("../../settransformer")

In [12]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display
import math
import string

import settransformer as stf
from common.models import dnabert
from common import dna
from lmdbm import Lmdb
from common.data import DnaSequenceGenerator, DnaLabelType, DnaSampleGenerator, find_dbs
import wandb

import tf_utils as tfu

In [13]:
# NOTE(review): `strategy` is not referenced by any later cell visible in this
# notebook (the model is not built inside a `strategy.scope()`); confirm
# whether it is still needed.
strategy = tfu.strategy.cpu()

---
# Load Data

In [14]:
# Download the pretrained DNABERT artifact from Weights & Biases and load it.
api = wandb.Api()
pretrain_artifact = api.artifact(
    "sirdavidludwig/deep-learning-dna/dnabert-pretrain-ablation-dim:8dim"
)
model_path = pretrain_artifact.download()
pretrained_model = dnabert.DnaBertModel.load(model_path)
pretrained_model

<common.models.dnabert.DnaBertPretrainModel at 0x7efea6308c40>

In [15]:
# Download the DNA sample artifact and collect the databases under its
# `train` subdirectory.
samples_artifact = api.artifact("sirdavidludwig/nachusa-dna/dnasamples:latest")
dataset_path = samples_artifact.download()
samples = find_dbs(f"{dataset_path}/train")
# Spot-check one entry of the returned collection.
samples[13]

[34m[1mwandb[0m: Downloading large artifact dnasamples:latest, 289.81MB. 336 files... Done. 0:0:0.1


'./artifacts/dnasamples:v2/train/Wesley014-CCE-051120_S153_L001_R1_001.db'

In [16]:
# Used further down as the classifier's output dimension
# (`output_shape = max_files` in the hyperparameter cell).
# NOTE(review): presumably an upper bound on the number of sample files /
# sample-id classes - confirm against len(samples).
max_files = 1260

---
# Create Dataset

In [17]:
# Batch-generator configuration.
subsample_length = 700
sequence_length = 150
kmer = 3
batch_size = 20
batches_per_epoch = 128
augument = True
labels = DnaLabelType.SampleIds

# Build the generator that yields (inputs, labels) batches for training.
dataset = DnaSampleGenerator(
    samples=samples,
    subsample_length=subsample_length,
    sequence_length=sequence_length,
    kmer=kmer,
    batch_size=batch_size,
    batches_per_epoch=batches_per_epoch,
    augment=augument,
    labels=labels,
)

In [62]:
# Peek at the first batch; the output is a tuple whose first element is the
# input array.
dataset[0]

(array([[[67., 86., 56., ..., 87., 60., 52.],
         [85., 51.,  5., ..., 60., 52., 10.],
         [67., 87., 61., ..., 61., 55., 27.],
         ...,
         [56., 30., 27., ..., 87., 60., 52.],
         [67., 86., 56., ..., 27., 13., 65.],
         [86., 56., 30., ..., 65., 77., 12.]],
 
        [[67., 86., 56., ..., 63., 65., 77.],
         [86., 56., 30., ..., 75.,  2., 10.],
         [86., 56., 30., ..., 63., 65., 77.],
         ...,
         [86., 56., 30., ..., 55., 27., 10.],
         [86., 56., 30., ..., 65., 77., 10.],
         [86., 56., 30., ..., 53., 17., 85.]],
 
        [[30., 27., 11., ..., 10., 52., 12.],
         [86., 56., 30., ..., 67., 87., 60.],
         [30., 27., 11., ..., 12., 62., 62.],
         ...,
         [86., 56., 30., ..., 50.,  2., 10.],
         [67., 86., 56., ..., 87., 60., 52.],
         [67., 86., 56., ..., 88., 67., 87.]],
 
        ...,
 
        [[67., 86., 56., ..., 13., 65., 77.],
         [67., 86., 56., ..., 60., 50.,  2.],
         [56.,

In [19]:
# Input array of the first batch: (batch_size, subsample_length, tokens per sequence).
dataset[0][0].shape

(20, 700, 148)

In [20]:
# One element of the batch: (subsample_length, tokens per sequence).
dataset[0][0][0].shape

(700, 148)

In [21]:
# A single tokenized sequence within that element.
dataset[0][0][0][0].shape

(148,)

---
# Create Embeddings

In [22]:
# Wrap the pretrained model's base in an encoder model (8-dimensional output,
# see the embedding shape below) and freeze it so its weights are not updated
# during training.
pretrained_encoder = dnabert.DnaBertEncoderModel(pretrained_model.base)
pretrained_encoder.trainable = False

In [23]:
# Encode every sequence of the first batch element with the frozen encoder.
embeddings = pretrained_encoder.predict(dataset[0][0][0])

In [24]:
# One embedding vector per input sequence.
embeddings.shape

(700, 8)

In [25]:
# Display the raw embedding values.
embeddings

array([[ -3.0589375 ,   0.09530499,   3.1571653 , ...,  -1.2163663 ,
          9.383499  ,  -0.8313564 ],
       [ -3.5858917 ,   1.5305752 ,  -0.9098054 , ...,  -6.64118   ,
          8.697727  ,   0.25428525],
       [ -0.36243826,   2.1628273 ,   1.9515008 , ..., -10.093372  ,
          7.515818  ,   4.344171  ],
       ...,
       [ -2.218179  ,   1.8394516 ,  -3.6845567 , ...,  -4.50106   ,
          7.6143627 ,   5.869076  ],
       [  6.7310724 ,   2.6626039 ,   4.446248  , ...,  -9.541835  ,
          6.4417896 ,  -1.7020733 ],
       [ -0.76325035,  -1.1196578 ,  -1.8744181 , ...,   0.6938467 ,
          9.9582615 ,  -1.5631678 ]], dtype=float32)

---
# Create Model

In [46]:
#Create set transformer -
def Create_Model(embed_dim, num_heads, stack, use_layernorm, pre_layernorm, use_keras_mha, seq_len, encoder, output_shape, num_induce=3):
    """Build a set-transformer classifier on top of a per-sequence encoder.

    The model takes a variable-size set of tokenized sequences, encodes each
    sequence with `encoder`, projects the encodings to `embed_dim`, passes
    them through `stack` induced set attention blocks, pools the set with
    multi-head attention and finally maps the pooled representation to
    `output_shape` class scores.

    Args:
        embed_dim: Width of the projection and of every attention block.
        num_heads: Number of attention heads in each induced set attention block.
        stack: Number of induced set attention blocks to chain.
        use_layernorm: Forwarded to the settransformer layers.
        pre_layernorm: Forwarded to the settransformer layers.
        use_keras_mha: Forwarded to the settransformer layers.
        seq_len: Length of each tokenized input sequence (last input axis).
        encoder: Model/layer applied to every sequence via `TimeDistributed`.
        output_shape: Number of output units (classes).
        num_induce: Number of inducing points per induced set attention block.
            Defaults to 3, the value previously hard-coded.

    Returns:
        A `keras.Model` mapping inputs of shape `(batch, set_size, seq_len)`
        to raw, unnormalized class logits.
    """
    # `None` on the set axis lets the model accept any number of sequences.
    y = x = keras.layers.Input((None, seq_len))
    # Apply the encoder independently to every sequence in the set.
    y = keras.layers.TimeDistributed(encoder)(y)
    y = keras.layers.Dense(embed_dim)(y)

    for _ in range(stack):
        y = stf.InducedSetAttentionBlock(
            embed_dim=embed_dim,
            num_heads=num_heads,
            num_induce=num_induce,
            use_layernorm=use_layernorm,
            pre_layernorm=pre_layernorm,
            use_keras_mha=use_keras_mha,
        )(y)

    # Pool the whole set with a single seed.
    y = stf.PoolingByMultiHeadAttention(
        num_seeds=1,
        embed_dim=embed_dim,
        num_heads=1,
        use_layernorm=use_layernorm,
        pre_layernorm=pre_layernorm,
        use_keras_mha=use_keras_mha,
        is_final_block=True,
    )(y)

    # Emit raw logits (no activation). This notebook compiles the model with
    # SparseCategoricalCrossentropy(from_logits=True), which applies softmax
    # itself; a sigmoid here would squash the scores into (0, 1) first and
    # limit how well the classes can be separated.
    y = keras.layers.Dense(output_shape)(y)

    return keras.Model(x, y)

In [36]:
#Hyperparameters
embed_dim = 128
num_heads = 4
stack = 4
use_layernorm = False
pre_layernorm = False
use_keras_mha = False
# Tokens per sequence, derived from the dataset settings instead of being
# hard-coded: 150 - 3 + 1 = 148, which matches the (20, 700, 148) batch shape
# produced by the dataset cell. Keeps the model input in sync if
# `sequence_length` or `kmer` is changed.
seq_len = sequence_length - kmer + 1
encoder = pretrained_encoder
# One output unit per possible sample id.
output_shape = max_files

In [47]:
# Instantiate the set-transformer classifier from the hyperparameters above.
model = Create_Model(
    embed_dim=embed_dim,
    num_heads=num_heads,
    stack=stack,
    use_layernorm=use_layernorm,
    pre_layernorm=pre_layernorm,
    use_keras_mha=use_keras_mha,
    seq_len=seq_len,
    encoder=encoder,
    output_shape=output_shape,
)

# Integer class labels; the loss treats the model outputs as logits.
model.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.sparse_categorical_accuracy],
)

In [79]:
#prediction = model.predict(dataset)

In [80]:
#prediction.shape

In [81]:
#prediction

In [82]:
# Number of training epochs passed to `model.fit` below.
epochs = 200

In [None]:
# Train on the batch generator; `history` is consumed by the plotting cell below.
history = model.fit(dataset, epochs=epochs, verbose=1)

Epoch 1/200


In [None]:
# Plot per-epoch training accuracy (top panel) and loss (bottom panel).
fig, axes = plt.subplots(2, 1)

panels = [
    ('sparse_categorical_accuracy', 'model accuracy', 'accuracy'),
    ('loss', 'model loss', 'loss'),
]
for ax, (history_key, panel_title, y_label) in zip(axes, panels):
    ax.plot(history.history[history_key])
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('epoch')

fig.tight_layout()
plt.show()