In [1]:
import os

# Select the CUDA devices for this process before any GPU library is loaded:
# device ordering is set to PCI_BUS_ID and GPUs 0, 1, 2 and 3 are made visible.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="0,1,2,3",
)

In [10]:
import os
import sys
from os.path import abspath, expanduser
# NOTE(review): "/../.." is an absolute path, so on POSIX abspath() normalises it
# to the filesystem root "/" and that is what gets appended to sys.path. A
# notebook-relative "../.." may have been intended — confirm.
sys.path.append(abspath(expanduser("/../..")))
import unittest
from sequence_annotation.processor.compiler import SimpleCompiler,AnnSeqCompiler
from sequence_annotation.model.model_processor import SimpleModel,ModelCreator,IModelProcessor
from sequence_annotation.processor.data_processor import AnnSeqData,SimpleData
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Activation,Input
from sequence_annotation.worker.train_worker import TrainWorker
from sequence_annotation.worker.test_worker import TestWorker
from sequence_annotation.pipeline.pipeline import Pipeline
from sequence_annotation.model.customize import MaskedConvolution1D,RemoveMask

from sequence_annotation.genome_handler.seq_container import AnnSeqContainer
from sequence_annotation.genome_handler.ann_seq_processor import class_count
from sequence_annotation.data_handler.fasta import read_fasta
from sequence_annotation.data_handler.json import read_json
from sequence_annotation.data_handler.seq_converter import SeqConverter
from sequence_annotation.pipeline.wrapper import fit_generator_wrapper_generator
from sequence_annotation.pipeline.wrapper import evaluate_generator_wrapper_generator
from sequence_annotation.pipeline.callback import AdvancedModelCheckpoint,ResultHistory,ModelPlot
from sequence_annotation.processor.metric import BatchCount,TruePositive,SampleCount
from sequence_annotation.processor.stateful_metric import StatefulMetric
from keras.callbacks import TensorBoard 
import numpy as np
from keras.utils import plot_model
from keras.callbacks import EarlyStopping
#from hyperas import optim
#from hyperas.distributions import choice, uniform
#from hyperopt import Trials, STATUS_OK, tpe
import random
from sequence_annotation.genome_handler.sequence import AnnSequence
from sequence_annotation.genome_handler.seq_container import AnnSeqContainer
from sequence_annotation.pipeline.data_generator import DataGenerator
#from hyperopt import Trials, STATUS_OK, tpe,fmin, tpe,hp
from keras.layers import BatchNormalization, ReLU, Conv1D, Add, Dropout, Concatenate,Layer,LSTM,Bidirectional,RNN,Masking,SimpleRNNCell
from keras.engine.training import Model

In [3]:
# Read the merged exon FASTA file; the result is later indexed by sequence id.
fasta = read_fasta(
    '../io/data/2018_11_24/Araport11_exon_2018_11_24_merged_with_coordinate_file_megred_exon_dist_to_five_1000_dist_to_three_500_merged.fasta'
)

In [4]:
import deepdish as dd

# Load the HDF5 counterpart of the FASTA file (same file stem, .h5 extension).
h5 = dd.io.load(
    '../io/data/2018_11_24/Araport11_exon_2018_11_24_merged_with_coordinate_file_megred_exon_dist_to_five_1000_dist_to_three_500_merged.h5'
)

In [5]:
# Build the annotated-sequence container from the object loaded out of the .h5 file.
ann_seqs = AnnSeqContainer().from_dict(h5)

In [6]:
from keras.layers import Activation
from keras import backend as K
import tensorflow as tf
from keras.utils.generic_utils import get_custom_objects
import keras


from keras.utils.generic_utils import CustomObjectScope
from keras.utils import multi_gpu_model
class SeqAnnModelBuilder(IModelProcessor):
    """Build a Keras model made of stacked masked CNN layers followed by
    stacked bidirectional IRNN layers, ending in a per-position softmax.

    Every layer after the first receives the concatenation of the previous
    layer's input and the previous layer's batch-normalised output.

    Parameters
    ----------
    setting : object
        Stored and exposed through ``record``; not otherwise read here.
    input_dim : int
        Size of the last axis of the model input.
    output_dim : int
        Number of filters of the final 'predict' layer.
    """
    def __init__(self,setting,input_dim,output_dim):
        self._record = {'setting':setting,
                        'input_dim':input_dim,
                        'output_dim':output_dim}
        self._setting = setting
        self._input_dim = input_dim
        self._output_dim = output_dim
    @staticmethod
    def _merge_with_previous(previous_input,previous_layer,name):
        """Return the input for the next layer.

        When the two tensors are the same object (true only before the first
        layer) the tensor is passed through unchanged; otherwise they are
        concatenated by a Concatenate layer called ``name``.
        """
        # Identity check on purpose: the question is "are these the same
        # tensor object", not an elementwise comparison of their values.
        if previous_layer is previous_input:
            return previous_input
        return Concatenate(name=name)([previous_input,previous_layer])
    def process(self):
        """Create the model and store it so that ``model`` can return it."""
        inputs = Input(shape=(None,self._input_dim),name='Input')
        inputs_ = Masking(mask_value=0,name='mask')(inputs)
        previous_layer = inputs_
        previous_input = inputs_
        cnn_number = 16
        rnn_number = 8
        for index in range(cnn_number):
            input_ = self._merge_with_previous(previous_input,previous_layer,
                                               'CNN_concat_'+str(index))
            cnn = MaskedConvolution1D(2,kernel_size=32,activation='relu',padding='same',name='cnn_'+str(index))(input_)
            bn = BatchNormalization()(cnn)
            previous_input = input_
            previous_layer = bn
        for index in range(rnn_number):
            input_ = self._merge_with_previous(previous_input,previous_layer,
                                               'RNN_concat_'+str(index))
            with CustomObjectScope({'IRNNCell' : IRNNCell}):
                # A kernel-size-1 convolution projects the input to the 8
                # channels consumed by the 8-unit IRNN cell.
                cnn_input = MaskedConvolution1D(8,kernel_size=1,activation=None,padding='same',name='rnn_cnn_'+str(index))(input_)
                rnn_ = RNN(IRNNCell(8), return_sequences=True,name='rnn_'+str(index))
                rnn = Bidirectional(rnn_,name='bidir_rnn_'+str(index))(cnn_input)
                bn = BatchNormalization()(rnn)
                previous_input = input_
                previous_layer = bn
        cnn = MaskedConvolution1D(self._output_dim,kernel_size=1,activation='softmax',name='predict',padding='same')(previous_layer)
        outputs = RemoveMask()(cnn)
        self._model = Model(inputs=inputs, outputs=outputs)
    @property
    def model(self):
        """The Keras model created by ``process()``; unavailable before that call."""
        return self._model
    @property
    def record(self):
        """Dictionary holding the constructor arguments."""
        return self._record
class IRNNCell(Layer):
    """Recurrent cell whose step output is relu(input + previous output).

    The cell creates no weights of its own; ``units`` is only used as the
    state size and recorded in the layer config.
    """
    def __init__(self, units, **kwargs):
        self.state_size = units
        self.units = units
        super().__init__(**kwargs)
    def get_config(self):
        """Return the base layer config extended with ``units``."""
        base_config = super().get_config()
        base_config.update(units=self.units)
        return base_config
    def build(self, input_shape):
        """Create the ReLU activation applied at each step and mark the cell as built."""
        self.activation =  Activation('relu')
        self.built = True
    def call(self, inputs, states):
        """Run one step; the new output is also returned as the single new state."""
        last_output = states[0]
        step_output = self.activation(inputs + last_output)
        return step_output, [step_output]

In [7]:
# Create the TensorFlow session used by Keras for the rest of the notebook.
config = tf.ConfigProto()
if hasattr(config,"gpu_options"):
    # Per the TensorFlow GPUOptions docs, allow_growth makes GPU memory be
    # allocated on demand rather than reserved up front.
    config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
# Register the session as the one used by the Keras backend.
K.set_session(sess)

In [12]:
from numpy import median

# Build a small subset for a quick training run: keep sequences strictly
# shorter than the median length, stopping once 100 have been collected.
median_len = median([len(seq) for seq in ann_seqs])

median_fasta = {}
median_seqs = AnnSeqContainer()
median_seqs.ANN_TYPES = ann_seqs.ANN_TYPES

number = 0
for seq in ann_seqs:
    if not len(seq) < median_len:
        continue
    median_seqs.add(seq)
    median_fasta[seq.id] = fasta[seq.id]
    number += 1
    if number >= 100:
        break

In [13]:
from keras import optimizers

# Assemble and run a training pipeline on the below-median subset:
# model builder -> data processor -> compiler -> train worker -> fit wrapper.
builder = SeqAnnModelBuilder({},4,4)
# values_to_ignore=0 here and padding_value=0 below use the same value —
# presumably padded positions are excluded from the loss; TODO confirm.
compiler = AnnSeqCompiler('adam','categorical_crossentropy',
                          ann_types=median_seqs.ANN_TYPES,values_to_ignore=0)
data = AnnSeqData({'data':{'training':{'inputs':median_fasta,
                                       'answers':median_seqs}
                          },'ANN_TYPES':median_seqs.ANN_TYPES},
                  discard_invalid_seq=False,validation_split=0,
                  do_validate=True,padding_value=0
              
                 )
worker = TrainWorker(is_verbose_visible=True)
# NOTE(review): this list is built but never used — the wrapper below is given
# callbacks=[] — so no plot, TensorBoard log, result CSV or checkpoint is
# produced by this run. Confirm whether that is intentional.
callbacks = [ModelPlot('2018_11_29/model.png',
                        show_shapes =True,
                        show_layer_names =True),
             TensorBoard('../io/logs/',write_graph=True, write_grads=True, write_images=True),
             ResultHistory('2018_11_29/result.csv',period=3,verbose=True),
             AdvancedModelCheckpoint('2018_11_29/weights/weights_{epoch:03d}.hdf5',period=1)]
wrapper = fit_generator_wrapper_generator(verbose=1,batch_size=100,
                                          epochs=10,epoch_shuffle=True,callbacks=[])
pipeline = Pipeline(builder,data,compiler,worker,
                    wrapper,is_prompt_visible=True)

pipeline.execute()

Processing model..
Processing data...
Compiling model...
Processing worker...
Executing...
Start working(2018-12-04 16:32:00)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
End working(2018-12-04 16:36:46)
Spend time: 00:04:45


In [None]:
# No notebook-level `model` variable exists; the Keras model is exposed by the
# builder's `model` property. Assumes pipeline.execute() has already run
# builder.process() on this same builder object — confirm.
builder.model.summary()

In [None]:
class ModelOptimizer:
    """Objective wrapper for a hyperparameter search over SeqAnnModelBuilder models.

    Training and validation data are prepared once at construction and reused
    by every call to ``get_loss``.
    """
    def __init__(self):
        self.x_train, self.y_train, self.x_test, self.y_test=self._prepare_data()
    def _prepare_data(self):
        """Generate and preprocess the data.

        Returns
        -------
        tuple
            (training inputs, training answers, validation inputs,
            validation answers) taken from the processed SimpleData.
        """
        # NOTE(review): `generator` is not defined anywhere in this notebook;
        # it has to be provided before this class can be instantiated.
        x,y=generator()
        val_x,val_y=generator()
        data = SimpleData({'training':{'inputs':x,'answers':y},
                           'validation':{'inputs':val_x,'answers':val_y}})
        data.before_process()
        data.process()
        data.after_process()
        return (data.data['training']['inputs'],
                data.data['training']['answers'],
                data.data['validation']['inputs'],
                data.data['validation']['answers'])
    def _create_model(self,space):
        """Build a fresh model wrapped in SimpleModel.

        NOTE(review): ``space`` is currently not used, so every trial builds
        the same architecture — confirm whether it should drive the builder.
        """
        builder = SeqAnnModelBuilder({},3,4)
        builder.process()
        simple_model = SimpleModel(builder.model)
        return simple_model
    def get_loss(self,space):
        """Train one model and return the result dictionary for the search.

        Parameters
        ----------
        space : dict
            Hyperparameter sample for this trial; passed to ``_create_model``
            and echoed back in the result.

        Returns
        -------
        dict
            Keys 'loss' (lowest loss over the epochs), 'status', 'model'
            and 'space'.
        """
        model = self._create_model(space)
        compiler = AnnSeqCompiler('adam','categorical_crossentropy')
        data = SimpleData({'training':{'inputs':self.x_train,'answers':self.y_train},
                          'validation':{'inputs':self.x_test,'answers':self.y_test}})
        worker = TrainWorker(is_verbose_visible=False)
        wrapper = fit_generator_wrapper_generator(verbose=1,batch_size=1000,
                                                  epochs=30,callbacks=[])
        pipeline = Pipeline(model,data,compiler,worker,
                            wrapper,is_prompt_visible=False)

        pipeline.execute()
        # The search minimises the returned 'loss', so the best epoch is the
        # one with the LOWEST loss. Prefer the validation loss; fall back to
        # the training loss when no 'val_loss' entry was recorded.
        try:
            losses = worker.result['val_loss']
        except KeyError:
            losses = worker.result['loss']
        val_loss = np.amin(losses)
        print('Best validation loss of epoch:', val_loss)
        # NOTE(review): STATUS_OK comes from hyperopt, whose import is
        # commented out in the notebook's import cell; it must be restored
        # before this method can run.
        return {'loss': val_loss, 'status': STATUS_OK, 'model': model.model,'space':space}

In [None]:
# Instantiating ModelOptimizer runs its _prepare_data() once; the resulting
# arrays are stored on the instance and read by get_loss().
modelOptimizer = ModelOptimizer()

In [None]:
# NOTE(review): Trials is a hyperopt name, but the hyperopt imports in the
# notebook's import cell are commented out — restore them before running this.
trials = Trials()

In [None]:
# Run 10 TPE-guided evaluations of get_loss over the 'layer' choice (integers 1 to 10).
best = fmin(
    modelOptimizer.get_loss,
    space={'layer': hp.choice('nubmer', list(range(1, 11)))},
    algo=tpe.suggest,
    trials=trials,
    max_evals=10,
)