In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import sys
from os.path import abspath, expanduser
sys.path.append(abspath(expanduser("../RNN")))
sys.path.append(abspath(expanduser("../sequence_annotation")))
import torch
from torch import nn
import time
import numpy as np
from sequence_annotation.visual.visual import visual_ann_seq,visual_ann_genome
from sequence_annotation.genome_handler import ann_seq_processor
import deepdish as dd
from sequence_annotation.data_handler.fasta import read_fasta
from sequence_annotation.genome_handler.sequence import AnnSequence
from sequence_annotation.genome_handler.seq_container import AnnSeqContainer
from sequence_annotation.function.data_generator import SeqGenerator
import random
from sequence_annotation.genome_handler.ann_genome_processor import simplify_genome,get_genome_region_info
from sequence_annotation.data_handler.seq_converter import SeqConverter
from sequence_annotation.genome_handler.utils import ann_count 
from sequence_annotation.process.ann_seq_data import AnnSeqData
from sequence_annotation.pytorch.worker import Trainer
from sequence_annotation.process.pipeline import Pipeline 
from sequence_annotation.function.model_processor import SimpleModel
from sequence_annotation.pytorch.compiler import SimpleCompiler
import torch.optim as optim
from tensorboardX import SummaryWriter
import shutil
# Load the annotated regions (HDF5) and their nucleotide sequences (FASTA);
# both files share the same path prefix.
DATA_PREFIX = '../io/Arabidopsis_thaliana/data/2019_02_20/result/result_dist_to_five_1000_dist_to_three_1000_merged'
# Seed for the shuffle below so that the sample order is the same on every run.
SHUFFLE_SEED = 0
h5=dd.io.load(DATA_PREFIX + '.h5')
fasta = read_fasta(DATA_PREFIX + '.fasta')
ann_seqs = AnnSeqContainer().from_dict(h5)
seqs_len = sorted(len(seq) for seq in ann_seqs)
if not seqs_len:
    raise ValueError("No annotated sequences were loaded from " + DATA_PREFIX + ".h5")
# The threshold is the length of the longest sequence among the shortest 1%.
# At least one sequence is always taken, so data sets with fewer than 100
# sequences no longer fail with an IndexError on an empty slice.
short_count = max(int(len(seqs_len)/100), 1)
len_threshold = seqs_len[short_count-1]
inner_fasta = {}
# `outlier_name` is read later by `train` as the example sequence for figures.
# The last sequence that is at least 1.5 times the threshold is kept.
outlier_name = None
for seq in ann_seqs:
    if len(seq) <= len_threshold:
        inner_fasta[seq.id]=fasta[seq.id]
    elif len(seq) >= len_threshold*1.5:
        outlier_name = seq.id
if outlier_name is None:
    raise ValueError("No sequence is at least 1.5 times the length threshold; "
                     "cannot choose an outlier example")
keys = list(inner_fasta.keys())
# A dedicated generator keeps the order reproducible without touching the
# global `random` state.
random.Random(SHUFFLE_SEED).shuffle(keys)
selected_seqs = AnnSeqContainer()
selected_seqs.ANN_TYPES = list(ann_seqs.ANN_TYPES) + ['mix']
selected_fasta = {}
for seq_id in keys:
    seq = ann_seq_processor.mixed_typed_seq_generate(ann_seqs.get(seq_id))
    selected_seqs.add(seq)
    selected_fasta[seq_id]=inner_fasta[seq_id]
# Number of selected sequences (printed by the next cell).
number = len(selected_fasta)

Using TensorFlow backend.
  vecs.append(np.nan_to_num(temp/temp))


In [2]:
# Collapse the annotation types into two classes: 'gene' and 'other'.
# NOTE(review): `selected_seqs` is rebound to the simplified container, so this
# cell is presumably not safe to run twice on the same kernel - confirm.
GENE_OTHER_MAP = {
    'gene': ['exon', 'intron', 'mix'],
    'other': ['other'],
}
selected_seqs = simplify_genome(selected_seqs, GENE_OTHER_MAP)
print(number)

28


In [3]:
from sequence_annotation.pytorch.customize_layer import SeqAnnlLoss,SeqAnnModel,GatedIndRnnCell,noisy_relu,noisy_hard_sigmoid
from sequence_annotation.pytorch.CRF import BatchCRFLoss,CRFLoss
from sequence_annotation.pytorch.callback import CategoricalMetric,TensorboardCallback,TensorboardWriter,SeqFigCallback,EarlyStop
from torch.nn.init import ones_,zeros_,uniform_,normal_,constant_,eye_

In [4]:
class GatedIndRnnCell(nn.Module):
    """Recurrent cell with a single input-driven gate on the previous state.

    For an input ``x`` and previous state ``h`` the cell computes::

        gate  = gate_function(x @ gate_weights_ih.T + gate_bias)
        new_h = recurrent_function(h * gate + x @ weights_i.T + input_bias)

    The gate depends on the input only; the hidden-to-hidden gate weight
    (``gate_weights_hh``) was disabled by the author and is not created.

    Note: this definition shadows the ``GatedIndRnnCell`` imported from
    ``sequence_annotation.pytorch.customize_layer`` earlier in the notebook.

    Args:
        input_size: Number of features of each input vector.
        hidden_size: Number of features of the state.
    """
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self._gate_num = 1
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.gate_weights_ih = nn.Parameter(torch.empty(hidden_size, input_size))
        self.weights_i = nn.Parameter(torch.empty(hidden_size, input_size))
        self.gate_bias = nn.Parameter(torch.empty(hidden_size))
        self.input_bias = nn.Parameter(torch.empty(hidden_size))
        self.gate_function =  noisy_hard_sigmoid()
        self.recurrent_function = noisy_relu()
        self.reset_parameters()
        # Names of the tensors returned by `forward`, in the same order.
        self.output_names = ['new_h','pre_gate','gate','values_i','pre_h','pre_gate_i_ih']

    def reset_parameters(self):
        """Initialise weights uniformly in +-1/input_size, gate bias to 1, input bias to 0."""
        gate_bound_ih = (1/((self.input_size)))
        input_bound = (1/(self.input_size))
        uniform_(self.gate_weights_ih,-gate_bound_ih,gate_bound_ih)
        uniform_(self.weights_i,-input_bound,input_bound)
        constant_(self.gate_bias,1)
        constant_(self.input_bias,0)

    def forward(self, input, state):
        """Advance the cell by one step.

        Args:
            input: Tensor of shape (number, feature size).
            state: Previous state; it is multiplied element-wise by the gate,
                so it must broadcast against (number, hidden_size).

        Returns:
            Tuple ``(new_h, pre_gate, gate, values_i, pre_h, pre_gate_ih)``,
            matching ``self.output_names``.
        """
        # `nn.functional` is used instead of the alias `F`, which is never
        # imported in this notebook and raised a NameError on the first call.
        pre_gate_ih = nn.functional.linear(input, self.gate_weights_ih)
        pre_gate = pre_gate_ih + self.gate_bias
        gate = self.gate_function(pre_gate,training=self.training)
        values_i = nn.functional.linear(input, self.weights_i,self.input_bias)
        pre_h = state*gate+ values_i
        new_h = self.recurrent_function(pre_h,self.training)
        return new_h,pre_gate,gate,values_i,pre_h,pre_gate_ih

In [13]:
# Scratch arithmetic; the result is not referenced by any other visible cell.
96/8

12.0

In [None]:
def train(id_,loss,model,class_num):
    """Train ``model`` on the selected sequences and return the recorded metrics.

    Reads the notebook globals ``selected_fasta``, ``selected_seqs``,
    ``fasta``, ``ann_seqs`` and ``outlier_name``.

    Args:
        id_: Run identifier; appended to the TensorBoard log directory name.
        loss: Loss object handed to ``SimpleCompiler``.
        model: Model wrapped by ``SimpleModel``.
        class_num: Number of classes given to the train/validation metrics.

    Returns:
        ``pipeline._worker.result.data``; callers index it by metric name
        (e.g. ``'val_macro_F1'``).
    """
    merge_rule = {'gene':['exon','intron','mix'],'other':['other']}
    palette = {'other':'blue','gene':'red'}

    def make_optimizer(params):
        return optim.Adam(params,lr=1e-4)

    compiler = SimpleCompiler(make_optimizer,loss)
    stopper = EarlyStop(target='val_macro_F1',optimized="max",patient=16)
    log_dir = '../io/record/train_2019_03_04/model_'+str(id_)
    writer = TensorboardWriter(SummaryWriter(log_dir))
    builder = SimpleModel(model)
    training_set = {'inputs':selected_fasta,'answers':selected_seqs}
    data = AnnSeqData({'training':training_set},
                      discard_invalid_seq=True,validation_split=0.1)
    train_metric = CategoricalMetric(class_num=class_num,ignore_index=-1,
                                     class_names=selected_seqs.ANN_TYPES)
    val_metric = CategoricalMetric(prefix='val',class_num=class_num,ignore_index=-1,
                                   class_names=selected_seqs.ANN_TYPES)
    tensorboard = TensorboardCallback(writer)

    # Build the example shown in the figures from the outlier sequence: its
    # encoded nucleotides as input and its simplified annotation as answer.
    outlier_vecs = SeqConverter().seq2vecs(fasta[outlier_name])
    outlier_ann = ann_seqs.get(outlier_name)
    outlier_ann = ann_seq_processor.mixed_typed_seq_generate(outlier_ann)
    outlier_ann = ann_seq_processor.simplify_seq(outlier_ann,merge_rule)
    outlier_answer = ann_seq_processor.seq2vecs(outlier_ann)
    # Add a leading axis of size 1 and swap the last two axes
    # (presumably to match the 'NCL' order used by the trainer - confirm).
    outlier_batch = np.transpose(np.array([outlier_vecs]),[0,2,1])
    outlier_input = torch.from_numpy(outlier_batch).type('torch.FloatTensor').cuda()
    figure_colors = [palette[type_] for type_ in selected_seqs.ANN_TYPES]
    seq_fig = SeqFigCallback(writer,outlier_input,outlier_answer,
                             class_names=selected_seqs.ANN_TYPES,
                             colors=figure_colors,prefix='test')

    worker = Trainer(batch_size=16,
                     return_extra_info=True,
                     order='NCL',
                     order_target=['answers','inputs'],
                     pad_value={'answers':-1,'inputs':0},
                     epoch_num=100,
                     generator=SeqGenerator,
                     train_callbacks=[train_metric],
                     val_callbacks=[val_metric,stopper],
                     other_callbacks=[tensorboard,seq_fig],
                     writer=writer)
    pipeline = Pipeline(builder,data,worker,compiler,is_prompt_visible=True)
    pipeline.execute()
    return pipeline._worker.result.data
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# Search space handed to hyperopt's `fmin`.
# Quantised parameters: (label, low, high, step); they are sampled as floats
# and cast to int by `SpaceFinder.objective`.
_quantized_ranges = [
    ('cnn_num', 4, 8, 1),
    ('cnn_kernel_sizes', 64, 128, 8),
    ('cnn_outputs_num', 16, 32, 2),
]
# Continuous parameters: (label, low, high).
_uniform_ranges = [
    ('reduce_cnn_ratio', 0.1, 1),
    ('rnn_init_value', 0, 1),
    ('reversed_rnn_init_value', 0, 1),
]
space = {
    label: hp.quniform(label, low, high, step)
    for label, low, high, step in _quantized_ranges
}
space.update({
    label: hp.uniform(label, low, high)
    for label, low, high in _uniform_ranges
})
# The 'with_alphas' choice parameter is currently disabled.
class SpaceFinder:
    """Hyperopt objective that trains one model per sampled hyper-parameter set.

    Attributes:
        train_counter: Number of trials started so far; also the id of the
            most recent run.
        space_result: Maps run id to ``{'space': ..., 'val_macro_F1': best}``.
        records: Maps run id to the full metric record returned by ``train``.
        class_num: Number of annotation types in ``selected_seqs``.
        class_weights: Inverse-frequency weights for 'gene' and 'other',
            scaled so that the two weights sum to 2.
    """
    def __init__(self):
        self.train_counter = 0
        self.space_result = {}
        self.records = {}
        self.class_num=len(selected_seqs.ANN_TYPES)
        # Count once; the original called `ann_count` separately per class.
        counts = ann_count(selected_seqs)
        other_count = counts['other']
        gene_count = counts['gene']
        total = 1/other_count + 1/gene_count
        # Stored on the instance instead of a discarded local, so that a
        # weighted loss (the disabled `alphas` option) can use them.
        self.class_weights = {
            'gene': (2/gene_count)/total,
            'other': (2/other_count)/total,
        }

    def objective(self,space):
        """Train a model for one sampled point and report it to hyperopt.

        Args:
            space: Dict of sampled values with the keys 'cnn_num',
                'cnn_kernel_sizes', 'cnn_outputs_num', 'reduce_cnn_ratio',
                'rnn_init_value' and 'reversed_rnn_init_value'.

        Returns:
            Dict with 'loss' (negative best 'val_macro_F1', because ``fmin``
            minimises), 'status' and 'eval_time'.
        """
        # Increment first so the logged number equals the id used for the
        # TensorBoard directory and for the keys of the result dictionaries.
        self.train_counter +=1
        print(self.train_counter,space)
        # Quantised values are sampled as floats; cast them to int.
        cnn_num = int(space['cnn_num'])
        cnn_kernel_sizes = int(space['cnn_kernel_sizes'])
        cnn_outputs_num = int(space['cnn_outputs_num'])
        reduce_cnn_ratio = space['reduce_cnn_ratio']
        rnn_init_value = space['rnn_init_value']
        reversed_rnn_init_value = space['reversed_rnn_init_value']
        model = SeqAnnModel(in_channels=4,out_channels=2,
                            cnn_num=cnn_num,cnn_kernel_sizes=[cnn_kernel_sizes]*cnn_num,
                            cnn_outputs_num=[cnn_outputs_num]*cnn_num,
                            rnn_num=2,rnn_outputs_num=[4]*2,
                            rnn_cell_class=GatedIndRnnCell,
                            reduce_cnn_ratio=reduce_cnn_ratio,
                            rnn_init_value=rnn_init_value,
                            reversed_rnn_init_value=reversed_rnn_init_value,
                            use_CRF=True,
                            rnn_layer_norm=True
                           ).cuda()
        # The CRF loss is built from the transition parameters of the model's CRF.
        item_loss = CRFLoss(model.CRF.transitions).cuda()
        loss = BatchCRFLoss(item_loss).cuda()
        record = train(self.train_counter,loss,model,self.class_num)
        best = max(record['val_macro_F1'])
        self.space_result[self.train_counter] = {'space':space,'val_macro_F1':best}
        self.records[self.train_counter] = record
        return {'loss':-best,'status': STATUS_OK,'eval_time': time.time()}
# Run 32 trials guided by the Tree-structured Parzen Estimator; per-trial
# results are collected in `trials` and on `trainer`.
trials = Trials()
trainer = SpaceFinder()
best = fmin(
    fn=trainer.objective,
    space=space,
    algo=tpe.suggest,
    max_evals=32,
    trials=trials,
)

0 {'cnn_num': 5.0, 'reduce_cnn_ratio': 0.38298742597362734, 'cnn_outputs_num': 16.0, 'rnn_init_value': 0.27538791248272754, 'reversed_rnn_init_value': 0.04916035948670194, 'cnn_kernel_sizes': 120.0}
Processing data...
Processing model..
Compiling model...
Processing worker...
Executing...
Start working(2019-03-04 17:53:36)
[[ 0.0002  0.0002]
 [ 0.0002 -0.0002]]
1 {'F1_other': 0.8673302629624127, 'macro_F1': 0.4336651314812064, 'F1_gene': 0.0, 'val_loss': 5950.970703125, 'val_F1_gene': 0.0, 'accuracy': 0.7657397691500525, 'val_F1_other': 0.8625506985128436, 'loss': 45859.201171875, 'val_accuracy': 0.7583201267828843, 'val_macro_F1': 0.4312753492564218}
[[ 0.0004  0.0004]
 [ 0.0004 -0.0004]]
2 {'F1_other': 0.8673302629624127, 'macro_F1': 0.4336651314812064, 'F1_gene': 0.0, 'val_loss': 5061.4765625, 'val_F1_gene': 0.0, 'accuracy': 0.7657397691500525, 'val_F1_other': 0.8625506985128436, 'loss': 37629.5439453125, 'val_accuracy': 0.7583201267828843, 'val_macro_F1': 0.4312753492564218}
[[ 0.0

22 {'F1_other': 0.8391608391608392, 'macro_F1': 0.5722419792179196, 'F1_gene': 0.2781456953642384, 'val_loss': 878.7620849609375, 'val_F1_gene': 0.07962529274004684, 'accuracy': 0.7369359916054564, 'val_F1_other': 0.8125894134477826, 'loss': 4659.017822265625, 'val_accuracy': 0.6885895404120443, 'val_macro_F1': 0.45770333568745625}
[[ 0.00282808 -0.00137593]
 [-0.00190513 -0.00177679]]
23 {'F1_other': 0.8153098420413123, 'macro_F1': 0.5860156191652696, 'F1_gene': 0.3556288271314178, 'val_loss': 855.029052734375, 'val_F1_gene': 0.10201342281879194, 'accuracy': 0.7129066107030431, 'val_F1_other': 0.7800065767839527, 'loss': 4411.51318359375, 'val_accuracy': 0.6465927099841522, 'val_macro_F1': 0.4448276718577369}
[[ 0.00296962 -0.00157346]
 [-0.00210757 -0.00171493]]
24 {'F1_other': 0.7912503943632349, 'macro_F1': 0.5863396574376498, 'F1_gene': 0.37923485875117274, 'val_loss': 843.17919921875, 'val_F1_gene': 0.10532915360501567, 'accuracy': 0.6875655823714586, 'val_F1_other': 0.7612514639

[[ 0.00673519 -0.00558491]
 [-0.00617589  0.00170698]]
44 {'F1_other': 0.8734210014899269, 'macro_F1': 0.6799300101277617, 'F1_gene': 0.46066795473364613, 'val_loss': 560.7764282226562, 'val_F1_gene': 0.21018062397372742, 'accuracy': 0.7949632738719832, 'val_F1_other': 0.8485993075228203, 'loss': 2983.419189453125, 'val_accuracy': 0.745905969360803, 'val_macro_F1': 0.567337774963032}
[[ 0.00693501 -0.00578599]
 [-0.00637887  0.00190534]]
45 {'F1_other': 0.8769642799522442, 'macro_F1': 0.6863848803473618, 'F1_gene': 0.4651423762098471, 'val_loss': 544.6603393554688, 'val_F1_gene': 0.23127035830618892, 'accuracy': 0.7999475341028331, 'val_F1_other': 0.8511979823455234, 'loss': 2944.759521484375, 'val_accuracy': 0.7506603275224512, 'val_macro_F1': 0.5797106519590831}
[[ 0.00713495 -0.00598705]
 [-0.00658178  0.00210401]]
46 {'F1_other': 0.8795503730740657, 'macro_F1': 0.6943657451716841, 'F1_gene': 0.4792626728110599, 'val_loss': 526.3265380859375, 'val_F1_gene': 0.27466456195737965, 'acc

[[ 0.01114083 -0.01000483]
 [-0.01062938  0.00609841]]
66 {'F1_other': 0.9099461493544411, 'macro_F1': 0.7793106200857344, 'F1_gene': 0.619413216342199, 'val_loss': 391.88140869140625, 'val_F1_gene': 0.5102040816326531, 'accuracy': 0.8543546694648478, 'val_F1_other': 0.8720852764823451, 'loss': 2318.1174926757812, 'val_accuracy': 0.7971473851030111, 'val_macro_F1': 0.6984250435297497}
[[ 0.01134116 -0.01020557]
 [-0.01083134  0.0062984 ]]
67 {'F1_other': 0.910660953122971, 'macro_F1': 0.7814732165197211, 'F1_gene': 0.6238381629305632, 'val_loss': 389.235107421875, 'val_F1_gene': 0.5117460317460317, 'accuracy': 0.855613850996852, 'val_F1_other': 0.8717692179423044, 'loss': 2300.3213500976562, 'val_accuracy': 0.7968832540940306, 'val_macro_F1': 0.698613187246589}
[[ 0.01154149 -0.01040629]
 [-0.01103327  0.00649839]]
68 {'F1_other': 0.9118908382066276, 'macro_F1': 0.7850172626180041, 'F1_gene': 0.6305177111716621, 'val_loss': 387.03546142578125, 'val_F1_gene': 0.520702634880803, 'accurac

In [None]:
1 {'cnn_kernel_sizes': 120.0, 'reduce_cnn_ratio': 0.46717151978663296, 'rnn_init_value': 0.20125152917954714, 'cnn_outputs_num': 44.0, 'reversed_rnn_init_value': 0.5201781842874957, 'with_alphas': True, 'cnn_num': 6.0}

In [2]:
# Scratch check of the alphabetical order of the annotation type names.
a = sorted(['other', 'intron', 'exon'])
a

['exon', 'intron', 'other']