In [1]:
import logging

from eden.util import configure_logging

# Configure the root logger through EDeN's helper, at verbosity level 2.
configure_logging(logging.getLogger(), verbosity=2)

In [2]:
def rfam_uri(family_id, remote=False):
    """Return where the FASTA data for an Rfam family can be read from.

    The original cell defined this function twice, so the second definition
    (local file) silently replaced the first (download URL).  Both variants
    are kept here behind a flag; the default reproduces the definition that
    was actually in effect.

    Args:
        family_id: Rfam accession string, e.g. 'RF00005'.
        remote: if False (default) return the local file name
            '<family_id>.fa'; if True return the rfam.xfam.org alignment
            URL for the family (format=fastau).

    Returns:
        A string with either a local file name or a URL.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
    return '%s.fa' % (family_id)

In [3]:
# Rfam family analysed by this notebook.  Other families that were tried
# (swap the accession below to use one of them):
#   'RF02275'  Hammerhead_HH9
#   'RF00871'  microRNA mir-689
rfam_id = 'RF00005'  # tRNA

In [4]:
def pre_processor( data, **args):
    """Convert `data` to graphs with EDeN's RNAfold converter.

    All keyword arguments are forwarded unchanged to `rnafold_to_eden`,
    whose result is returned as is.
    """
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden(data, **args)

In [5]:
def pre_processor( data, **args):
    """Convert `data` to graphs with EDeN's RNAshapes converter.

    All keyword arguments are forwarded unchanged to `rnashapes_to_eden`,
    whose result is returned as is.
    """
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden(data, **args)

In [6]:
# EDeN graph vectorizer, created with its default arguments; it is passed to
# the model constructor as `vectorizer=` in the model-building cells.
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [7]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

# Linear estimator handed to the model.  The original cell built three
# estimators in a row and kept only the last one; the discarded alternatives
# are listed here instead of being instantiated and thrown away:
#   PassiveAggressiveClassifier(shuffle=True)
#   Perceptron(class_weight='auto', shuffle=True)
# NOTE(review): newer scikit-learn releases appear to replace
# class_weight='auto' with 'balanced' -- confirm against the installed version.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [8]:
# Experiment configuration shared by the cells below.
model_fname = 'eden_model_%s' % rfam_id  # name under which the optimized model is stored
size = 150               # positives kept by islice; negatives are capped at size * times
train_test_split = 0.5   # relative_size passed to random_bipartition_iter
n_iter = 20              # n_iter for model.optimize and number of random draws per sampled parameter
times = 2                # passed as times= to seq_to_seq when building negatives
n_jobs = 8               # n_jobs / pre_processor_n_jobs for the model and the estimator grid

#BinaryClassificationModel

In [9]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences and duplicate the stream: one copy is the
# positive class, the other goes through shuffle_modifier to make negatives.
seqs, seqs_ = tee(fasta_to_sequence(rfam_uri(rfam_id)))
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# Consider only the first `size` positives and `size * times` negatives.
iterable_pos = islice(seqs, size)
iterable_neg = islice(iterable_neg, size * times)

# Split each class into a train and a test iterator.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [10]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,2,3], 
                          'shape_type':[4,5], 
                          'energy_range':[5,10,20,30,40]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               max_total_time=-1, 
               n_iter=n_iter,
               n_inner_iter_estimator=5,
               cv=5,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [5, 10, 20, 30, 40]
   max_num: [1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.52778475  0.76985807  0.54059693  0.25416609  0.5888038   0.3683864
  0.34806239  0.43645223  0.42677193  0.1395003   0.11279333  0.59178554
  0.31955873  0.74416832  0.89157066  0.12894729  0.43573494  0.8946762
  0.51032348  0.59213283]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [ 86 150 168 115 189  65  27 183  80 113  98 171  93 189  14  29  40 190
 114 169]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.99365758  0.30518604  0.7779725   0.49062093  0.57091294  0.67960705
  0.38012782  0.62176999  0.49436924  0.46458576  0.98904455  0.23605035
  0.49101612  0.82194656  0.11035579  0.84041843 

In [15]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )

TypeError: 'NoneType' object is not iterable

A fitted model can be reloaded from disk by name:

In [12]:
# tee is no longer used in this cell; the import is retained so the kernel
# namespace stays the same as with the original cell.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model that was stored under model_fname.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first `size` sequences of the family.  The original cell
# split the stream with tee() but never read the second branch, which makes
# tee buffer every consumed item; the stream is read directly instead.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)

# Print rank and score in ascending score order.  The formatted single-argument
# print gives the same text as the Python 2 statement `print n,i` and also
# runs on Python 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -0.00164563903696
1 0.000706448549672
2 0.00102198477499
3 0.00179585188571
4 0.00217072221578
5 0.00290860930835
6 0.00322474618901
7 0.00322938403495
8 0.00329252049893
9 0.00421826605754
10 0.00425737677899
11 0.00454187236297
12 0.00486809237846
13 0.0048734430484
14 0.00495389892576
15 0.00514653369853
16 0.0051748809219
17 0.00537898827078
18 0.00567193720799
19 0.00581228582138
20 0.00611549247421
21 0.00626834725849
22 0.00664969406358
23 0.00668194824267
24 0.00669716143413
25 0.00683939721325
26 0.00687954824014
27 0.00706597745842
28 0.00707430912018
29 0.0070992603667
30 0.00727582082503
31 0.00749808425587
32 0.0075291696794
33 0.00760494589897
34 0.00761351989985
35 0.00769320084907
36 0.0077041063998
37 0.00795868272685
38 0.00812236880191
39 0.00852282789048
40 0.00852370558206
41 0.00862637417543
42 0.00882015233883
43 0.00890874176692
44 0.00898508024086
45 0.00908436195815
46 0.00910569327791
47 0.00913129547314
48 0.00920734121722
49 0.00924898126136
50 0.00930338

#ActiveLearningBinaryClassificationModel

In [13]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Rebuild fresh iterators for the active-learning run: the sequence stream is
# duplicated, one copy serving as positives and the other being passed
# through shuffle_modifier to produce negatives.
seqs, seqs_ = tee(fasta_to_sequence(rfam_uri(rfam_id)))
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# Consider only the first `size` positives and `size * times` negatives.
iterable_pos = islice(seqs, size)
iterable_neg = islice(iterable_neg, size * times)

# Split each class into a train and a test iterator.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [14]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
active_set_size = size * 2
model_fname='eden_model_active_%s'%rfam_id
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter, 
               size_positive=-1,
               size_negative=active_set_size,
               cv=5,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [10, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.63405141  0.49409925  0.43733058  0.84273391  0.74943742  0.12935857
  0.74796423  0.23494067  0.76513155  0.46857989  0.60615448  0.79020165
  0.72387158  0.42970979  0.38881726  0.74730415  0.56049453  0.27484479
  0.2972957   0.3557329 ]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [20 74 57 60 99 87 55 10 87 76 11 62 74  6 73 37 95 67 79 87]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.50258769  0.99178652  0.28111626  0.59125157  0.99734861  0.75121048
  0.71330955  0.51694746  0.63013414  0.61725927  0.80714054  0.1012318
  0.21776973  0.75706118  0.76356052  0.45972932  0.60465661  0.24708863
  0.27101693 

TypeError: 'NoneType' object is not iterable

In [None]:
%%time
#estimate predictive performance
# Calls model.estimate on the held-out positive and negative test iterators
# and unpacks its two return values (presumably average precision and ROC AUC,
# judging by the names -- TODO confirm against eden.model).
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )

In [None]:
# tee is no longer used in this cell; the import is retained so the kernel
# namespace stays the same as with the original cell.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model that was stored under the current model_fname.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first `size` sequences of the family.  The original cell
# split the stream with tee() but never read the second branch, which makes
# tee buffer every consumed item; the stream is read directly instead.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)

# Print rank and score in ascending score order.  The formatted single-argument
# print gives the same text as the Python 2 statement `print n,i` and also
# runs on Python 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))