In [1]:
import logging

from eden.util import configure_logging

# Hand the root logger to EDeN's logging helper; verbosity=2 is passed through as given.
configure_logging(logging.getLogger(), verbosity=2)

In [2]:
def rfam_uri(family_id, remote=False):
    """Return the location of the FASTA file for an Rfam family.

    Args:
        family_id: Rfam accession string, e.g. 'RF00005'.
        remote: when False (the default) return the relative file name
            '<family_id>.fa'; when True return the alignment download URL
            on rfam.xfam.org for the same accession.

    Returns:
        A string that can be handed to a FASTA reader.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
    return '%s.fa' % family_id

In [3]:
# Rfam accessions this notebook can be pointed at, keyed by family name.
RFAM_FAMILIES = {
    'Hammerhead_HH9': 'RF02275',
    'microRNA mir-689': 'RF00871',
    'tRNA': 'RF00005',
}

# Family used by every later cell; change the key to switch family.
rfam_id = RFAM_FAMILIES['tRNA']

In [4]:
def pre_processor(data, **args):
    """Turn ``data`` into graphs with ``rnafold_to_eden``.

    Any keyword arguments are forwarded unchanged to the converter and its
    result is returned as-is.

    NOTE: the following cell defines another ``pre_processor`` (built on
    ``rnashapes_to_eden``); when the cells are run in order, that later
    definition replaces this one.
    """
    # Imported at call time, so the converter is only needed when this runs.
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden(data, **args)

In [5]:
def pre_processor(data, **args):
    """Turn ``data`` into graphs with ``rnashapes_to_eden``.

    Any keyword arguments are forwarded unchanged to the converter and its
    result is returned as-is.
    """
    # Imported at call time, so the converter is only needed when this runs.
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden(data, **args)

In [6]:
# Vectorizer from eden.graph, built with its default arguments; later cells
# pass this object to the model constructor as `vectorizer`.
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [7]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

# Linear classifier handed to the model in later cells. Exactly one estimator
# is constructed, so it is unambiguous which one the notebook uses.
# Alternatives that can be assigned to `estimator` instead:
#   PassiveAggressiveClassifier(shuffle=True)
#   Perceptron(class_weight='auto', shuffle=True)
# NOTE(review): the estimator_parameters grids in later cells use SGDClassifier
# option names ('loss', 'learning_rate', ...), so they would presumably need
# adjusting when switching estimator.
# NOTE(review): class_weight='auto' matches the scikit-learn release this
# notebook was run with; newer releases spell this option 'balanced' - confirm
# before upgrading.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [8]:
#data setup

# Sampling: `size` caps how many positive sequences are read, `times` is passed
# to the negative-sequence generator and scales how many negatives are kept
# (size*times), and `train_test_split` is the relative size given to the
# train/test bipartition.
size = 100
times = 2
train_test_split = 0.5

# Search budget and parallelism: `n_iter` is given to optimize() and also sets
# how many random candidates are drawn per estimator parameter; `n_jobs` is
# given to the model and to the estimator parameter grid.
n_iter = 20
n_jobs = 8

# Name given to optimize() as model_name and later to load().
model_fname = 'eden_model_%s' % rfam_id

#BinaryClassificationModel

In [9]:
#create iterable from files
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences once and duplicate the stream: one copy is the
# positive set, the other feeds the shuffling modifier that yields negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

#consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size * times)

#split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [None]:
%%time
#make predictive model
# Bundles the pre_processor, vectorizer and estimator defined in earlier cells
# into one model object; the same n_jobs value is used for the model and for
# the pre-processing step.
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
# Candidate values for each stage, keyed by keyword-argument name.
# NOTE(review): presumably optimize() picks one value per key for each
# configuration it tries - confirm against eden.model.
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[5,10,20,30]}

vectorizer_parameters={'complexity':[2,3,4]}

# The random draws below are unseeded, so the candidate lists differ between runs.
                      # n_iter integers drawn from [5, 200)
estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      # n_iter floats drawn uniformly from [0.1, 0.9)
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      # only `low` is given, so numpy's default high=1.0 applies: [0.1, 1.0)
                      'power_t':uniform(0.1, size=n_iter),
                      # powers of ten from 1e-08 up to 0.1
                      'alpha': [10**x for x in range(-8,0)],
                      # 0.0001, 0.001, 0.01
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

# score_func collapses the mean and standard deviation of the `scoring` metric
# into one number (mean - 2*std), so configurations with a high spread are
# penalized.
# NOTE(review): max_total_time=-1 presumably means "no time limit" - confirm.
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               max_total_time=-1, 
               n_iter=n_iter,
               n_inner_iter_estimator=5,
               cv=5,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [5, 10, 20, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3, 4]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.17482417  0.69768703  0.68100948  0.62539939  0.23006934  0.84876059
  0.78816375  0.74518468  0.64119135  0.50315439  0.6391717   0.16274715
  0.67007622  0.73891609  0.27296863  0.62970486  0.65766094  0.74392974
  0.83304329  0.85660139]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [ 87  45  94  79 112 171 119  75 122 193 113  49 148 108 170  65  16 172
 113  68]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.77007352  0.9053417   0.74732784  0.96204492  0.1490612   0.15312946
  0.88928929  0.19962213  0.47111422  0.70008401  0.41142763  0.10431711
  0.18798093  0.73422708  0.10745526  0.29276504  0.83

In [None]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )

Models can be reloaded from disk using the name that was passed as `model_name` to `optimize`.

In [12]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Load the model stored under `model_fname` into a fresh model object.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Re-read the family's sequences; only the first copy of the duplicated stream
# (`seqs`) is consumed in this cell, `seqs_` is left untouched.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only first 'size' elements
iterable_pos = islice(iterable_pos,size)

predictions= model2.decision_function( iterable_pos )
# One "<rank> <score>" line per sequence, lowest score first. The formatted
# single-argument print(...) gives the same text under Python 2 and 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -6.6446876101e-05
1 -1.07330851415e-05
2 -2.46083103463e-06
3 2.68052962312e-06
4 1.16926712828e-05
5 1.17318784818e-05
6 1.93433825727e-05
7 1.96932378551e-05
8 2.57584126984e-05
9 2.59378132915e-05
10 2.7503257444e-05
11 2.79824615424e-05
12 3.24867866333e-05
13 3.47749441957e-05
14 3.58166488332e-05
15 3.98652223942e-05
16 4.31558950465e-05
17 4.48311550984e-05
18 4.50654252384e-05
19 4.52482516244e-05
20 4.63946753809e-05
21 4.86906838648e-05
22 5.07834661988e-05
23 5.187638526e-05
24 5.29556561857e-05
25 5.4765272365e-05
26 5.48137798329e-05
27 5.98309323683e-05
28 6.06973424699e-05
29 6.38006872195e-05
30 6.5079908308e-05
31 6.72991370485e-05
32 6.80366131352e-05
33 7.26730695852e-05
34 7.37198314876e-05
35 7.43107033208e-05
36 7.49635823821e-05
37 8.16504793707e-05
38 8.17668348854e-05
39 8.27806105623e-05
40 8.86700847357e-05
41 8.93631256127e-05
42 9.09916961444e-05
43 9.62287107949e-05
44 9.65642942761e-05
45 0.000100232964685
46 0.000100367041398
47 0.000104345669962
48 0.

#ActiveLearningBinaryClassificationModel

In [13]:
#create iterable from files
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences once and duplicate the stream: one copy is the
# positive set, the other feeds the shuffling modifier that yields negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

#consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size * times)

#split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [14]:
%%time
#make predictive model
# Bundles the pre_processor, vectorizer and estimator defined in earlier cells
# into one model object; the same n_jobs value is used for the model and for
# the pre-processing step.
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
# Candidate values for each stage, keyed by keyword-argument name.
# NOTE(review): presumably optimize() picks one value per key for each
# configuration it tries - confirm against eden.model.
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

# The random draws below are unseeded, so the candidate lists differ between runs.
                      # n_iter integers drawn from [5, 100)
estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      # n_iter floats drawn uniformly from [0.1, 0.9)
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      # only `low` is given, so numpy's default high=1.0 applies: [0.1, 1.0)
                      'power_t':uniform(0.1, size=n_iter),
                      # powers of ten from 1e-08 up to 0.1
                      'alpha': [10**x for x in range(-8,0)],
                      # 0.0001, 0.001, 0.01
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
# Passed to optimize() as size_negative below.
active_set_size = size * 2
# Rebinds model_fname, so any later load(model_fname) refers to this model.
model_fname='eden_model_active_%s'%rfam_id
# score_func collapses the mean and standard deviation of the `scoring` metric
# into one number (mean - 2*std), so configurations with a high spread are
# penalized.
# NOTE(review): size_positive=-1 presumably means "use all positives" - confirm.
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter, 
               size_positive=-1,
               size_negative=active_set_size,
               cv=5,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Iteration: 2/10 (after 104.7 sec; 0:01:44.729504)
Best score (roc_auc): 0.797 (0.893 +- 0.048)

Data:
Instances: 150 ; Features: 1048577 with an avg of 541 features per instance
class: 1 count:50 (0.33)	class: -1 count:100 (0.67)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-06
      eta0: 0.01
  l1_ratio: 0.276471138229
learning_rate: constant
      loss: log
    n_iter: 83
    n_jobs: 8
   penalty: l1
   power_t: 0.292384564217


	Iteration: 5/10 (after 264.1 sec; 0:04:24.088198)
Best score (roc_auc): 0.840 (0.931 +- 0.046)

Data:
Instances: 150 ; Features: 1048577 with an avg of 1894 features per instance
class: 1 count:50 (0.33)	class: -1 count:100 (0.67)	

	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 0.01
      eta0: 0.01
  l1_ratio: 0.175513021931
learning_rate: optimal
      loss: log
    n_iter: 72
   

In [15]:
%%time
#estimate predictive performance
# Scores the fitted model on the held-out positive/negative test iterables.
# NOTE(review): the two return values presumably correspond to the "APR" and
# "ROC" figures printed in this cell's output - confirm against eden.model.
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.01, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.17551302193112905,
       learning_rate='optimal', loss='log', n_iter=72, n_jobs=8,
       penalty='l2', power_t=0.3216459384400111, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 150 ; Features: 1048577 with an avg of 1898 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.93      0.98      0.96       100
          1       0.96      0.86      0.91        50

avg / total       0.94      0.94      0.94       150

APR: 0.975
ROC: 0.977
CPU times: user 3.2 s, sys: 350 ms, total: 3.55 s
Wall time: 7.5 s


In [16]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Load the model stored under `model_fname` into a fresh model object.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Re-read the family's sequences; only the first copy of the duplicated stream
# (`seqs`) is consumed in this cell, `seqs_` is left untouched.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs,seqs_=tee(seqs)
iterable_pos = seqs

#consider only first 'size' elements
iterable_pos = islice(iterable_pos,size)

predictions= model2.decision_function( iterable_pos )
# One "<rank> <score>" line per sequence, lowest score first. The formatted
# single-argument print(...) gives the same text under Python 2 and 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -0.207640159996
1 -0.0877555056891
2 -0.0407280268273
3 -0.0260552622429
4 -0.0211254675774
5 -0.0178619327287
6 -0.00909325943827
7 0.00225324336579
8 0.00672392984217
9 0.0215441101587
10 0.0318942605131
11 0.0333238497983
12 0.042915469343
13 0.0485582828176
14 0.0488817584683
15 0.0501445340556
16 0.0519468263948
17 0.0618222266992
18 0.0621923301874
19 0.0667016584461
20 0.069828839551
21 0.0785769146315
22 0.0804500994852
23 0.0804552053105
24 0.0970224313264
25 0.0987050937849
26 0.102642940265
27 0.104449823478
28 0.108785636695
29 0.111767694789
30 0.113620980433
31 0.115329338583
32 0.121520066786
33 0.129446807689
34 0.130007741977
35 0.132573103683
36 0.137276070024
37 0.143803508735
38 0.147786752678
39 0.158764185263
40 0.161517148664
41 0.165325263825
42 0.168385035823
43 0.186278008357
44 0.196543096791
45 0.201227519303
46 0.207959470582
47 0.212944753755
48 0.217177399584
49 0.218409993464
50 0.220422136437
51 0.222482890942
52 0.225526827033
53 0.2256955221
54 0.22