In [20]:
import logging

from eden.util import configure_logging

# Configure the root logger (logging.getLogger() with no name) through EDeN's
# helper so that the library's progress messages appear in the cell outputs.
# NOTE(review): the exact meaning of verbosity=2 is defined by
# eden.util.configure_logging -- confirm before changing it.
configure_logging(logging.getLogger(), verbosity=2)

In [21]:
def rfam_uri(family_id, remote=False):
    """Return the location of the FASTA alignment for an Rfam family.

    The function used to be defined twice in a row, so the first (URL)
    definition was silently shadowed by the second (local file) one. Both
    variants now live in a single function; the default reproduces the
    definition that was actually in effect.

    Parameters
    ----------
    family_id : str
        Rfam accession, e.g. 'RF00005'.
    remote : bool, optional
        If False (default) return the name of a local file '<family_id>.fa'.
        If True return the Rfam download URL for the family alignment.
        NOTE(review): confirm the rfam.xfam.org endpoint is still served
        before relying on remote=True.

    Returns
    -------
    str
        A file name or URL.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
    return '%s.fa' % family_id

In [22]:
# Rfam family used by every cell below (file name, model name, data loading).
# Only one assignment is kept: the earlier ones were overwritten immediately.
# Other families that were tried: 'RF02275' (Hammerhead_HH9) and
# 'RF00871' (microRNA mir-689).
rfam_id = 'RF00005'  # tRNA

In [23]:
def pre_processor(data, **args):
    """Convert input data to graphs with EDeN's RNAfold converter.

    `data` and every keyword argument are forwarded unchanged to
    `rnafold_to_eden`; its return value is returned as is.

    NOTE: the following cell defines another `pre_processor` (based on
    RNAshapes). When the cells run top to bottom, that later definition
    replaces this one.
    """
    # Imported lazily, exactly as before, so defining the function does not
    # require the converter to be importable.
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden(data, **args)

In [24]:
def pre_processor(data, **args):
    """Convert input data to graphs with EDeN's RNAshapes converter.

    `data` and every keyword argument are forwarded unchanged to
    `rnashapes_to_eden`; its return value is returned as is. The parameter
    search later in the notebook lists 'max_num', 'shape_type' and
    'energy_range' under the pre-processor parameters -- presumably those
    arrive here as keyword arguments (verify against eden.model).
    """
    # Imported lazily, exactly as before, so defining the function does not
    # require the converter to be importable.
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden(data, **args)

In [25]:
from eden.graph import Vectorizer
# Vectorizer built with the library defaults. It is handed to the model as
# `vectorizer=` in the cells below, and the parameter search there lists
# alternative values for its 'complexity' setting.
vectorizer = Vectorizer()

In [26]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

# Linear estimator handed to the model as `estimator=` in the cells below.
# Only the estimator that is actually used is instantiated; previously a
# PassiveAggressiveClassifier(shuffle=True) and a
# Perceptron(class_weight='auto', shuffle=True) were built first and then
# overwritten. Both remain imported so they can be swapped in here.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [27]:
# data setup

# Name under which model.optimize() saves the fitted model (passed as model_name=).
model_fname = 'eden_model_%s' % rfam_id

# Number of positive sequences kept; negatives are capped at size * times.
size = 50
# Passed as times= to the sequence shuffler that generates the negatives.
times = 4
# Passed as relative_size= to the random train/test bipartition.
train_test_split = 0.5

# Used twice: as the number of values sampled for each random estimator
# parameter, and as the n_iter= argument of model.optimize().
n_iter = 8
# Degree of parallelism given to the model, the pre-processor and the estimator.
n_jobs = 8

# BinaryClassificationModel with Default Parameters

In [28]:
# create iterables from files
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences once and duplicate the stream: `seqs` supplies
# the positives, `seqs_` feeds the shuffler that produces the negatives.
seqs, seqs_ = tee(fasta_to_sequence(rfam_uri(rfam_id)))

# Negatives are shuffled versions of the same sequences.
# NOTE(review): presumably `times` shuffled variants are emitted per input
# sequence and `order` controls the shuffling granularity -- confirm in
# eden.modifier.seq.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# consider only the first 'size' positives (and size * times negatives)
iterable_pos = islice(seqs, size)
iterable_neg = islice(iterable_neg, size * times)

# split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [29]:
%%time
#make predictive model
from numpy.random import randint
from numpy.random import uniform

from eden.model import ActiveLearningBinaryClassificationModel

model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=5)

#optimize hyperparameters and fit model
# Candidate values per parameter. The recorded run below (n_iter=1) logs
# "switching to default parameters" and then reports the first element of
# every list, which appears to be why 3 and 30 are listed first and then
# repeated inside their ranges.
pre_processor_parameters = {
    'max_num': [3, 1, 2, 3],
    'shape_type': [4, 5],
    'energy_range': [30, 5, 10, 20, 30, 40]}

vectorizer_parameters = {'complexity': [2, 3]}

# The random draws are kept in their original order (randint, uniform,
# uniform) so the sequence of numbers pulled from NumPy's global generator
# is unchanged.
estimator_parameters = {
    # n_iter integers drawn from [5, 200)
    'n_iter': randint(5, 200, size=n_iter),
    'penalty': ['l1', 'l2', 'elasticnet'],
    # n_iter floats drawn from [0.1, 0.9)
    'l1_ratio': uniform(0.1, 0.9, size=n_iter),
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    # only `low` is given, so the floats are drawn from [0.1, 1.0)
    'power_t': uniform(0.1, size=n_iter),
    # 1e-8 ... 1e-1
    'alpha': [10**x for x in range(-8, 0)],
    # 1e-4 ... 1e-2
    'eta0': [10**x for x in range(-4, -1)],
    'learning_rate': ["invscaling", "constant", "optimal"],
    'n_jobs': [n_jobs]}

# n_iter=1: a single configuration is fitted and saved under model_fname.
model.optimize(
    iterable_pos_train, iterable_neg_train,
    model_name=model_fname,
    n_iter=1,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.45833187  0.46905387  0.64718992  0.61538876  0.47578163  0.45277172
  0.45216319  0.17156218]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [180  62 198  63 106  67  20 153]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.59115381  0.54069415  0.44542761  0.50605515  0.83524811  0.17475943
  0.82496197  0.66282405]
n_iter is 1: switching to default parameters
Saved current best model in eden_model_RF00005
CPU times: user 4.11 s, sys: 686 ms, total: 4.8 s
Wall time: 7.88 s


In [30]:
%%time
#estimate predictive performance
# print(...) with a single argument behaves identically to the Python 2 print
# statement used before, and also parses on Python 3.
print(model.get_parameters())
# Evaluate on the held-out halves of the positive and negative streams. The
# recorded output shows a classification report followed by APR and ROC
# values -- presumably the two numbers returned here.
apr, roc = model.estimate(iterable_pos_test, iterable_neg_test)


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-08
      eta0: 0.0001
  l1_ratio: 0.458331867897
learning_rate: invscaling
      loss: hinge
    n_iter: 180
    n_jobs: 8
   penalty: l1
   power_t: 0.591153810828

Classifier:
SGDClassifier(alpha=1e-08, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.45833186789726621,
       learning_rate='invscaling', loss='hinge', n_iter=180, n_jobs=8,
       penalty='l1', power_t=0.59115381082770557, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 824 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       1.00      0.02      0.04       100
          1       0.20      1.00      0.34        25

avg / total       0.84      0.22      0.10       125

APR: 0.648
ROC: 0.778

# BinaryClassificationModel with optimisation

In [31]:
# create iterables from files
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# The streams are single-use iterators, so they are rebuilt from the file
# before each experiment. `seqs` supplies the positives, `seqs_` feeds the
# shuffler that produces the negatives.
seqs, seqs_ = tee(fasta_to_sequence(rfam_uri(rfam_id)))

# Negatives are shuffled versions of the same sequences.
# NOTE(review): presumably `times` shuffled variants are emitted per input
# sequence and `order` controls the shuffling granularity -- confirm in
# eden.modifier.seq.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# consider only the first 'size' positives (and size * times negatives)
iterable_pos = islice(seqs, size)
iterable_neg = islice(iterable_neg, size * times)

# split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [32]:
%%time
#make predictive model
from numpy.random import randint
from numpy.random import uniform

from eden.model import ActiveLearningBinaryClassificationModel

model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=5)

#optimize hyperparameters and fit model
# Candidate values per parameter; the leading 3 and 30 are repeated inside
# their ranges exactly as in the default-parameter experiment above.
pre_processor_parameters = {
    'max_num': [3, 1, 2, 3],
    'shape_type': [4, 5],
    'energy_range': [30, 5, 10, 20, 30, 40]}

vectorizer_parameters = {'complexity': [2, 3]}

# The random draws are kept in their original order (randint, uniform,
# uniform) so the sequence of numbers pulled from NumPy's global generator
# is unchanged.
estimator_parameters = {
    # n_iter integers drawn from [5, 200)
    'n_iter': randint(5, 200, size=n_iter),
    'penalty': ['l1', 'l2', 'elasticnet'],
    # n_iter floats drawn from [0.1, 0.9)
    'l1_ratio': uniform(0.1, 0.9, size=n_iter),
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    # only `low` is given, so the floats are drawn from [0.1, 1.0)
    'power_t': uniform(0.1, size=n_iter),
    # 1e-8 ... 1e-1
    'alpha': [10**x for x in range(-8, 0)],
    # 1e-4 ... 1e-2
    'eta0': [10**x for x in range(-4, -1)],
    'learning_rate': ["invscaling", "constant", "optimal"],
    'n_jobs': [n_jobs]}

# Search over the ranges above, scoring each candidate by cross-validated
# ROC AUC. score_func penalises unstable candidates: mean minus two standard
# deviations (the log line "0.724 (0.910 +- 0.093)" is 0.910 - 2 * 0.093).
# NOTE(review): the semantics of max_total_time=-1, n_inner_iter_estimator
# and two_steps_optimization are defined by eden.model -- confirm there.
model.optimize(
    iterable_pos_train, iterable_neg_train,
    model_name=model_fname,
    max_total_time=-1,
    n_iter=n_iter,
    n_inner_iter_estimator=5,
    cv=5,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    two_steps_optimization=True,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.63439113  0.22181719  0.75253848  0.63675579  0.6930968   0.43530866
  0.27201489  0.78718222]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [137 146  79  66  79 141  10 168]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.4449138   0.69399323  0.61329442  0.15718889  0.8751706   0.89029824
  0.3707164   0.43369971]
iteration: (1/5) 1/8 score (roc_auc): 0.724 (0.910 +- 0.093)


	Iteration: 1/8 (after 7.3 sec; 0:00:07.319340)
Best score (roc_auc): 0.724 (0.910 +- 0.093)

Data:
Instances: 125 ; Features: 1048577 with an avg of 846 features per instance
class: 1 count:25 (0.20)	class: -1 count:10

In [33]:
%%time
#estimate predictive performance
# print(...) with a single argument behaves identically to the Python 2 print
# statement used before, and also parses on Python 3.
print(model.get_parameters())
# Evaluate on the held-out halves of the positive and negative streams. The
# recorded output shows a classification report followed by an APR value
# (and, in the other experiments, a ROC value) -- presumably the two numbers
# returned here.
apr, roc = model.estimate(iterable_pos_test, iterable_neg_test)


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 1
shape_type: 5

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-07
      eta0: 0.0001
  l1_ratio: 0.636755794178
learning_rate: optimal
      loss: squared_hinge
    n_iter: 168
    n_jobs: 8
   penalty: l2
   power_t: 0.370716399179

Classifier:
SGDClassifier(alpha=1e-07, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.63675579417838901,
       learning_rate='optimal', loss='squared_hinge', n_iter=168, n_jobs=8,
       penalty='l2', power_t=0.37071639917922394, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 539 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.91      0.96      0.93       100
          1       0.79      0.60      0.68        25

avg / total       0.88      0.89      0.88       125

APR: 0.846


Models can be reloaded from disk

In [34]:
from itertools import islice

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model that model.optimize() saved under `model_fname`.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Score the first 'size' sequences of the family. The stream is no longer
# duplicated with tee(): the second copy was never consumed here, which only
# made tee() buffer every sequence read from the first copy.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)
# Print rank and score, lowest score first. The single-argument print(...)
# form produces the same text as the Python 2 `print n,i` statement and also
# parses on Python 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -1.97101822093e+13
1 -7.35764193893e+12
2 -7.09754592034e+12
3 -4.7882014607e+12
4 -3.37429885621e+12
5 -3.16913100515e+12
6 -1.81069792889e+12
7 -1.55226667126e+12
8 -1.25202997698e+12
9 -797510914496.0
10 2.40676296741e+12
11 2.69653383959e+12
12 3.62637825702e+12
13 5.79260195555e+12
14 6.00823840975e+12
15 6.09694460939e+12
16 9.26065405415e+12
17 1.04129183199e+13
18 1.1657460466e+13
19 1.1934315022e+13
20 1.21041875587e+13
21 1.21574725905e+13
22 1.24561293146e+13
23 1.42181635901e+13
24 1.53622586791e+13
25 1.57535802871e+13
26 1.59110990495e+13
27 1.62549991833e+13
28 2.11702970663e+13
29 2.13733146274e+13
30 2.54813724991e+13
31 2.68358031418e+13
32 3.06106497162e+13
33 3.49245750265e+13
34 3.73941421185e+13
35 3.7468913099e+13
36 3.79534065309e+13
37 3.80124335864e+13
38 3.81243298545e+13
39 3.88708629459e+13
40 4.11960575865e+13
41 4.1222841921e+13
42 4.31545695103e+13
43 4.32748660092e+13
44 4.68674479294e+13
45 4.77337480834e+13
46 4.82046871131e+13
47 4.91709038027e+13


# ActiveLearningBinaryClassificationModel

In [35]:
# create iterables from files
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# The streams are single-use iterators, so they are rebuilt from the file
# before each experiment. `seqs` supplies the positives, `seqs_` feeds the
# shuffler that produces the negatives.
seqs, seqs_ = tee(fasta_to_sequence(rfam_uri(rfam_id)))

# Negatives are shuffled versions of the same sequences.
# NOTE(review): presumably `times` shuffled variants are emitted per input
# sequence and `order` controls the shuffling granularity -- confirm in
# eden.modifier.seq.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# consider only the first 'size' positives (and size * times negatives)
iterable_pos = islice(seqs, size)
iterable_neg = islice(iterable_neg, size * times)

# split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [36]:
%%time
#make predictive model
from numpy.random import randint
from numpy.random import uniform

from eden.model import ActiveLearningBinaryClassificationModel

model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=8)

#optimize hyperparameters and fit model
# Narrower pre-processor ranges than in the previous experiments.
pre_processor_parameters = {
    'max_num': [1, 3],
    'shape_type': [5],
    'energy_range': [10, 30]}

vectorizer_parameters = {'complexity': [2, 3]}

# The random draws are kept in their original order (randint, uniform,
# uniform) so the sequence of numbers pulled from NumPy's global generator
# is unchanged.
estimator_parameters = {
    # n_iter integers drawn from [5, 100)
    'n_iter': randint(5, 100, size=n_iter),
    'penalty': ['l1', 'l2', 'elasticnet'],
    # n_iter floats drawn from [0.1, 0.9)
    'l1_ratio': uniform(0.1, 0.9, size=n_iter),
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    # only `low` is given, so the floats are drawn from [0.1, 1.0)
    'power_t': uniform(0.1, size=n_iter),
    # 1e-8 ... 1e-1
    'alpha': [10**x for x in range(-8, 0)],
    # 1e-4 ... 1e-2
    'eta0': [10**x for x in range(-4, -1)],
    'learning_rate': ["invscaling", "constant", "optimal"],
    'n_jobs': [n_jobs]}

# Passed as size_negative= below.
active_set_size = size * 2
# NOTE: this rebinds the global model_fname, so any cell executed after this
# one saves to / loads from the active-learning model file.
model_fname = 'eden_model_active_%s' % rfam_id

# score_func penalises unstable candidates: mean minus two standard
# deviations (the log line "0.380 (0.800 +- 0.210)" is 0.800 - 2 * 0.210).
# NOTE(review): the semantics of n_active_learning_iterations,
# size_positive=-1 and size_negative are defined by eden.model -- confirm
# there.
model.optimize(
    iterable_pos_train, iterable_neg_train,
    model_name=model_fname,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    n_active_learning_iterations=4,
    n_iter=n_iter,
    size_positive=-1,
    size_negative=active_set_size,
    cv=5,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [10, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.41498197  0.66027452  0.4106494   0.74756393  0.78836933  0.41586935
  0.19717904  0.1883821 ]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [38 39 18 89 16 92 41 37]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.4883884   0.3461041   0.78654708  0.51250154  0.34585254  0.54596373
  0.56011454  0.23687811]
iteration: (1/5) 1/8 score (roc_auc): 0.380 (0.800 +- 0.210)


	Iteration: 1/8 (after 36.1 sec; 0:00:36.143424)
Best score (roc_auc): 0.380 (0.800 +- 0.210)

Data:
Instances: 35 ; Features: 1048577 with an avg of 892 features per instance
class: 1 count:25 (0.71)	class: -1 count:10 (0.29)	

	Model parameters:

Pr

In [37]:
%%time
#estimate predictive performance
# Evaluate the fitted model on the held-out halves of the positive and
# negative streams. The recorded output shows a classification report
# followed by APR and ROC values -- presumably the two numbers returned here.
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=1e-06, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.78836933000807574,
       learning_rate='optimal', loss='log', n_iter=41, n_jobs=8,
       penalty='elasticnet', power_t=0.48838839698808101,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 538 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.90      0.95      0.92       100
          1       0.74      0.56      0.64        25

avg / total       0.86      0.87      0.87       125

APR: 0.805
ROC: 0.919
CPU times: user 1.13 s, sys: 393 ms, total: 1.52 s
Wall time: 2.58 s


In [38]:
from itertools import islice

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model saved under `model_fname`, which the active-learning cell
# above rebound to 'eden_model_active_<rfam_id>'.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Score the first 'size' sequences of the family. The stream is no longer
# duplicated with tee(): the second copy was never consumed here, which only
# made tee() buffer every sequence read from the first copy.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)
# Print rank and score, lowest score first. The single-argument print(...)
# form produces the same text as the Python 2 `print n,i` statement and also
# parses on Python 3.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -9.15441193458
1 -3.26274204084
2 -2.92329359638
3 -2.53663040528
4 -2.50864632443
5 -1.51944960973
6 -1.45082184291
7 -0.912452486938
8 -0.893883898788
9 -0.844319188942
10 -0.584092410651
11 1.29428172831
12 1.55886611575
13 2.65707754247
14 3.2153114505
15 3.30005985442
16 3.66643922389
17 4.37081449884
18 4.81857917744
19 4.91645421641
20 5.49344923367
21 5.54164847005
22 6.06949893411
23 6.25015214694
24 6.83042294098
25 7.5396420058
26 7.64529287534
27 8.29028070133
28 8.4935455855
29 8.66314069637
30 9.32892574504
31 11.6503877828
32 14.3215794701
33 14.5877776096
34 15.5673705644
35 16.0083179259
36 17.3348909386
37 19.4410047235
38 19.9094633179
39 20.1682547975
40 20.4469795704
41 21.4644840854
42 21.6500559136
43 21.9604598916
44 22.1846592975
45 22.5251133201
46 23.7537913933
47 25.187098142
48 25.6844746392
49 29.5405464382
