In [1]:
# Configure logging on the root logger with verbosity level 2.
import logging

from eden.util import configure_logging

configure_logging(logging.getLogger(), verbosity=2)

In [2]:
def rfam_uri(family_id, local=False):
    """Return the location of the FASTA alignment of an Rfam family.

    This cell used to define ``rfam_uri`` twice, so the local-file variant
    was silently replaced by the remote one. Both variants now live in a
    single function.

    Args:
        family_id: Rfam accession, e.g. 'RF00005'.
        local: when True, return the local file name '<family_id>.fa'
            instead of the rfam.xfam.org URL. Defaults to False, which is
            the behaviour of the definition that was previously in effect.

    Returns:
        A string holding either the remote URL or the local file name.
    """
    if local:
        return '%s.fa' % (family_id)
    return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)

In [3]:
# Rfam family used by every cell below. Alternatives previously listed here
# (and immediately overwritten): 'RF02275' (Hammerhead_HH9) and
# 'RF00871' (microRNA mir-689).
rfam_id = 'RF00005' #tRNA

In [4]:
def pre_processor(data, **args):
    """Convert ``data`` with EDeN's ``rnafold_to_eden`` converter.

    Every keyword argument is forwarded unchanged to the converter and the
    converter's result is returned as is.

    NOTE: the following cell defines ``pre_processor`` again (based on
    ``rnashapes_to_eden``); when the notebook is run in order, that later
    definition replaces this one.
    """
    # Deferred import: the converter module is only loaded on first call.
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden(data, **args)

In [5]:
def pre_processor(data, **args):
    """Convert ``data`` with EDeN's ``rnashapes_to_eden`` converter.

    Every keyword argument is forwarded unchanged to the converter and the
    converter's result is returned as is. The pre-processor parameter
    ranges used by the optimisation cells (``max_num``, ``shape_type``,
    ``energy_range``) reach the converter through ``**args``.
    """
    # Deferred import: the converter module is only loaded on first call.
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden(data, **args)

In [6]:
# Vectorizer instance, built with its default arguments; it is passed as
# `vectorizer=` to every model constructed in the cells below.
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [7]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

# Base estimator passed as `estimator=` to every model below. It is assigned
# exactly once: this cell used to build a PassiveAggressiveClassifier(shuffle=True)
# and a Perceptron(class_weight='auto', shuffle=True) first and immediately
# overwrote both, so only the SGDClassifier was ever in effect.
# NOTE(review): class_weight='auto' belongs to the older scikit-learn API this
# notebook was run with (see n_iter in the recorded output); confirm before
# running against a newer release.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [8]:
# Data setup: global settings shared by the cells below.
# File name passed as model_name to optimize() and later to load().
model_fname='eden_model_%s'%rfam_id
# Number of positive sequences kept; negatives are capped at size*times.
size=50
# Passed as relative_size to random_bipartition_iter for the train/test split.
train_test_split=0.5
# Length of the randomly drawn estimator parameter arrays; also used as the
# n_iter argument of optimize() in the optimisation cells.
n_iter=8
# Passed as times= to seq_to_seq when generating the negative sequences.
times=4
# Job count passed to the model (n_jobs, pre_processor_n_jobs) and offered as
# the only value of the estimator's n_jobs parameter.
n_jobs=8

#BinaryClassificationModel with Default Parameters

In [9]:
# Create the positive/negative sequence iterables for the selected family.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences and duplicate the stream: one copy provides
# the positives, the other is passed through shuffle_modifier for negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)

# Consider only the first 'size' positives ...
iterable_pos = islice(seqs, size)

# ... and the first 'size * times' modified sequences as negatives.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)
iterable_neg = islice(iterable_neg, size * times)

# Split each iterable into a train part and a test part.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476


In [10]:
%%time
# Build the predictive model from the shared pre-processor, estimator and
# vectorizer defined in the earlier cells.
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=5)

# Candidate values for every tunable parameter. With n_iter=1 (see the
# optimize() call) the recorded output reports "switching to default
# parameters" and the fitted model uses the first entry of each list.
from numpy.random import randint, uniform

pre_processor_parameters = dict(
    max_num=[3, 1, 2, 3],
    shape_type=[4, 5],
    energy_range=[30, 5, 10, 20, 30, 40])

vectorizer_parameters = dict(complexity=[2, 3])

# The random draws happen in the order randint, uniform (l1_ratio),
# uniform (power_t).
estimator_parameters = dict(
    n_iter=randint(5, 200, size=n_iter),
    penalty=['l1', 'l2', 'elasticnet'],
    l1_ratio=uniform(0.1, 0.9, size=n_iter),
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    power_t=uniform(0.1, size=n_iter),
    alpha=[10**x for x in range(-8, 0)],
    eta0=[10**x for x in range(-4, -1)],
    learning_rate=["invscaling", "constant", "optimal"],
    n_jobs=[n_jobs])

# Fit the model; it is saved under model_fname.
model.optimize(
    iterable_pos_train,
    iterable_neg_train,
    model_name=model_fname,
    n_iter=1,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.33418554  0.21963962  0.144801    0.55086128  0.78204454  0.43987457
  0.33839343  0.39592699]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [147  87  60 187  75 115  28 193]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.9394288   0.98815026  0.61164913  0.1143634   0.60606623  0.21615667
  0.2448983   0.39863655]
n_iter is 1: switching to default parameters
Saved current best model in eden_model_RF00005
CPU times: user 4.2 s, sys: 572 ms, total: 4.77 s
Wall time: 6.87 s


In [11]:
%%time
# Estimate predictive performance on the test iterables.
# print(...) with a single argument produces the same output under
# Python 2 and is also valid Python 3, unlike the bare print statement.
print(model.get_parameters())
apr, roc = model.estimate(iterable_pos_test, iterable_neg_test)


	Model parameters:

Pre_processor:
energy_range: 30
   max_num: 3
shape_type: 4

Vectorizer:
complexity: 2

Estimator:
     alpha: 1e-08
      eta0: 0.0001
  l1_ratio: 0.334185544899
learning_rate: invscaling
      loss: hinge
    n_iter: 147
    n_jobs: 8
   penalty: l1
   power_t: 0.939428796324

Classifier:
SGDClassifier(alpha=1e-08, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.33418554489883145,
       learning_rate='invscaling', loss='hinge', n_iter=147, n_jobs=8,
       penalty='l1', power_t=0.93942879632392096, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 824 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       1.00      0.03      0.06       100
          1       0.20      1.00      0.34        25

avg / total       0.84      0.22      0.11       125

APR: 0.621
ROC: 0.741

#BinaryClassificationModel with optimisation

In [12]:
# Re-create the positive/negative sequence iterables; the ones built for the
# previous section have already been handed to that section's model.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences and duplicate the stream: one copy provides
# the positives, the other is passed through shuffle_modifier for negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)

# Consider only the first 'size' positives ...
iterable_pos = islice(seqs, size)

# ... and the first 'size * times' modified sequences as negatives.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)
iterable_neg = islice(iterable_neg, size * times)

# Split each iterable into a train part and a test part.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476


In [13]:
%%time
# Build the predictive model from the shared pre-processor, estimator and
# vectorizer defined in the earlier cells.
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=5)

# Candidate values for every tunable parameter.
from numpy.random import randint, uniform

pre_processor_parameters = dict(
    max_num=[3, 1, 2, 3],
    shape_type=[4, 5],
    energy_range=[30, 5, 10, 20, 30, 40])

vectorizer_parameters = dict(complexity=[2, 3])

# The random draws happen in the order randint, uniform (l1_ratio),
# uniform (power_t).
estimator_parameters = dict(
    n_iter=randint(5, 200, size=n_iter),
    penalty=['l1', 'l2', 'elasticnet'],
    l1_ratio=uniform(0.1, 0.9, size=n_iter),
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    power_t=uniform(0.1, size=n_iter),
    alpha=[10**x for x in range(-8, 0)],
    eta0=[10**x for x in range(-4, -1)],
    learning_rate=["invscaling", "constant", "optimal"],
    n_jobs=[n_jobs])

# Optimise hyperparameters and fit the model. Candidates are ranked by
# mean score minus two standard deviations of the 'roc_auc' scoring.
# NOTE(review): max_total_time=-1 presumably disables the time limit -- confirm.
model.optimize(
    iterable_pos_train,
    iterable_neg_train,
    model_name=model_fname,
    max_total_time=-1,
    n_iter=n_iter,
    n_inner_iter_estimator=5,
    cv=5,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    two_steps_optimization=True,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [30, 5, 10, 20, 30, 40]
   max_num: [3, 1, 2, 3]
shape_type: [4, 5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.31258829  0.3834278   0.62296633  0.49285399  0.34305613  0.72833024
  0.56491681  0.7085739 ]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [165 143  39 180  76  28 165  65]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.93117423  0.12194226  0.11168518  0.82958922  0.18843868  0.6520813
  0.1807903   0.47740047]
iteration: (1/5) 1/8 score (roc_auc): -0.067 (0.108 +- 0.087)
iteration: (2/5) 1/8 score (roc_auc): 0.810 (0.928 +- 0.059)


	Iteration: 1/8 (after 10.0 sec; 0:00:10.045062)
Best score (roc_auc): 0.810 (0.928 +- 0.059)

Data:
Instances: 125 ; Features: 1048577 with an avg of 846 fea

In [14]:
%%time
# Estimate predictive performance on the test iterables.
# print(...) with a single argument produces the same output under
# Python 2 and is also valid Python 3, unlike the bare print statement.
print(model.get_parameters())
apr, roc = model.estimate(iterable_pos_test, iterable_neg_test)


	Model parameters:

Pre_processor:
energy_range: 10
   max_num: 3
shape_type: 5

Vectorizer:
complexity: 3

Estimator:
     alpha: 0.0001
      eta0: 0.01
  l1_ratio: 0.38342779841
learning_rate: constant
      loss: perceptron
    n_iter: 143
    n_jobs: 8
   penalty: l2
   power_t: 0.47740046982

Classifier:
SGDClassifier(alpha=0.0001, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.383427798409541,
       learning_rate='constant', loss='perceptron', n_iter=143, n_jobs=8,
       penalty='l2', power_t=0.47740046982037487, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1828 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.85      1.00      0.92       100
          1       1.00      0.32      0.48        25

avg / total       0.88      0.86      0.83       125

APR: 0.793
ROC: 0.91

Models can be reloaded from disk

In [15]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model that was saved under model_fname.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first 'size' sequences of the family. The stream is
# consumed exactly once here, so it is no longer duplicated with tee(): the
# second copy was never read and only made tee() buffer every item.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

# Print the decision values in ascending order, one "rank value" pair per
# line. The formatted single-argument print(...) gives the same text as the
# old `print n,i` statement and is also valid Python 3.
predictions = model2.decision_function(iterable_pos)
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
0 -0.00244867384637
1 -0.00232587992654
2 -0.00202897778446
3 -0.0019152072693
4 -0.00184700827564
5 -0.00139237252617
6 -0.00130011498856
7 -0.00129808767802
8 -0.00127266118329
9 -0.0010903694344
10 -0.00105371679253
11 -0.000891410044599
12 -0.000809706985617
13 -0.000444221987391
14 -0.000165919920118
15 -2.4968877392e-05
16 -2.31121883594e-05
17 0.000248732181698
18 0.000285568702631
19 0.00033953640287
20 0.000417273036804
21 0.000540995960699
22 0.000664323654133
23 0.000809973378205
24 0.00119850724326
25 0.0012089889189
26 0.00137843966026
27 0.00142588435815
28 0.00169989629721
29 0.00192206733694
30 0.00201313435088
31 0.00213499006778
32 0.00247260023454
33 0.00333818794276
34 0.008105038856
35 0.00842395777557
36 0.00843122783122
37 0.00849442377438
38 0.00856517435504
39 0.00885879765141
40 0.00929886193834
41 0.00932017368102
42 0.009492

#ActiveLearningBinaryClassificationModel

In [16]:
# Re-create the positive/negative sequence iterables for the active
# learning section.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# Read the family's sequences and duplicate the stream: one copy provides
# the positives, the other is passed through shuffle_modifier for negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)

# Consider only the first 'size' positives ...
iterable_pos = islice(seqs, size)

# ... and the first 'size * times' modified sequences as negatives.
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)
iterable_neg = islice(iterable_neg, size * times)

# Split each iterable into a train part and a test part.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476


In [17]:
%%time
# Build the predictive model from the shared pre-processor, estimator and
# vectorizer defined in the earlier cells (n_blocks=8 in this section).
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=8)

# Candidate values for every tunable parameter (narrower pre-processor
# ranges than in the previous sections).
from numpy.random import randint, uniform

pre_processor_parameters = dict(
    max_num=[1, 3],
    shape_type=[5],
    energy_range=[10, 30])

vectorizer_parameters = dict(complexity=[2, 3])

# The random draws happen in the order randint, uniform (l1_ratio),
# uniform (power_t).
estimator_parameters = dict(
    n_iter=randint(5, 100, size=n_iter),
    penalty=['l1', 'l2', 'elasticnet'],
    l1_ratio=uniform(0.1, 0.9, size=n_iter),
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    power_t=uniform(0.1, size=n_iter),
    alpha=[10**x for x in range(-8, 0)],
    eta0=[10**x for x in range(-4, -1)],
    learning_rate=["invscaling", "constant", "optimal"],
    n_jobs=[n_jobs])

# Passed below as size_negative.
active_set_size = size * 2

# NOTE: this rebinds model_fname, so any later cell (or re-run earlier cell)
# that reads it refers to the active-learning model file.
model_fname = 'eden_model_active_%s' % rfam_id

# Optimise hyperparameters with active learning and fit the model.
# Candidates are ranked by mean score minus two standard deviations of the
# 'roc_auc' scoring.
# NOTE(review): size_positive=-1 presumably means "use all positives" -- confirm.
model.optimize(
    iterable_pos_train,
    iterable_neg_train,
    model_name=model_fname,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    n_active_learning_iterations=4,
    n_iter=n_iter,
    size_positive=-1,
    size_negative=active_set_size,
    cv=5,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
energy_range: [10, 30]
   max_num: [1, 3]
shape_type: [5]

Vectorizer:
complexity: [2, 3]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.35729698  0.35895761  0.70615714  0.24710079  0.23582354  0.87530976
  0.26417093  0.64533669]
learning_rate: ['invscaling', 'constant', 'optimal']
      loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
    n_iter: [29 36 80 74 77 18 14 35]
    n_jobs: [8]
   penalty: ['l1', 'l2', 'elasticnet']
   power_t: [ 0.54660707  0.61053637  0.41022709  0.23531839  0.8363036   0.87466231
  0.72968736  0.55800003]
iteration: (1/5) 1/8 score (roc_auc): 0.642 (0.740 +- 0.049)


	Iteration: 1/8 (after 21.3 sec; 0:00:21.281985)
Best score (roc_auc): 0.642 (0.740 +- 0.049)

Data:
Instances: 35 ; Features: 1048577 with an avg of 540 features per instance
class: 1 count:25 (0.71)	class: -1 count:10 (0.29)	

	Model parameters:

Pr

In [18]:
%%time
# Estimate predictive performance on the test iterables; the two values
# returned by model.estimate are unpacked into apr and roc (the recorded
# output reports them as APR and ROC).
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.001, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.24710078857489606,
       learning_rate='optimal', loss='perceptron', n_iter=80, n_jobs=8,
       penalty='l2', power_t=0.72968736217300212, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 125 ; Features: 1048577 with an avg of 1218 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.91      0.96      0.93       100
          1       0.79      0.60      0.68        25

avg / total       0.88      0.89      0.88       125

APR: 0.816
ROC: 0.912
CPU times: user 1.11 s, sys: 358 ms, total: 1.47 s
Wall time: 2.48 s


In [19]:
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

# Reload the model that was saved under model_fname.
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first 'size' sequences of the family. The stream is
# consumed exactly once here, so it is no longer duplicated with tee(): the
# second copy was never read and only made tee() buffer every item.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

# Print the decision values in ascending order, one "rank value" pair per
# line. The formatted single-argument print(...) gives the same text as the
# old `print n,i` statement and is also valid Python 3.
predictions = model2.decision_function(iterable_pos)
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

Starting new HTTP connection (1): rfam.xfam.org
"GET /family/RF00005/alignment?acc=RF00005&format=fastau&download=0 HTTP/1.1" 200 90476
0 -0.107329581886
1 -0.0418255330633
2 -0.0390479312677
3 -0.0334953868928
4 -0.0271685778381
5 -0.0253016243178
6 -0.0235327558443
7 -0.022600673269
8 -0.0158863402782
9 -0.00568221758524
10 0.0143338709995
11 0.0169454439478
12 0.0234259137274
13 0.0313507916506
14 0.039940697623
15 0.0401388686104
16 0.0473337072015
17 0.049109268666
18 0.04959903352
19 0.0573042028738
20 0.0581557230679
21 0.0684821131256
22 0.0738550787497
23 0.0768959021205
24 0.0832661388126
25 0.0998120975962
26 0.110594574811
27 0.11122593637
28 0.111772268302
29 0.124323758288
30 0.139877426949
31 0.172249276246
32 0.278581368473
33 0.286223154407
34 0.331728146454
35 0.336430539937
36 0.33846762729
37 0.344637424562
38 0.353305734691
39 0.353869076013
40 0.355235169229
41 0.365874162857
42 0.375790818538
43 0.376978177257
44 0.405922179314
45 0.406549463837
46 0.426193301506