In [1]:
def rfam_uri(family_id, remote=False):
    """Return where the FASTA data for an Rfam family is read from.

    The cell used to define this function twice, so the first (remote URL)
    version was silently replaced by the second (local file) one. Both
    variants now live in a single definition; the default keeps the behaviour
    that was actually in effect.

    Args:
        family_id: Rfam accession string, e.g. 'RF00005'.
        remote: when False (default) return the local file name
            '<family_id>.fa'; when True return the rfam.xfam.org alignment
            URL for the family (query string ``format=fastau&download=0``).

    Returns:
        str: the file name or URL.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0' % (family_id, family_id)
    return '%s.fa' % (family_id)

In [2]:
# Rfam family used throughout the notebook.
# Other families noted in this cell: RF02275 (Hammerhead_HH9),
# RF00871 (microRNA mir-689).
rfam_id = 'RF00005'  # tRNA

In [3]:
def pre_processor(data, **args):
    """Convert ``data`` to graphs with ``rnafold_to_eden``.

    Every keyword argument is forwarded unchanged to the converter, and the
    converter's result is returned as-is.

    NOTE(review): a later cell defines ``pre_processor`` again (based on
    ``rnashapes_to_eden``); when the cells are run in order, that later
    definition replaces this one.
    """
    # Function-scope import, as in the original cell: the converter module is
    # only loaded when the pre-processor is actually called.
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden(data, **args)

In [4]:
def pre_processor(data, **args):
    """Convert ``data`` to graphs with ``rnashapes_to_eden``.

    Every keyword argument is forwarded unchanged to the converter, and the
    converter's result is returned as-is.

    NOTE(review): an earlier cell defines a function with the same name based
    on ``rnafold_to_eden``; when the cells are run in order, this definition
    is the one left in effect.
    """
    # Function-scope import, as in the original cell: the converter module is
    # only loaded when the pre-processor is actually called.
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden(data, **args)

In [5]:
# Build the graph vectorizer with its constructor defaults. The instance is
# later handed to the model constructor via `vectorizer=vectorizer`.
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [6]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier

# Base estimator handed to the model. The cell used to build three estimators
# in a row and keep only the last one; the two discarded constructions were
# dead stores and have been removed. Alternatives that were listed:
#   PassiveAggressiveClassifier(shuffle=True)
#   Perceptron(class_weight='auto', shuffle=True)
# The hyperparameter ranges defined further down (penalty, l1_ratio, loss,
# power_t, eta0, learning_rate) are passed as `estimator_parameters` for this
# estimator.
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [7]:
# Data setup: constants shared by the cells below.
# Name passed as `model_name` to model.optimize() and later to model2.load().
model_fname='eden_model_%s'%rfam_id
# Number of positive sequences kept (islice limit); negatives are capped at
# size*times.
size=100
# Passed as `relative_size` to random_bipartition_iter for the train/test split.
train_test_split=0.5
# Passed as `n_iter` to model.optimize() and used as the number of random
# candidate values drawn for the estimator hyperparameters.
n_iter=10
# Passed as `times` to seq_to_seq when building the shuffled negatives.
times=2
# Passed as `n_jobs` / `pre_processor_n_jobs` to the model and listed as the
# only `n_jobs` candidate for the estimator.
n_jobs=8

#BinaryClassificationModel

In [8]:
# Create the positive/negative sequence iterables from the family's FASTA
# file and split each of them into a train and a test part.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# The sequence stream is duplicated with tee(): one copy provides the
# positives, the other is fed through the shuffle modifier to obtain negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# Consider only the first 'size' positives and the first 'size*times' negatives.
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size * times)

# Split train/test.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [9]:
# Configure logging through eden's helper with verbosity=2 and keep the
# returned object in `logger`.
from eden.util import configure_logging
logger=configure_logging(verbosity=2)

In [10]:
%%time
# Make the predictive model, then optimize its hyperparameters and fit it.
from numpy.random import randint, uniform

from eden.model import ActiveLearningBinaryClassificationModel

model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=5)

# Candidate values for each stage of the pipeline.
pre_processor_parameters = {
    'max_num': [1, 3],
    'shape_type': [5],
    'energy_range': [5, 10, 20, 30],
}

vectorizer_parameters = {'complexity': [2, 3, 4]}

# The random candidates below are drawn without a fixed seed, so they differ
# from run to run. For 'power_t' only the lower bound (0.1) is given, so the
# upper bound is numpy's default.
estimator_parameters = {
    'n_iter': randint(5, 200, size=n_iter),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'l1_ratio': uniform(0.1, 0.9, size=n_iter),
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'power_t': uniform(0.1, size=n_iter),
    'alpha': [10**x for x in range(-8, 0)],
    'eta0': [10**x for x in range(-4, -1)],
    'learning_rate': ["invscaling", "constant", "optimal"],
    'n_jobs': [n_jobs],
}

# score_func combines the cross-validation mean and spread into one number:
# the mean minus twice the standard deviation.
model.optimize(
    iterable_pos_train, iterable_neg_train,
    model_name=model_fname,
    max_total_time=-1,
    n_iter=n_iter,
    cv=5,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    two_steps_optimization=True,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
        energy_range: [5, 10, 20, 30]
             max_num: [1, 3]
          shape_type: [5]

Vectorizer:
          complexity: [2, 3, 4]

Estimator:
               alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
                eta0: [0.0001, 0.001, 0.01]
            l1_ratio: [ 0.50972202  0.52247731  0.76347771  0.23966287  0.50264757  0.89185564
  0.83363642  0.8214972   0.16630364  0.24644444]
       learning_rate: ['invscaling', 'constant', 'optimal']
                loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
              n_iter: [ 31 103 138 178  93 162  62 121 165  36]
              n_jobs: [8]
             penalty: ['l1', 'l2', 'elasticnet']
             power_t: [ 0.57211458  0.42522266  0.83041617  0.2732233   0.88254919  0.30481235
  0.45614541  0.12845862  0.71006023  0.80139208]
iteration: 1  score: -0.077 +- 0.055
iteration: 2  score: -0.013 +- 0.077
iteration: 3  score: 0.613 +- 0.132


	It

CPU times: user 41.6 s, sys: 7.44 s, total: 49 s
Wall time: 2min 30s


In [11]:
%%time
# Estimate predictive performance on the held-out test iterables.
# print(...) with a single parenthesised expression prints the same text under
# Python 2's print statement and is also valid under Python 3, where the bare
# `print x` form is a SyntaxError.
print(model.get_parameters())
# The two values returned by estimate() are kept as `apr` and `roc`.
apr, roc = model.estimate(iterable_pos_test, iterable_neg_test)


	Model parameters:

Pre_processor:
        energy_range: 20
             max_num: 3
          shape_type: 5

Vectorizer:
          complexity: 3

Estimator:
               alpha: 1e-05
                eta0: 0.0001
            l1_ratio: 0.502647569971
       learning_rate: invscaling
                loss: log
              n_iter: 165
              n_jobs: 8
             penalty: l1
             power_t: 0.882549193468


  'precision', 'predicted', average, warn_for)

Classifier:
SGDClassifier(alpha=1e-05, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.50264756997058146,
       learning_rate='invscaling', loss='log', n_iter=165, n_jobs=8,
       penalty='l1', power_t=0.88254919346808447, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 150 ; Features: 1048577 with an avg of 2238 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.00      0.00      0.00       100
          1       0.33      1.00      0.50        50

avg / total       0.11      0.33      0.17       150

APR: 0.861
ROC: 0.869


CPU times: user 3.18 s, sys: 323 ms, total: 3.5 s
Wall time: 9.3 s


Models can be reloaded from disk

In [12]:
# Reload the optimized model from disk and score the family's sequences.
from itertools import islice

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first 'size' elements. The sequences are read once here,
# so the tee() copied from the training cell is dropped: its second iterator
# was never consumed and only made tee buffer every sequence read.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)
# Print rank and score in ascending score order. The %-formatted single
# argument gives the same "n score" text as `print n,i` on Python 2 and is
# valid syntax on Python 3 as well.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 3.38443237819e-05
1 3.43950218287e-05
2 3.44799711754e-05
3 3.60593961094e-05
4 3.65225474212e-05
5 3.76220188634e-05
6 3.79503931575e-05
7 3.85275612726e-05
8 3.88309055396e-05
9 3.89382391428e-05
10 3.91979910153e-05
11 3.92709911338e-05
12 3.93573395548e-05
13 3.98779391339e-05
14 3.99829012576e-05
15 4.0338375574e-05
16 4.06792966317e-05
17 4.2232664065e-05
18 4.23464816945e-05
19 4.23526245153e-05
20 4.24345645333e-05
21 4.24484018849e-05
22 4.27598274699e-05
23 4.3087732491e-05
24 4.39313073744e-05
25 4.41522508749e-05
26 4.43025054844e-05
27 4.46216321382e-05
28 4.46841487077e-05
29 4.47785122364e-05
30 4.51464236063e-05
31 4.52270654891e-05
32 4.58113936759e-05
33 4.6787631824e-05
34 4.6928677669e-05
35 4.70596556566e-05
36 4.73085939679e-05
37 4.73535854385e-05
38 4.73763414958e-05
39 4.73980718572e-05
40 4.74630285977e-05
41 4.8077269638e-05
42 4.81967520921e-05
43 4.84493643043e-05
44 4.84884940519e-05
45 4.86012701409e-05
46 4.87036199363e-05
47 4.87625768795e-05
48 4.887

#ActiveLearningBinaryClassificationModel

In [13]:
# Rebuild the positive/negative sequence iterables from the family's FASTA
# file and split each of them into a train and a test part.
from itertools import islice, tee

from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# tee() duplicates the sequence stream: the first copy provides the positives,
# the second goes through the shuffle modifier to obtain the negatives.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq(seqs_, modifier=shuffle_modifier, times=times, order=2)

# Consider only the first 'size' positives and the first 'size*times' negatives.
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size * times)

# Split train/test.
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

In [14]:
%%time
# Make the predictive model, then optimize its hyperparameters and fit it,
# this time passing the active-learning arguments to optimize().
from numpy.random import randint, uniform

from eden.model import ActiveLearningBinaryClassificationModel

model = ActiveLearningBinaryClassificationModel(
    pre_processor=pre_processor,
    estimator=estimator,
    vectorizer=vectorizer,
    n_jobs=n_jobs,
    pre_processor_n_jobs=n_jobs,
    n_blocks=8)

# Candidate values for each stage of the pipeline.
pre_processor_parameters = {
    'max_num': [1, 3],
    'shape_type': [5],
    'energy_range': [10, 30],
}

vectorizer_parameters = {'complexity': [2, 3]}

# The random candidates below are drawn without a fixed seed, so they differ
# from run to run. For 'power_t' only the lower bound (0.1) is given, so the
# upper bound is numpy's default.
estimator_parameters = {
    'n_iter': randint(5, 100, size=n_iter),
    'penalty': ['l1', 'l2', 'elasticnet'],
    'l1_ratio': uniform(0.1, 0.9, size=n_iter),
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'power_t': uniform(0.1, size=n_iter),
    'alpha': [10**x for x in range(-8, 0)],
    'eta0': [10**x for x in range(-4, -1)],
    'learning_rate': ["invscaling", "constant", "optimal"],
    'n_jobs': [n_jobs],
}

# Value passed as `size_negative` below; the model is stored under its own
# file name.
active_set_size = size * 2
model_fname = 'eden_model_active_%s' % rfam_id

# score_func combines the cross-validation mean and spread into one number:
# the mean minus twice the standard deviation.
model.optimize(
    iterable_pos_train, iterable_neg_train,
    model_name=model_fname,
    score_func=lambda avg_score, std_score: avg_score - std_score * 2,
    scoring='roc_auc',
    n_active_learning_iterations=4,
    n_iter=n_iter,
    size_positive=-1,
    size_negative=active_set_size,
    cv=5,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
        energy_range: [10, 30]
             max_num: [1, 3]
          shape_type: [5]

Vectorizer:
          complexity: [2, 3]

Estimator:
               alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
                eta0: [0.0001, 0.001, 0.01]
            l1_ratio: [ 0.85035245  0.76355556  0.25878753  0.23170464  0.75151998  0.25149817
  0.58899709  0.1049068   0.80174861  0.47458959]
       learning_rate: ['invscaling', 'constant', 'optimal']
                loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
              n_iter: [43 25 68 72 53 30 16 60 85 32]
              n_jobs: [8]
             penalty: ['l1', 'l2', 'elasticnet']
             power_t: [ 0.40704219  0.67745278  0.44481196  0.84507126  0.77947341  0.54806399
  0.21380633  0.40547385  0.25267851  0.35630662]
iteration: 1  score: -0.033 +- 0.053
iteration: 2  score: 0.790 +- 0.050


	Iteration: 2/10 (after 104.1 sec; 0:01:44.081748)
Best scor

CPU times: user 5min 42s, sys: 27.8 s, total: 6min 10s
Wall time: 8min 5s


In [15]:
%%time
# Estimate predictive performance on the held-out test iterables; the two
# values returned by estimate() are kept as `apr` and `roc`.
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.01, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.25878753312853231,
       learning_rate='optimal', loss='log', n_iter=43, n_jobs=8,
       penalty='l2', power_t=0.35630661834770705, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 150 ; Features: 1048577 with an avg of 1898 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.93      0.98      0.96       100
          1       0.96      0.86      0.91        50

avg / total       0.94      0.94      0.94       150

APR: 0.975
ROC: 0.976


CPU times: user 3.1 s, sys: 498 ms, total: 3.6 s
Wall time: 7.29 s


In [16]:
# Reload the active-learning model from disk and score the family's sequences.
from itertools import islice

from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# Consider only the first 'size' elements. The sequences are read once here,
# so the tee() copied from the training cell is dropped: its second iterator
# was never consumed and only made tee buffer every sequence read.
seqs = fasta_to_sequence(rfam_uri(rfam_id))
iterable_pos = islice(seqs, size)

predictions = model2.decision_function(iterable_pos)
# Print rank and score in ascending score order. The %-formatted single
# argument gives the same "n score" text as `print n,i` on Python 2 and is
# valid syntax on Python 3 as well.
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -0.203448388346
1 -0.0836656815195
2 -0.0345475275463
3 -0.0206075638101
4 -0.0183465310767
5 -0.0104251642282
6 -0.00422907251445
7 0.00871172449389
8 0.00919466953801
9 0.0287356467614
10 0.0362698647649
11 0.0398554636592
12 0.0429629725943
13 0.0529386792153
14 0.0531018891633
15 0.0550797386129
16 0.0583889478745
17 0.0646488493895
18 0.0692009354132
19 0.0754610875762
20 0.0820975046304
21 0.0826817539874
22 0.0846791411537
23 0.0871852673633
24 0.101224297548
25 0.106460378931
26 0.108519746634
27 0.111751341143
28 0.115006889574
29 0.116175741566
30 0.118040506067
31 0.120011959476
32 0.127217279066
33 0.130750663209
34 0.134353773832
35 0.135753344712
36 0.136575363345
37 0.14425708321
38 0.155195797289
39 0.163681710375
40 0.167604702619
41 0.173824498411
42 0.174003113951
43 0.191006790488
44 0.200090125192
45 0.2042965178
46 0.214269327658
47 0.21485736221
48 0.215548236919
49 0.222337122962
50 0.225918286436
51 0.229321666663
52 0.230470104448
53 0.230693939022
54 0.2334