In [1]:
def rfam_uri(family_id, remote=False):
    """Return where the FASTA file of an Rfam family can be read from.

    Parameters
    ----------
    family_id : str
        Rfam accession, e.g. 'RF00005'.
    remote : bool, optional
        False (default): return the local file name '<family_id>.fa'.
        True: return the Rfam alignment-download URL (format=fastau) for
        the family.

    Returns
    -------
    str
        A file name or a URL, depending on `remote`.
    """
    # This cell used to define rfam_uri twice, so the URL variant was silently
    # shadowed by the local-file variant. Both are kept here behind one switch;
    # the default reproduces the definition that was actually in effect.
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
    return '%s.fa'%(family_id)

In [2]:
# Rfam family analysed by this notebook.
# Other accessions that were tried (swap the value below to use one):
#   'RF02275'  Hammerhead_HH9
#   'RF00871'  microRNA mir-689
rfam_id = 'RF00005'  # tRNA

In [3]:
def pre_processor( data, **args):
    """Convert `data` to graphs with eden's `rnafold_to_eden`.

    Every keyword argument is forwarded to the converter unchanged, and the
    converter's result is returned as is. The converter is imported inside
    the function, i.e. on each call.

    NOTE: the following cell defines another `pre_processor` (built on
    rnashapes); once that cell has run it replaces this definition.
    """
    from eden.converter.rna.rnafold import rnafold_to_eden as to_graphs
    return to_graphs( data, **args )

In [4]:
def pre_processor( data, **args):
    """Convert `data` to graphs with eden's `rnashapes_to_eden`.

    Every keyword argument is forwarded to the converter unchanged, and the
    converter's result is returned as is. The converter is imported inside
    the function, i.e. on each call.

    NOTE: this redefines the `pre_processor` from the previous cell (which
    is built on rnafold); this is the definition in effect afterwards.
    """
    from eden.converter.rna.rnashapes import rnashapes_to_eden as to_graphs
    return to_graphs( data, **args )

In [5]:
from eden.graph import Vectorizer
# Vectorizer built with its default arguments. It is handed to the model
# constructor (vectorizer=vectorizer) in the modelling cells further down.
vectorizer = Vectorizer()

In [6]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
# Only one estimator is handed to the model, so only that one is built.
# Previously three estimators were created in a row and the first two were
# discarded immediately by the next assignment.
# Alternatives that were tried (swap in to use one):
#   PassiveAggressiveClassifier(shuffle=True)
#   Perceptron(class_weight='auto', shuffle=True)
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [18]:
#data setup
size = 100               # how many positive sequences are kept (islice limit)
times = 2                # passed as times= to seq_to_seq; negatives are capped at size*times
train_test_split = 0.5   # relative_size given to random_bipartition_iter
n_iter = 10              # n_iter of model.optimize; also how many random candidates are drawn per estimator parameter
n_jobs = 8               # passed as n_jobs and pre_processor_n_jobs to the model
model_fname = 'eden_model_%s' % rfam_id   # passed as model_name to optimize() and later to load()

# BinaryClassificationModel

In [11]:
#create iterable from files
from itertools import islice, tee
from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# read the family once and duplicate the stream: one copy provides the
# positives, the other is fed to the shuffle modifier to produce negatives
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs, seqs_ = tee(seqs)

#consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(seqs, size)
iterable_neg = islice(seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 ),
                      size*times)

#split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos,
                                                                relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg,
                                                                relative_size=train_test_split)

In [12]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[5,10,20,30]}

vectorizer_parameters={'complexity':[2,3,4]}

estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

model.optimize(list(iterable_pos_train), list(iterable_neg_train), 
               model_name=model_fname,
               max_total_time=60*30, n_iter=n_iter, 
               cv=5, verbosity=2,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=False,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)

--------------------------------------------------------------------------------
Parameters range:
--------------------------------------------------------------------------------
Pre_processor:
{'energy_range': [5, 10, 20, 30], 'max_num': [1, 3], 'shape_type': [5]}
Vectorizer:
{'complexity': [2, 3, 4]}
Estimator:
{'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1],
 'eta0': [0.0001, 0.001, 0.01],
 'l1_ratio': array([ 0.41502333,  0.67294397,  0.82826544,  0.29312977,  0.5049181 ,
        0.22451756,  0.83223115,  0.62900713,  0.61457074,  0.54870325]),
 'learning_rate': ['invscaling', 'constant', 'optimal'],
 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
 'n_iter': array([158,  11, 161, 164, 133,  63,  98, 137,   8, 104]),
 'n_jobs': [8],
 'penalty': ['l1', 'l2', 'elasticnet'],
 'power_t': array([ 0.64864806,  0.65136359,  0.33250079,  0.36792659,  0.21434627,
        0.76963073,  0.36521054,  0.9279416 ,  0.96185464,  0.36467614])}
----------

In [13]:
%%time
#estimate predictive performance
# Print the parameter configuration currently held by the model, then
# evaluate the model on the test halves produced by the train/test split cell.
# NOTE(review): the test iterables are presumably one-shot, so re-running this
# cell likely requires re-running the data preparation cell first - confirm.
model.print_model_parameter_configuration()
model.estimate( iterable_pos_test, iterable_neg_test )

--------------------------------------------------------------------------------
Current parameters:
Pre_processor:
{'energy_range': 10, 'max_num': 3, 'shape_type': 5}
Vectorizer:
{'complexity': 2}
Estimator:
{'alpha': 0.01,
 'eta0': 0.001,
 'l1_ratio': 0.41502332701415212,
 'learning_rate': 'optimal',
 'loss': 'hinge',
 'n_iter': 63,
 'n_jobs': 8,
 'penalty': 'l2',
 'power_t': 0.76963073259320125}
--------------------------------------------------------------------------------
Classifier:
SGDClassifier(alpha=0.01, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.41502332701415212,
       learning_rate='optimal', loss='hinge', n_iter=63, n_jobs=8,
       penalty='l2', power_t=0.76963073259320125, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Instances: 150 ; Features: 1048577 with an avg of 770 features per instance
-------------------

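Models can be reloaded from disk: the next cell loads the model saved under `model_fname` into a fresh instance and uses it to score the first `size` sequences of the family.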

In [19]:
from eden.model import ActiveLearningBinaryClassificationModel
from eden.converter.fasta import fasta_to_sequence
# tee is not needed in this cell; the import is kept for any cell relying on it
from itertools import tee
from itertools import islice

# load the model stored under model_fname into a fresh instance
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

#consider only first 'size' elements
# The stream is read once here, so it is no longer duplicated with tee():
# the second tee branch was never consumed.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
iterable_pos = islice(seqs,size)

# score the sequences and list the scores in ascending order
predictions= model2.decision_function( iterable_pos )
# formatted print: same text as `print n,i` on Python 2, and valid syntax on Python 3
for n,i in enumerate(sorted(predictions)): print('%d %s' % (n, i))

0 -0.715006156577
1 -0.401033076474
2 -0.316823232301
3 -0.314670289038
4 -0.298361579631
5 -0.296886937707
6 -0.261150806501
7 -0.252257868169
8 -0.23296888078
9 -0.158138761029
10 -0.146765168869
11 -0.123809089712
12 -0.108403329096
13 -0.0713451237617
14 -0.0429361077217
15 -0.0384997178046
16 -0.0334504835875
17 -0.0266053098417
18 0.0109169716723
19 0.0668944443593
20 0.101276386078
21 0.181845795109
22 0.18279310448
23 0.199161389671
24 0.209723763269
25 0.218098642428
26 0.229454935741
27 0.242948043809
28 0.244925383801
29 0.270848385374
30 0.272241664564
31 0.281894377799
32 0.290550297705
33 0.320978760987
34 0.335565089485
35 0.346982133999
36 0.347999342071
37 0.351431270535
38 0.356974243224
39 0.381159389942
40 0.381795605754
41 0.386239455625
42 0.386695573661
43 0.40246178611
44 0.403025393753
45 0.408227969047
46 0.418859467948
47 0.429045681127
48 0.440951028494
49 0.446270961346
50 0.460442132328
51 0.461809670306
52 0.469089128317
53 0.482751124061
54 0.48888048745

# ActiveLearningBinaryClassificationModel

In [14]:
#create iterable from files
from itertools import islice, tee
from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# read the family once and duplicate the stream: one copy provides the
# positives, the other is fed to the shuffle modifier to produce negatives
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs, seqs_ = tee(seqs)

#consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(seqs, size)
iterable_neg = islice(seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 ),
                      size*times)

#split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos,
                                                                relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg,
                                                                relative_size=train_test_split)

In [15]:
%%time
#make predictive model
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
from numpy.random import randint
from numpy.random import uniform
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
active_set_size = size * 2
model_fname='eden_model_active_%s'%rfam_id
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               max_total_time=60*30, n_iter=n_iter, 
               size_positive=-1,
               size_negative=active_set_size,
               cv=5, verbosity=2,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)

--------------------------------------------------------------------------------
Parameters range:
--------------------------------------------------------------------------------
Pre_processor:
{'energy_range': [10, 30], 'max_num': [1, 3], 'shape_type': [5]}
Vectorizer:
{'complexity': [2, 3]}
Estimator:
{'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1],
 'eta0': [0.0001, 0.001, 0.01],
 'l1_ratio': array([ 0.57325222,  0.14994646,  0.18865661,  0.22117461,  0.31690621,
        0.73416894,  0.88358599,  0.69062953,  0.37124511,  0.22699437]),
 'learning_rate': ['invscaling', 'constant', 'optimal'],
 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
 'n_iter': array([41, 88, 22, 19, 14, 29, 31,  8, 88, 47]),
 'n_jobs': [8],
 'penalty': ['l1', 'l2', 'elasticnet'],
 'power_t': array([ 0.70907981,  0.74181617,  0.34499379,  0.46075986,  0.41161215,
        0.35664111,  0.31470491,  0.23543944,  0.99466505,  0.83996848])}
------------------------------

In [16]:
%%time
#estimate predictive performance
# Evaluate the model on the test halves produced by the train/test split cell.
# NOTE(review): the test iterables are presumably one-shot, so re-running this
# cell likely requires re-running the data preparation cell first - confirm.
model.estimate( iterable_pos_test, iterable_neg_test )

Classifier:
SGDClassifier(alpha=0.01, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.18865660818999466,
       learning_rate='optimal', loss='log', n_iter=41, n_jobs=8,
       penalty='l2', power_t=0.8399684770440129, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Instances: 150 ; Features: 1048577 with an avg of 1850 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.88      0.98      0.92       100
          1       0.95      0.72      0.82        50

avg / total       0.90      0.89      0.89       150

APR: 0.927
ROC: 0.946
--------------------------------------------------------------------------------
CPU times: user 3.04 s, sys: 520 ms, total: 3.56 s
Wall time: 6.78 s


(0.92739712448223721, 0.94640000000000124)

In [17]:
from eden.model import ActiveLearningBinaryClassificationModel
from eden.converter.fasta import fasta_to_sequence
# tee is not needed in this cell; the import is kept for any cell relying on it
from itertools import tee
from itertools import islice

# load the model stored under model_fname into a fresh instance
model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

#consider only first 'size' elements
# The stream is read once here, so it is no longer duplicated with tee():
# the second tee branch was never consumed.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
iterable_pos = islice(seqs,size)

# score the sequences and list the scores in ascending order
predictions= model2.decision_function( iterable_pos )
# formatted print: same text as `print n,i` on Python 2, and valid syntax on Python 3
for n,i in enumerate(sorted(predictions)): print('%d %s' % (n, i))

0 -0.215530537367
1 -0.0962730694776
2 -0.0868176104772
3 -0.0847088416301
4 -0.0811165200573
5 -0.0701668745492
6 -0.0627526575834
7 -0.0624467388183
8 -0.0526253163359
9 -0.0511361146516
10 -0.0330956035244
11 -0.0245764769398
12 -0.0241453313268
13 -0.00942721856461
14 0.0021415400522
15 0.0229518386189
16 0.0231780602368
17 0.0498481040662
18 0.0532152412279
19 0.0720855235509
20 0.0752317339837
21 0.0895195922619
22 0.0951085205687
23 0.0952916430789
24 0.0956782792364
25 0.0971117674309
26 0.0986136219829
27 0.116771673151
28 0.141658554269
29 0.142664474712
30 0.147736044759
31 0.152480956611
32 0.153398304965
33 0.160781261686
34 0.164664832603
35 0.186944611561
36 0.187872349049
37 0.188908376573
38 0.18985390672
39 0.193447404224
40 0.194613249557
41 0.206782161063
42 0.209136500806
43 0.212611598244
44 0.218609036143
45 0.225545083188
46 0.227515891228
47 0.239451987031
48 0.245977514189
49 0.248713380456
50 0.251633845136
51 0.251846507801
52 0.253071299973
53 0.25312699036