In [1]:
def rfam_uri(family_id, remote=False):
    """Return the location of the FASTA alignment for an Rfam family.

    This function used to be defined twice in the same cell, so the second
    definition silently shadowed the first. Both variants are now reachable
    through one function; the default reproduces the definition that was
    actually in effect (the local file name).

    Parameters
    ----------
    family_id : str
        Rfam accession, e.g. 'RF00005'.
    remote : bool, optional
        False (default): return the local file name '<family_id>.fa'.
        True: return the rfam.xfam.org alignment download URL.

    Returns
    -------
    str
        File name or URL for the requested family.
    """
    if remote:
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
    return '%s.fa'%(family_id)

In [2]:
# Rfam family modelled by this notebook. Exactly one assignment is live;
# the other families that were tried are kept as commented alternatives
# instead of being assigned and immediately overwritten.
# rfam_id = 'RF02275' #Hammerhead_HH9
# rfam_id = 'RF00871' #microRNA mir-689
rfam_id = 'RF00005' #tRNA

In [3]:
def pre_processor( data, **args):
    """Pass `data` through `rnafold_to_eden` and return its result.

    Every keyword argument is forwarded unchanged to the converter. The
    converter is imported when the function is called, not when it is
    defined.

    Note: a later cell defines `pre_processor` again; whichever definition
    was executed last is the one in effect.
    """
    from eden.converter.rna.rnafold import rnafold_to_eden
    return rnafold_to_eden( data, **args )

In [4]:
def pre_processor( data, **args):
    """Pass `data` through `rnashapes_to_eden` and return its result.

    Every keyword argument is forwarded unchanged to the converter. The
    converter is imported when the function is called, not when it is
    defined.

    Note: an earlier cell also defines `pre_processor`; running this cell
    replaces that definition.
    """
    from eden.converter.rna.rnashapes import rnashapes_to_eden
    return rnashapes_to_eden( data, **args )

In [5]:
# Vectorizer instance handed to the model below. It is created with its
# default arguments; the `complexity` values to try are supplied later
# through `vectorizer_parameters`.
from eden.graph import Vectorizer
vectorizer = Vectorizer()

In [6]:
from sklearn.linear_model import SGDClassifier, Perceptron, PassiveAggressiveClassifier
# Base estimator handed to the model. Only one assignment is live; the
# alternatives are kept commented out rather than being constructed and
# immediately discarded. The hyper-parameters listed in
# `estimator_parameters` further down (penalty, l1_ratio, power_t, ...)
# are SGDClassifier options, so SGDClassifier is the one that stays active.
# estimator = PassiveAggressiveClassifier(shuffle=True)
# estimator = Perceptron(class_weight='auto', shuffle=True)
estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

In [7]:
#data setup: constants shared by the cells below
# name passed to optimize(model_name=...) and later to load()
model_fname='eden_model_%s'%rfam_id
# number of positive sequences kept by islice; negatives are capped at size*times
size=100
# value passed as relative_size to random_bipartition_iter for the train/test split
train_test_split=0.5
# passed to optimize(n_iter=...) and used as the number of random draws for the
# sampled estimator parameters
n_iter=10
# passed as `times` to seq_to_seq when the negative sequences are built
times=2
# passed as n_jobs and pre_processor_n_jobs to the model and listed in the
# estimator parameters
n_jobs=8

# BinaryClassificationModel

In [8]:
# Build the positive/negative train and test iterables from the family file.
from itertools import islice, tee
from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# read the sequences and duplicate the stream: one copy is used as the
# positives, the other is fed to the shuffle modifier to obtain the negatives
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

# consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size*times)

# split each class into a train and a test part
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

In [9]:
# Configure EDeN logging for the cells that follow and keep the returned
# logger in `logger`.
# NOTE(review): what verbosity=2 selects is defined by
# eden.util.configure_logging — confirm the intended level.
from eden.util import configure_logging
logger=configure_logging(verbosity=2)

In [10]:
%%time
#make predictive model from the pre_processor, estimator and vectorizer
#defined in the cells above
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=5)

#optimize hyperparameters and fit model
#Each *_parameters dict maps a parameter name to the list of candidate values.
#NOTE(review): randint/uniform draw from numpy's global RNG and no seed is set
#in the visible cells, so the sampled candidates differ between runs.
from numpy.random import randint
from numpy.random import uniform
#candidate keyword arguments for pre_processor
#(presumably forwarded to rnashapes_to_eden — confirm against eden.model)
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[5,10,20,30]}

vectorizer_parameters={'complexity':[2,3,4]}

#n_iter, l1_ratio and power_t are sampled (n_iter draws each); the remaining
#entries are fixed candidate lists.
#uniform(0.1, size=n_iter) passes only `low`, so numpy's default upper bound
#(1.0) applies.
estimator_parameters={'n_iter':randint(5, 200, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}

#score_func ranks a configuration by its mean score minus two standard
#deviations, so configurations with a high variance are penalised.
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               max_total_time=-1, n_iter=n_iter, 
               cv=5,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               two_steps_optimization=True,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
     shape_type: [5]
        max_num: [1, 3]
   energy_range: [5, 10, 20, 30]

Vectorizer:
     complexity: [2, 3, 4]

Estimator:
        penalty: ['l1', 'l2', 'elasticnet']
         n_iter: [124  14 115  62 104 179 117  80 128  31]
          alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
        power_t: [ 0.2148822   0.45068192  0.48977187  0.52342338  0.29263398  0.99110286
  0.79608719  0.86095175  0.53375918  0.61563526]
           eta0: [0.0001, 0.001, 0.01]
         n_jobs: [8]
           loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
       l1_ratio: [ 0.65378507  0.84483412  0.17692865  0.23865253  0.57768276  0.11135616
  0.45145909  0.23278448  0.48913473  0.49545365]
  learning_rate: ['invscaling', 'constant', 'optimal']


	Iteration: 2/10 (after 27.8 sec; 0:00:27.839309)
Best score (roc_auc): 0.174287 (0.357000 +- 0.091356)

Data:
Instances: 150 ; Features: 1048577 with an avg of 1234 features per

CPU times: user 34.5 s, sys: 6.93 s, total: 41.4 s
Wall time: 2min 7s


In [11]:
%%time
#estimate predictive performance
print model.get_parameters()
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


	Model parameters:

Pre_processor:
     shape_type: 5
        max_num: 3
   energy_range: 20

Vectorizer:
     complexity: 3

Estimator:
         n_iter: 128
         n_jobs: 8
           eta0: 0.0001
           loss: squared_hinge
       l1_ratio: 0.577682755184
  learning_rate: invscaling
        penalty: elasticnet
        power_t: 0.991102858051
          alpha: 0.0001



Classifier:
SGDClassifier(alpha=0.0001, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.0001, fit_intercept=True, l1_ratio=0.57768275518396928,
       learning_rate='invscaling', loss='squared_hinge', n_iter=128,
       n_jobs=8, penalty='elasticnet', power_t=0.99110285805058729,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 150 ; Features: 1048577 with an avg of 2264 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.86      0.30      0.44       100
          1       0.39      0.90      0.55        50

avg / total       0.70      0.50      0.48       150

APR: 0.748
ROC: 0.774


CPU times: user 3.93 s, sys: 831 ms, total: 4.76 s
Wall time: 11.9 s


Models can be reloaded from disk by passing the name used as `model_name` in `optimize` to `load`.

In [12]:
# Reload the model stored under `model_fname` and score the family sequences.
from itertools import islice
from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# consider only the first 'size' sequences. The stream is consumed exactly
# once here, so it is not duplicated with tee(): an unused tee branch would
# only buffer every item that the other branch consumes.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
iterable_pos = islice(seqs, size)

predictions = model2.decision_function( iterable_pos )
# list the scores in ascending order as "<rank> <score>"; this print form
# produces the same text under Python 2 and Python 3
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -5.83468627762e-06
1 -4.98013167731e-06
2 -4.61913929024e-06
3 -4.05459842409e-06
4 -3.69512190676e-06
5 -2.66475484109e-06
6 4.08015885479e-08
7 1.23236734187e-06
8 1.42167154926e-06
9 2.01977359652e-06
10 2.53086723547e-06
11 3.0970813885e-06
12 3.20999992537e-06
13 3.31033595884e-06
14 4.11809518558e-06
15 4.52028687316e-06
16 4.63143829198e-06
17 4.70356764578e-06
18 7.20241373306e-06
19 7.499942102e-06
20 8.58597356432e-06
21 9.26625395585e-06
22 9.26881846715e-06
23 1.06945987492e-05
24 1.14545099249e-05
25 1.15234390581e-05
26 1.2147142101e-05
27 1.25500298485e-05
28 1.41034571086e-05
29 1.41119361387e-05
30 1.4374897736e-05
31 1.51396478637e-05
32 1.60763493244e-05
33 1.62611561433e-05
34 1.66471069591e-05
35 1.71773714968e-05
36 1.72689770359e-05
37 1.79985610316e-05
38 1.80629204929e-05
39 1.86523135599e-05
40 1.89665083036e-05
41 1.92884616254e-05
42 1.99471500905e-05
43 2.0381853073e-05
44 2.06013990392e-05
45 2.08296471252e-05
46 2.10205701757e-05
47 2.13406424206e-05
48

# ActiveLearningBinaryClassificationModel

In [19]:
# Rebuild the positive/negative train and test iterables from the family file
# (the iterables created for the previous section have already been consumed).
from itertools import islice, tee
from eden.converter.fasta import fasta_to_sequence
from eden.modifier.seq import seq_to_seq, shuffle_modifier
from eden.util import random_bipartition_iter

# read the sequences and duplicate the stream: one copy is used as the
# positives, the other is fed to the shuffle modifier to obtain the negatives
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
seqs, seqs_ = tee(seqs)
iterable_pos = seqs
iterable_neg = seq_to_seq( seqs_, modifier=shuffle_modifier, times=times, order=2 )

# consider only the first 'size' positives and the first 'size*times' negatives
iterable_pos = islice(iterable_pos, size)
iterable_neg = islice(iterable_neg, size*times)

# split each class into a train and a test part
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

In [15]:
%%time
#make predictive model from the pre_processor, estimator and vectorizer
#defined in the cells above
from eden.model import ActiveLearningBinaryClassificationModel
model = ActiveLearningBinaryClassificationModel(pre_processor=pre_processor, 
                                                estimator=estimator, 
                                                vectorizer=vectorizer,
                                                n_jobs=n_jobs,
                                                pre_processor_n_jobs=n_jobs,
                                                n_blocks=8)

#optimize hyperparameters and fit model
#Each *_parameters dict maps a parameter name to the list of candidate values.
#NOTE(review): randint/uniform draw from numpy's global RNG and no seed is set
#in the visible cells, so the sampled candidates differ between runs.
from numpy.random import randint
from numpy.random import uniform
#candidate keyword arguments for pre_processor
#(presumably forwarded to rnashapes_to_eden — confirm against eden.model)
pre_processor_parameters={'max_num':[1,3], 
                          'shape_type':[5], 
                          'energy_range':[10,30]}

vectorizer_parameters={'complexity':[2,3]}

#n_iter, l1_ratio and power_t are sampled (n_iter draws each); the remaining
#entries are fixed candidate lists.
#uniform(0.1, size=n_iter) passes only `low`, so numpy's default upper bound
#(1.0) applies.
estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                      'penalty':['l1','l2','elasticnet'],
                      'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                      'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                      'power_t':uniform(0.1, size=n_iter),
                      'alpha': [10**x for x in range(-8,0)],
                      'eta0': [10**x for x in range(-4,-1)],
                      'learning_rate': ["invscaling", "constant", "optimal"],
                      'n_jobs':[n_jobs]}
#value passed as size_negative below
#NOTE(review): the exact meaning of size_positive=-1, size_negative and
#n_active_learning_iterations is defined by eden.model — confirm there.
active_set_size = size * 2
#rebinds model_fname: cells executed after this one that call load(model_fname)
#will use this name
model_fname='eden_model_active_%s'%rfam_id
#score_func ranks a configuration by its mean score minus two standard
#deviations, so configurations with a high variance are penalised.
model.optimize(iterable_pos_train, iterable_neg_train, 
               model_name=model_fname,
               score_func=lambda avg_score,std_score : avg_score - std_score * 2,
               scoring='roc_auc',
               n_active_learning_iterations=4,
               n_iter=n_iter, 
               size_positive=-1,
               size_negative=active_set_size,
               cv=5,
               pre_processor_parameters=pre_processor_parameters, 
               vectorizer_parameters=vectorizer_parameters, 
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
     shape_type: [5]
        max_num: [1, 3]
   energy_range: [10, 30]

Vectorizer:
     complexity: [2, 3]

Estimator:
        penalty: ['l1', 'l2', 'elasticnet']
         n_iter: [42 32 52 63  8 41 37 69 57 43]
          alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1]
        power_t: [ 0.58494892  0.46653773  0.46656162  0.75420599  0.71085947  0.18584645
  0.27899915  0.49955717  0.27634826  0.16169929]
           eta0: [0.0001, 0.001, 0.01]
         n_jobs: [8]
           loss: ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
       l1_ratio: [ 0.25844711  0.20981813  0.22657921  0.68430392  0.37136793  0.70560072
  0.4467224   0.37116012  0.67290475  0.81222286]
  learning_rate: ['invscaling', 'constant', 'optimal']


	Iteration: 2/10 (after 103.5 sec; 0:01:43.525448)
Best score (roc_auc): 0.781594 (0.889000 +- 0.053703)

Data:
Instances: 150 ; Features: 1048577 with an avg of 541 features per instance
class: 1 c

CPU times: user 6min 50s, sys: 32.6 s, total: 7min 22s
Wall time: 9min 48s


In [20]:
%%time
#estimate predictive performance on the test iterables; `estimate` returns two
#values, unpacked here as (apr, roc)
apr, roc = model.estimate( iterable_pos_test, iterable_neg_test )


Classifier:
SGDClassifier(alpha=0.01, average=True, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.2265792148714689,
       learning_rate='optimal', loss='log', n_iter=42, n_jobs=8,
       penalty='l2', power_t=0.16169928930773869, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 150 ; Features: 1048577 with an avg of 1898 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.93      0.98      0.96       100
          1       0.96      0.86      0.91        50

avg / total       0.94      0.94      0.94       150

APR: 0.974
ROC: 0.976


CPU times: user 3.45 s, sys: 455 ms, total: 3.91 s
Wall time: 7.63 s


In [17]:
# Reload the model stored under `model_fname` and score the family sequences.
from itertools import islice
from eden.converter.fasta import fasta_to_sequence
from eden.model import ActiveLearningBinaryClassificationModel

model2 = ActiveLearningBinaryClassificationModel()
model2.load(model_fname)

# consider only the first 'size' sequences. The stream is consumed exactly
# once here, so it is not duplicated with tee(): an unused tee branch would
# only buffer every item that the other branch consumes.
seqs = fasta_to_sequence( rfam_uri( rfam_id ) )
iterable_pos = islice(seqs, size)

predictions = model2.decision_function( iterable_pos )
# list the scores in ascending order as "<rank> <score>"; this print form
# produces the same text under Python 2 and Python 3
for n, i in enumerate(sorted(predictions)):
    print('%s %s' % (n, i))

0 -0.203415261818
1 -0.0851263535726
2 -0.0359680532621
3 -0.0209168698668
4 -0.0187466174435
5 -0.0130460687376
6 -0.00447992080783
7 0.00835004703737
8 0.0106497564069
9 0.0254532118723
10 0.0353704812393
11 0.0383372834838
12 0.0453487184191
13 0.0519307531102
14 0.0522969077665
15 0.0552605661329
16 0.0580280009683
17 0.0658718099461
18 0.0675390705535
19 0.0708858251858
20 0.0721879589302
21 0.0818043718717
22 0.0826663225459
23 0.0852497149507
24 0.101991907586
25 0.103195724132
26 0.103575054544
27 0.107432750776
28 0.113094903365
29 0.113787642517
30 0.115659410748
31 0.119298803151
32 0.124779514215
33 0.132444733366
34 0.133064094696
35 0.140009202617
36 0.141374334264
37 0.15101496649
38 0.153537645735
39 0.162175224726
40 0.164563588764
41 0.169474184631
42 0.175106618298
43 0.18925975236
44 0.203188704885
45 0.206969034944
46 0.208162863111
47 0.215615611989
48 0.217277388088
49 0.222632558478
50 0.223991618412
51 0.224039394303
52 0.22541400263
53 0.229488580666
54 0.2393