In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from eden.converter.molecule import obabel
import logging
logging.basicConfig(filename="example.log")

In [17]:
import requests

def get_compounds(fname, size, listkey):
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    with open(fname,'w') as file_handle:
        stepsize=50
        index_start=0
        for chunk, index_end in enumerate(range(0,size+stepsize,stepsize)):
            if index_end is not 0 :
                print 'Chunk %s) Processing compounds %s to %s (of a total of %s)' % (chunk, index_start, index_end-1, size)
                RESTQ = PROLOG + 'compound/listkey/' + str(listkey) + '/SDF?&listkey_start=' + str(index_start) + '&listkey_count=' + str(stepsize)
                reply=requests.get(RESTQ)
                file_handle.write(reply.text)
            index_start = index_end
        print 'compounds available in file: ', fname


def get_assay(assay_id):
    PROLOG='https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    AID=str(assay_id)
    #active
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=active&list_return=listkey'
    reply=requests.get(RESTQ)
    #extract the listkey
    active_listkey = reply.json()['IdentifierList']['ListKey']
    active_size = reply.json()['IdentifierList']['Size'] 
    active_fname = 'data/AID'+AID+'_active.sdf'
    get_compounds(fname=active_fname, size=active_size, listkey=active_listkey)

    #inactive
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=inactive&list_return=listkey'
    reply=requests.get(RESTQ)
    #extract the listkey
    inactive_listkey = reply.json()['IdentifierList']['ListKey']
    inactive_size = reply.json()['IdentifierList']['Size']
    inactive_fname = 'data/AID'+AID+'_inactive.sdf'
    get_compounds(fname=inactive_fname, size=inactive_size, listkey=inactive_listkey)

    return (active_fname,inactive_fname)

In [18]:
import datetime, time
def train_obabel_model(pos_fname, neg_fname, model_fname=None, n_iter=40, active_set_size=1000, n_active_learning_iterations=3, threshold=1, train_test_split=0.7, verbose=False):
    
    
    def pre_processor( data, **args):
        return data
    
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True)

    #create iterable from files
    from eden.converter.molecule import obabel
    iterable_pos=obabel.obabel_to_eden(pos_fname)
    iterable_neg=obabel.obabel_to_eden(neg_fname)
    
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    
    import time
    start = time.time()
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
    
    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    # model = ActiveLearningBinaryClassificationModel( pre_processor, estimator=estimator, vectorizer=vectorizer )
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                estimator=estimator,
                                                vectorizer=vectorizer,
                                                n_jobs = 2,
                                                n_blocks = 10,
                                                fit_vectorizer=True)
 
    from numpy.random import randint
    from numpy.random import uniform

    pre_processor_parameters={'model_type':'default'} 
    
    # The training time for this model is much smaller, so we can use various iterations of the
    # vectorizer
    vectorizer_parameters={'complexity':[2,3,4,5,6]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)
    
    #estimate predictive performance
    model.estimate( iterable_pos_test, iterable_neg_test)
    return model
    
    
def test_obabel_model(fname, model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    iterable=obabel.obabel_to_eden(fname)
    
    predictions= model.decision_function( iterable )
        
    return predictions

In [19]:
AID=720577
#AID=2801

In [20]:
%%time

#READ_FROM_FILE=False
READ_FROM_FILE=False

if READ_FROM_FILE:
    active_fname='data/AID%s_active.sdf'%AID
    inactive_fname='data/AID%s_inactive.sdf'%AID
else:
    active_fname, inactive_fname = get_assay(AID)

Chunk 1) Processing compounds 0 to 49 (of a total of 80)
Chunk 2) Processing compounds 50 to 99 (of a total of 80)
compounds available in file:  data/AID720577_active.sdf
Chunk 1) Processing compounds 0 to 49 (of a total of 146)
Chunk 2) Processing compounds 50 to 99 (of a total of 146)
Chunk 3) Processing compounds 100 to 149 (of a total of 146)
compounds available in file:  data/AID720577_inactive.sdf
CPU times: user 230 ms, sys: 17 ms, total: 247 ms
Wall time: 35.2 s


In [14]:
%%time

model_fname='models/AID%s.default_model'%AID
fitted_model = train_obabel_model(active_fname, inactive_fname, model_fname=model_fname, 
                           n_iter=20, 
                           active_set_size=0, 
                           n_active_learning_iterations=0, 
                           threshold=1, 
                           train_test_split=0.7, 
                           verbose=1)

INFO:root.eden.model:

	Iteration: 1/20 (after 3.8 sec; 0:00:03.777023)
Best score (roc_auc): 0.423 (0.574 +- 0.151)

Data:
Instances: 158 ; Features: 1048577 with an avg of 160 features per instance
class: 1 count:56 (0.35)	class: -1 count:102 (0.65)	

	Model parameters:

Pre_processor:
     model_type: d

Vectorizer:
     complexity: 2

Estimator:
         n_iter: 44
           eta0: 0.001
           loss: modified_huber
       l1_ratio: 0.38216035311
  learning_rate: optimal
        penalty: l1
        power_t: 0.342280171631
          alpha: 0.0001
INFO:root.eden.model:

	Iteration: 3/20 (after 15.4 sec; 0:00:15.407195)
Best score (roc_auc): 0.458 (0.505 +- 0.047)

Data:
Instances: 158 ; Features: 1048577 with an avg of 600 features per instance
class: 1 count:56 (0.35)	class: -1 count:102 (0.65)	

	Model parameters:

Pre_processor:
     model_type: a

Vectorizer:
     complexity: 4

Estimator:
         n_iter: 35
           eta0: 0.0001
           loss: modified_huber
       l1_ra

# positives: 80  # negatives: 146 (0.5 sec 0:00:00.506782)
CPU times: user 2min 14s, sys: 531 ms, total: 2min 14s
Wall time: 2min 15s


In [15]:
print fitted_model.get_parameters()


	Model parameters:

Pre_processor:
     model_type: a

Vectorizer:
     complexity: 3

Estimator:
         n_iter: 35
           eta0: 0.0001
           loss: modified_huber
       l1_ratio: 0.2519667605
  learning_rate: invscaling
        penalty: l1
        power_t: 0.786804743427
          alpha: 0.0001


In [16]:
from eden.converter.molecule import obabel
graphs=obabel.obabel_to_eden(active_fname,file_type = 'sdf')
from itertools import islice
graphs = islice(graphs, 3)
from eden.util.display import draw_graph
for graph in graphs:  draw_graph(graph, size=12, node_size=400, node_border=1, vertex_label='hlabel')

ImportError: ('requires pygraphviz ', 'http://networkx.lanl.gov/pygraphviz ', '(not available for Python3)')