In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path

In [3]:
def get_compounds(fname, size, listkey):
    """Download the compounds stored under a PubChem list key into one SDF file.

    The records are requested from the PUG REST service in pages of 50 and the
    text of every reply is written, in order, to ``fname`` (the file is
    created or truncated).

    Parameters
    ----------
    fname : str
        Path of the SDF file to write.
    size : int
        Number of compounds stored under ``listkey``; a value of 0 produces an
        empty file without contacting the service.
    listkey : str or int
        PubChem list key identifying the compound set.

    Raises
    ------
    requests.HTTPError
        If the service answers a page request with an error status.
    """
    PROLOG = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    stepsize = 50
    with open(fname, 'w') as file_handle:
        # One request per page; chunks are numbered from 1 for the progress log.
        for chunk, index_start in enumerate(range(0, size, stepsize), 1):
            # Clamp the reported end so the last page never claims more
            # compounds than the list actually holds.
            index_end = min(index_start + stepsize, size)
            print('Chunk %s) Processing compounds %s to %s (of a total of %s)' % (chunk, index_start, index_end - 1, size))
            RESTQ = (PROLOG + 'compound/listkey/' + str(listkey) +
                     '/SDF?&listkey_start=' + str(index_start) +
                     '&listkey_count=' + str(stepsize))
            reply = requests.get(RESTQ)
            # Fail loudly rather than writing an HTTP error page into the SDF file.
            reply.raise_for_status()
            file_handle.write(reply.text)
        print('compounds available in file:  %s' % fname)


def _download_assay_subset(assay_id, activity):
    """Download the compounds of one activity class of an assay; return the SDF path."""
    PROLOG = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/'
    AID = str(assay_id)
    RESTQ = PROLOG + 'assay/aid/' + AID + '/cids/JSON?cids_type=' + activity + '&list_return=listkey'
    reply = requests.get(RESTQ)
    # Surface HTTP failures here instead of as a KeyError on the JSON below.
    reply.raise_for_status()
    # Parse the reply once; it carries both the list key and the list size.
    identifier_list = reply.json()['IdentifierList']
    fname = 'data/AID' + AID + '_' + activity + '.sdf'
    # The output file is opened for writing, which fails if the directory is missing.
    target_dir = os.path.dirname(fname)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    get_compounds(fname=fname,
                  size=identifier_list['Size'],
                  listkey=identifier_list['ListKey'])
    return fname


def get_assay(assay_id):
    """Download the active and inactive compounds of a PubChem assay.

    For each activity class the PUG REST service is asked for a list key of
    the matching compound ids, and the compounds behind that key are saved
    with ``get_compounds`` to ``data/AID<assay_id>_<class>.sdf``.

    Parameters
    ----------
    assay_id : int or str
        PubChem assay identifier (AID).

    Returns
    -------
    tuple of str
        ``(active_fname, inactive_fname)``, the paths of the two SDF files.

    Raises
    ------
    requests.HTTPError
        If the service answers the list-key request with an error status.
    """
    active_fname = _download_assay_subset(assay_id, 'active')
    inactive_fname = _download_assay_subset(assay_id, 'inactive')
    return (active_fname, inactive_fname)

In [4]:
%%time

# Assay to analyse and where its SDF files live.
AID = 825
READ_FROM_FILE = True
DATA_DIR = 'data'

if not READ_FROM_FILE:
    # Fetch both activity classes from PubChem.
    active_fname, inactive_fname = get_assay(AID)
else:
    # Point at the SDF files of a previous download.
    active_fname = '%s/AID%s_active.sdf' % (DATA_DIR, AID)
    inactive_fname = '%s/AID%s_inactive.sdf' % (DATA_DIR, AID)


CPU times: user 16 µs, sys: 10 µs, total: 26 µs
Wall time: 42.9 µs


Generate conformers for data

In [5]:
%%time
# Conformer files are cached on disk: each one is generated only when missing.
# The paths are defined here so the existence check and the generator call
# always refer to the same file.
active_conf = DATA_DIR + '/conf_AID%s_active.sdf' % AID
inactive_conf = DATA_DIR + '/conf_AID%s_inactive.sdf' % AID

# NOTE(review): the second argument is presumably the output path, and
# 10 / 'rmsd' presumably the conformer count and selection criterion;
# confirm against obabel.generate_conformers.

# Active compounds
if not os.path.exists(active_conf):
    obabel.generate_conformers(active_fname, active_conf, 10, 'rmsd')
# Inactive compounds (read from the inactive SDF, not the active one)
if not os.path.exists(inactive_conf):
    obabel.generate_conformers(inactive_fname, inactive_conf, 10, 'rmsd')

CPU times: user 167 µs, sys: 81 µs, total: 248 µs
Wall time: 179 µs


Functions for training and testing the model

In [15]:
import datetime, time
def train_obabel_model(pos_fname, neg_fname, model_type = "default",
                       model_fname=None, n_iter=40, active_set_size=1000, 
                       n_active_learning_iterations=3, threshold=1, train_test_split=0.7, 
                       verbose=False):
    """Optimize and evaluate a binary classifier on two molecule files.

    Both files are converted to graph iterables, their sizes are printed,
    each class is split into a train and a test part, an
    ActiveLearningBinaryClassificationModel wrapping an SGDClassifier is
    optimized on the train parts and its performance is estimated on the
    test parts.

    Parameters
    ----------
    pos_fname, neg_fname : str
        Files holding the positive and the negative molecules.
    model_type : {"default", "3d"}
        "default" converts with ``obabel.obabel_to_eden``, "3d" with
        ``obabel.obabel_to_eden3d``.
    model_fname : str or None
        Forwarded to ``model.optimize`` as ``model_name``.
    n_iter : int
        Number of random draws for the sampled estimator parameters; also
        forwarded to ``model.optimize`` as ``n_iter``.
    active_set_size : int
        Forwarded to ``model.optimize`` as ``size_negative``.
    n_active_learning_iterations : int
        Forwarded to ``model.optimize``.
    threshold : number
        Accepted for interface compatibility; not used by this function.
    train_test_split : float
        Forwarded to ``random_bipartition_iter`` as ``relative_size``
        (presumably the fraction kept for training; confirm against eden.util).
    verbose : bool or int
        Forwarded to ``model.optimize``.

    Returns
    -------
    ActiveLearningBinaryClassificationModel
        The optimized model.

    Raises
    ------
    ValueError
        If ``model_type`` is neither "default" nor "3d".
    """
    # Reject an unknown converter up front; otherwise no iterable is ever
    # created and the failure surfaces later as an UnboundLocalError.
    if model_type not in ("default", "3d"):
        raise ValueError('unknown model_type %r: expected "default" or "3d"' % (model_type,))

    import datetime
    import time
    from itertools import tee

    from numpy.random import randint
    from numpy.random import uniform
    from sklearn.linear_model import SGDClassifier

    from eden.converter.molecule import obabel
    from eden.graph import Vectorizer
    from eden.model import ActiveLearningBinaryClassificationModel
    from eden.util import random_bipartition_iter

    # this will be passed as an argument to the model later on
    def pre_processor( data, **args):
        return data

    # Alternative, randomized vectorizer search space kept for reference:
    #vectorizer_parameters={'complexity':[1,2,3,4],
    #                       'discretization_size':randint(3, 100,size=n_iter),
    #                       'discretization_dimension':randint(3, 50, size=n_iter)}
    # NOTE(review): 'complexity' is given as a list here, while other cells of
    # this notebook pass an int to Vectorizer; confirm the constructor accepts it.
    vectorizer_parameters={'complexity':[1,2,3,4],
                           'discretization_size':3,
                           'discretization_dimension':3}
    vectorizer = Vectorizer(**vectorizer_parameters)

    estimator = SGDClassifier(class_weight='auto', shuffle=True)

    #create iterable from files
    if model_type == "default":
        iterable_pos=obabel.obabel_to_eden(pos_fname)
        iterable_neg=obabel.obabel_to_eden(neg_fname)
    else:  # "3d", guaranteed by the validation above
        iterable_pos=obabel.obabel_to_eden3d(pos_fname)
        iterable_neg=obabel.obabel_to_eden3d(neg_fname)

    # Duplicate both streams so the instances can be counted without
    # exhausting the iterables used for training.
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)

    start = time.time()
    n_pos = sum(1 for x in iterable_pos_)
    n_neg = sum(1 for x in iterable_neg_)
    elapsed = time.time() - start
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))

    # Only the positives need a second copy: it feeds vectorizer.fit below.
    iterable_pos, iterable_pos_ = tee(iterable_pos)

    vectorizer.fit(iterable_pos_)
    #split train/test
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

    #make predictive model
    model = ActiveLearningBinaryClassificationModel( pre_processor, estimator=estimator, vectorizer=vectorizer )

    #optimize hyperparameters and fit model
    pre_processor_parameters={} 

    vectorizer_parameters={'complexity':[4]}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   fit_vectorizer=True,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, n_jobs=1, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)

    #estimate predictive performance
    model.estimate( iterable_pos_test, iterable_neg_test, cv=5 )
    return model

def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)
    
    predictions= model.decision_function( iterable )
        
    return predictions

Train the model

In [16]:
#%%time
# Optimize the 3D model on the selected assay; every option is spelled out
# by keyword so the run configuration can be read at a glance.
model_fname = 'AID%s.model' % AID
model = train_obabel_model(
    pos_fname=active_fname,
    neg_fname=inactive_fname,
    model_type="3d",
    model_fname=model_fname,
    n_iter=40,
    active_set_size=500,
    n_active_learning_iterations=0,
    threshold=1,
    train_test_split=0.8,
    verbose=1,
)

# positives: 48  # negatives: 50 (8.1 sec 0:00:08.088641)

Iteration: 1/40 (at 22.3 sec; 0:00:22.265133)
Best score (roc_auc): 0.459774 (0.506716 +- 0.046941)
Instances: 78 ; Features: 1048577 with an avg of 1474 features per instance
class: 1 count:38 (0.49)	class: -1 count:40 (0.51)	

Iteration: 2/40 (at 44.8 sec; 0:00:44.764306)
Best score (roc_auc): 0.463374 (0.549192 +- 0.085819)
Instances: 78 ; Features: 1048577 with an avg of 1474 features per instance
class: 1 count:38 (0.49)	class: -1 count:40 (0.51)	

Iteration: 6/40 (at 134.5 sec; 0:02:14.454479)
Best score (roc_auc): 0.491824 (0.554147 +- 0.062323)
Instances: 78 ; Features: 1048577 with an avg of 1474 features per instance
class: 1 count:38 (0.49)	class: -1 count:40 (0.51)	

Iteration: 22/40 (at 509.6 sec; 0:08:29.587208)
Best score (roc_auc): 0.507524 (0.581995 +- 0.074471)
Instances: 78 ; Features: 1048577 with an avg of 1474 features per instance
class: 1 count:38 (0.49)	class: -1 count:40 (0.51)	


  ret = ret.dtype.type(ret / rcount)


KeyboardInterrupt: 

In [17]:
# Build the 3D graph iterables for the active and inactive compounds with k = 10.
# NOTE(review): the meaning of `k` is defined by obabel_to_eden3d and is not
# visible in this notebook; confirm before tuning it.
iterable_pos=obabel.obabel_to_eden3d(active_fname, k = 10)
iterable_neg=obabel.obabel_to_eden3d(inactive_fname, k = 10)

In [15]:
%%time
from eden.graph import Vectorizer
import numpy as np
from scipy.sparse import vstack
from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier

# 3D graphs for both classes (k = 20).
iterable_pos = obabel.obabel_to_eden3d(active_fname, k=20)
iterable_neg = obabel.obabel_to_eden3d(inactive_fname, k=20)

# The vectorizer is fitted on the actives only and then applied to the inactives.
vectorizer = Vectorizer(complexity=3,
                        discretization_size=5,
                        discretization_dimension=5)
Xp = vectorizer.fit_transform(iterable_pos, n_jobs=1)
Xn = vectorizer.transform(iterable_neg, n_jobs=1)

# Labels: +1 for every active row, -1 for every inactive row, in stacking order.
yp = [1] * Xp.shape[0]
yn = [-1] * Xn.shape[0]
y = np.asarray(yp + yn)
X = vstack((Xp, Xn), format="csr")

estimator = SGDClassifier(class_weight='auto', shuffle=True)

# 5-fold cross-validation, reported once per scoring function.
for scoring in ('accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc'):
    scores = cross_validation.cross_val_score(estimator, X, y,
                                              scoring=scoring, cv=5, n_jobs=-1)
    print('%20s: %.3f +- %.3f' % (scoring, scores.mean(), scores.std()))

            accuracy: 0.544 +- 0.191
           precision: 0.466 +- 0.240
              recall: 0.513 +- 0.343
                  f1: 0.479 +- 0.290
   average_precision: 0.655 +- 0.251
             roc_auc: 0.572 +- 0.313
CPU times: user 38.7 s, sys: 981 ms, total: 39.7 s
Wall time: 41.2 s


In [13]:
%%time
from eden.graph import Vectorizer
import numpy as np
from scipy.sparse import vstack
from sklearn import cross_validation
from sklearn.linear_model import SGDClassifier

# Graphs from the default (non-3D) converter for both classes.
iterable_pos = obabel.obabel_to_eden(active_fname)
iterable_neg = obabel.obabel_to_eden(inactive_fname)

# No fitting step here: both classes go straight through transform.
vectorizer = Vectorizer(complexity=3)
Xp = vectorizer.transform(iterable_pos, n_jobs=1)
Xn = vectorizer.transform(iterable_neg, n_jobs=1)

# Labels: +1 for every active row, -1 for every inactive row, in stacking order.
yp = [1] * Xp.shape[0]
yn = [-1] * Xn.shape[0]
y = np.asarray(yp + yn)
X = vstack((Xp, Xn), format="csr")

estimator = SGDClassifier(class_weight='auto', shuffle=True)

# 5-fold cross-validation, reported once per scoring function.
for scoring in ('accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc'):
    scores = cross_validation.cross_val_score(estimator, X, y,
                                              scoring=scoring, cv=5, n_jobs=-1)
    print('%20s: %.3f +- %.3f' % (scoring, scores.mean(), scores.std()))

            accuracy: 0.552 +- 0.141
           precision: 0.633 +- 0.319
              recall: 0.333 +- 0.247
                  f1: 0.387 +- 0.239
   average_precision: 0.614 +- 0.162
             roc_auc: 0.571 +- 0.177
CPU times: user 3.07 s, sys: 345 ms, total: 3.42 s
Wall time: 4.47 s


In [14]:
%matplotlib inline
from eden.util.display import draw_graph
from itertools import islice

# Draw the first three active molecules. A fresh iterable is built here:
# the `iterable_pos` left over from the previous cell has already been passed
# to vectorizer.transform, so if it is a one-shot iterator nothing would be
# left to draw.
for G in islice(obabel.obabel_to_eden(active_fname), 3):
    draw_graph(G, vertex_label="atom_type")