In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path

In [26]:
%%time
# Identifier of the dataset to work on; it selects the AID<id>_active.sdf /
# AID<id>_inactive.sdf pair below (a later cell switches to 2401).
AID=1
# Directory holding the SDF input files and receiving the saved models.
# Override it per machine with the EDEN_DATA_DIR environment variable
# (e.g. EDEN_DATA_DIR=/Volumes/seagate/thesis/examples/data) instead of
# editing this cell; the fallback is the original development location.
DATA_DIR = os.environ.get('EDEN_DATA_DIR',
                          '/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data')
active_fname=DATA_DIR + '/AID%s_active.sdf'%AID
inactive_fname=DATA_DIR + '/AID%s_inactive.sdf'%AID

CPU times: user 12 µs, sys: 1 µs, total: 13 µs
Wall time: 20 µs


In [27]:
# Sanity check: display the resolved path of the actives file.
active_fname

'/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data/AID1_active.sdf'

In [28]:
def make_iterable(filename, file_format):
    """Lazily yield the records of a molecule file, one string per record.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    file_format : str
        'sdf' -- every yielded string is one multi-line record, including
        its terminating '$$$$' line.
        'smi' -- every line of the file is yielded unchanged.

    Yields
    ------
    str
        The raw text of one record (line endings preserved).

    Raises
    ------
    ValueError
        When iteration starts, if ``file_format`` is neither 'sdf' nor 'smi'
        (previously this silently produced an empty iterator).
    """
    if file_format == 'sdf':
        with open(filename) as f:
            # Collect lines in a list and join once per record instead of
            # growing a string line by line.
            record_lines = []
            for line in f:
                record_lines.append(line)
                if line.strip() == '$$$$':
                    yield ''.join(record_lines)
                    record_lines = []
            # A file whose last record lacks the closing '$$$$' used to lose
            # that record; emit it unless only whitespace is left over.
            trailing = ''.join(record_lines)
            if trailing.strip():
                yield trailing
    elif file_format == 'smi':
        with open(filename) as f:
            for line in f:
                yield line
    else:
        raise ValueError("unsupported file_format %r (expected 'sdf' or 'smi')"
                         % (file_format,))

Functions for training and testing the model

In [41]:
import datetime, time
def train_obabel_model(iterable_pos, iterable_neg, pre_processor_parameters, data_dir,
                       model_type = "default",
                       model_fname=None, n_iter=40, active_set_size=1000,
                       n_active_learning_iterations=3, threshold=1, train_test_split=0.7,
                       verbose=False):
    """Optimize, fit and evaluate an EDeN binary classifier on molecule records.

    The positive and negative records are counted, randomly split into a
    train and a test part, handed to
    ``ActiveLearningBinaryClassificationModel.optimize`` together with the
    parameter ranges built here, and finally the model is evaluated on the
    test part with ``model.estimate``.

    Parameters
    ----------
    iterable_pos, iterable_neg : iterable
        Positive / negative molecule records (e.g. from ``make_iterable``).
    pre_processor_parameters : dict
        Parameter ranges passed to ``model.optimize`` as
        ``pre_processor_parameters``. Presumably the chosen values reach
        ``pre_processor`` as keyword arguments -- confirm against EDeN.
    data_dir : str
        Currently unused; kept for interface compatibility.
    model_type : str
        'default' converts with ``obabel.obabel_to_eden``, '3d' with
        ``obabel.obabel_to_eden3d``. Used whenever the pre-processor is
        called without an explicit ``model_type`` keyword (before, this
        argument was ignored and '3d' was always the fallback).
    model_fname : str or None
        Passed to ``model.optimize`` as ``model_name``.
    n_iter : int
        Passed to ``model.optimize`` and used as the number of random draws
        for the sampled parameter ranges.
    active_set_size : int
        Passed to ``model.optimize`` as ``size_negative``.
    n_active_learning_iterations : int
        Passed through to ``model.optimize``.
    threshold : int
        Currently unused; kept for interface compatibility.
    train_test_split : float
        ``relative_size`` given to ``random_bipartition_iter`` for both classes.
    verbose : bool or int
        Passed through to ``model.optimize``.

    Returns
    -------
    ActiveLearningBinaryClassificationModel
        The model after ``optimize`` and ``estimate`` have been called.

    Raises
    ------
    ValueError
        From the pre-processor, if it ends up with a ``model_type`` other
        than 'default' or '3d'.
    """
    # The function already relied on local imports; datetime is imported here
    # as well so the function does not depend on the cell-level import.
    import datetime
    import time
    from itertools import tee

    from numpy.random import randint
    from numpy.random import uniform

    # Shared by every call of pre_processor in '3d' mode.
    global_cache = {}

    # This will be passed as an argument to the model later on. The default of
    # the keyword is bound to the caller's model_type, so the outer argument
    # takes effect unless pre_processor_parameters overrides it.
    def pre_processor(data, model_type=model_type, **kwargs):
        if model_type == "default":
            return obabel.obabel_to_eden(data, **kwargs)
        if model_type == "3d":
            return obabel.obabel_to_eden3d(data, cache=global_cache, **kwargs)
        # Previously this fell through to an unbound local variable.
        raise ValueError("unknown model_type %r (expected 'default' or '3d')"
                         % (model_type,))

    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True)

    # Count the instances on a tee'd copy so the originals can still be
    # consumed afterwards. Note that tee buffers every record that the
    # counting copy has seen until the main copy consumes it.
    iterable_pos, pos_counter = tee(iterable_pos)
    iterable_neg, neg_counter = tee(iterable_neg)

    start = time.time()
    n_pos = sum(1 for _ in pos_counter)
    n_neg = sum(1 for _ in neg_counter)
    elapsed = time.time() - start
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))

    # (A second, never-consumed pair of tee copies used to be created here;
    # it only kept an extra buffer alive and has been removed.)

    # split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

    # make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel(pre_processor,
                                                    estimator=estimator,
                                                    vectorizer=vectorizer,
                                                    n_jobs=2,
                                                    n_blocks = 10,
                                                    fit_vectorizer=True)

    # optimize hyperparameters and fit model

    # NOTE(review): numpy's randint excludes the upper bound, so randint(2, 3)
    # always draws 2 -- both discretization ranges are effectively constant.
    # Confirm whether a wider range was intended.
    vectorizer_parameters={'complexity':[2,3,4],
                           'discretization_size':randint(2, 3,size=n_iter),
                           'discretization_dimension':randint(2, 3,size=n_iter)}

    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter),
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    # Single-argument call form: valid as a statement in Python 2 and as a
    # function call in Python 3.
    print("calling optimizer..")
    model.optimize(iterable_pos_train, iterable_neg_train,
                   model_name=model_fname,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters,
                   vectorizer_parameters=vectorizer_parameters,
                   estimator_parameters=estimator_parameters)

    # estimate predictive performance
    # (estimate has no cv parameter, hence no cv=5 here)
    model.estimate( iterable_pos_test, iterable_neg_test )

    return model

def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)

    predictions= model.decision_function( iterable )

    return predictions

Train the models

**3D model - no extra conformers** 
---

In [37]:
%%time
from numpy.random import randint
from numpy.random import uniform

# Fresh generators over the active / inactive SDF records of the current AID.
pos_iterator=make_iterable(active_fname, 'sdf')
neg_iterator=make_iterable(inactive_fname, 'sdf')

# NOTE(review): the "with conformers" cell further down uses this very same
# file name -- presumably its model replaces the one saved here; confirm.
model_fname=DATA_DIR + '/AID%s.model3d'%AID

# Number of random draws per sampled parameter; the same value is handed to
# the training call so both always agree.
n_iter = 5
# 'n_conf': [0] is the "no extra conformers" setting of this section.
pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(1, 10, size=n_iter),
                          'model_type':['3d'],
                          'n_conf':[0]}

model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type = "3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=1)

# positives: 11  # negatives: 6 (0.0 sec 0:00:00.002826)
calling optimizer..

Iteration: 1/5 (at 4.5 sec; 0:00:04.500102)
Best score (roc_auc): 1.000000 (1.000000 +- 0.000000)
Instances: 12 ; Features: 1048577 with an avg of 1373 features per instance
class: 1 count:8 (0.67)	class: -1 count:4 (0.33)	
Classifier:
SGDClassifier(alpha=0.0001, average=False, class_weight='auto', epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.77194894921781543,
       learning_rate='optimal', loss='modified_huber', n_iter=20, n_jobs=1,
       penalty='l1', power_t=0.61440120155300504, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Instances: 5 ; Features: 1048577 with an avg of 1697 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       1.00      1.00      1.

**3D model - with conformers** 
---

In [38]:
%%time
from numpy.random import randint
from numpy.random import uniform

# Fresh generators over the active / inactive SDF records of the current AID.
pos_iterator=make_iterable(active_fname, 'sdf')
neg_iterator=make_iterable(inactive_fname, 'sdf')

# NOTE(review): same file name as in the "no extra conformers" cell --
# presumably the model saved there is replaced by this run; confirm.
model_fname=DATA_DIR + '/AID%s.model3d'%AID

# Number of random draws per sampled parameter; the same value is handed to
# the training call so both always agree.
n_iter = 5
# 'n_conf': [10] is the "with conformers" setting of this section.
pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(1, 10, size=n_iter),
                          'model_type':['3d'],
                          'n_conf':[10]}

model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type = "3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=1)

# positives: 11  # negatives: 6 (0.0 sec 0:00:00.002152)
calling optimizer..

Iteration: 1/5 (at 56.9 sec; 0:00:56.857138)
Best score (roc_auc): 1.000000 (1.000000 +- 0.000000)
Instances: 120 ; Features: 1048577 with an avg of 1562 features per instance
class: 1 count:80 (0.67)	class: -1 count:40 (0.33)	
Classifier:
SGDClassifier(alpha=0.0001, average=False, class_weight='auto', epsilon=0.1,
       eta0=0.001, fit_intercept=True, l1_ratio=0.33077232668134537,
       learning_rate='optimal', loss='modified_huber', n_iter=52, n_jobs=1,
       penalty='l1', power_t=0.90337874315562983, random_state=None,
       shuffle=True, verbose=0, warm_start=False)
--------------------------------------------------------------------------------
Instances: 50 ; Features: 1048577 with an avg of 1869 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       1.00      1.00   

In [39]:
# Switch to dataset 2401 and rebuild both input paths for it.
AID = 2401
active_fname = ''.join((DATA_DIR, '/AID%s_active.sdf' % AID))
inactive_fname = ''.join((DATA_DIR, '/AID%s_inactive.sdf' % AID))

In [43]:
from numpy.random import randint
from numpy.random import uniform

# Fresh generators over the active / inactive SDF records of the current AID
# (2401 once the preceding cell has been run).
pos_iterator=make_iterable(active_fname, 'sdf')
neg_iterator=make_iterable(inactive_fname, 'sdf')

model_fname=DATA_DIR + '/AID%s.model3d'%AID

# Number of random draws per sampled parameter; the same value is handed to
# the training call so both always agree.
n_iter = 5
pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                          'threshold':randint(1, 10, size=n_iter),
                          'model_type':['3d'],
                          'n_conf':[10]}

# verbose=2 additionally prints the parameter ranges before optimizing
# (see the captured output of this cell).
model = train_obabel_model(pos_iterator, neg_iterator, pre_processor_parameters,
                           data_dir=DATA_DIR,
                           model_type = "3d",
                           model_fname=model_fname,
                           n_iter=n_iter,
                           active_set_size=5,
                           n_active_learning_iterations=0,
                           threshold=1,
                           train_test_split=0.8,
                           verbose=2)

# positives: 530  # negatives: 1705 (0.6 sec 0:00:00.624288)
calling optimizer..
--------------------------------------------------------------------------------
Parameters range:
--------------------------------------------------------------------------------
Pre_processor:
{'k': array([6, 1, 7, 6, 3]),
 'model_type': ['3d'],
 'n_conf': [10],
 'threshold': array([2, 9, 4, 2, 2])}
Vectorizer:
{'complexity': [2, 3, 4],
 'discretization_dimension': array([2, 2, 2, 2, 2]),
 'discretization_size': array([2, 2, 2, 2, 2])}
Estimator:
{'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001],
 'eta0': [0.0001, 0.001, 0.01],
 'l1_ratio': array([ 0.70673729,  0.76168606,  0.50149884,  0.65328373,  0.15137272]),
 'learning_rate': ['invscaling', 'constant', 'optimal'],
 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
 'n_iter': array([37, 71, 28, 53, 58]),
 'penalty': ['l1', 'l2', 'elasticnet'],
 'power_t': array([ 0.93015614,  0.64639291,  0.13526024,  0.24623256,  0.89188

KeyboardInterrupt: 

Test the models:

In [None]:
# TODO(draft): this cell has no execution count and does not run as written:
# `active_X` and `inactive_X` are not assigned in any cell shown here, and
# `vectorizer` only exists as a local variable inside train_obabel_model.
# Plan: active_X, inactive_X are data matrices created as above with vectorize(...)
active_X
inactive_X
from eden.util import fit
fit(active_X, inactive_X, vectorizer)
# Plan: do transform on both, fit only on positive
# Note to self: next meeting Wednesday 16:30
### next meet wednesday 16.30