In [39]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path

In [41]:
%%time
AID=825
#AID=2401
#DATA_DIR = '/Volumes/seagate/thesis/examples/data'
DATA_DIR = '/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data'
active_fname=DATA_DIR + '/AID%s_active.smi'%AID
inactive_fname=DATA_DIR + '/AID%s_inactive.smi'%AID

CPU times: user 13 µs, sys: 2 µs, total: 15 µs
Wall time: 21 µs


In [42]:
active_fname

'/Users/jl/uni-freiburg/thesis/EDeN/examples/3Dmodel/data/AID825_active.smi'

In [43]:
def make_iterable(filename):
    with open(filename) as f:
        for line in f:
            yield line.strip()

Functions for training and testing the model

In [44]:
import datetime, time
def train_obabel_model(iterable_pos, iterable_neg, data_dir,
                       model_type = "default",
                       model_fname=None, n_iter=40, active_set_size=1000, 
                       n_active_learning_iterations=3, threshold=1, train_test_split=0.7, 
                       verbose=False):
    
    from numpy.random import randint
    from numpy.random import uniform
    
    
    #### Create the obabel converter that is to be used inside the preprocessor (???) ######
    converter = obabel.OBabelConverter()
    
    # this will be passed as an argument to the model later on
    def pre_processor(data, model_type="default", converter=None, **kwargs):

        #### Use the model_type variable from outside (?) ####
        # model_type = kwargs.get('mode', 'default')

        if model_type == "default":
            iterable = converter.obabel_to_eden(data, **kwargs)
        elif model_type == "3d":
            iterable = converter.obabel_to_eden3d(data, **kwargs)
        return iterable
    
    from eden.graph import Vectorizer
    vectorizer = Vectorizer()

    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(class_weight='auto', shuffle=True)

    #######3
    #create iterable from files
    ########
    
    from itertools import tee
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    
    import time
    start = time.time()
    print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))
    
    iterable_pos, iterable_pos_ = tee(iterable_pos)
    iterable_neg, iterable_neg_ = tee(iterable_neg)
    
    #split train/test
    from eden.util import random_bipartition_iter
    iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
    iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)



    #make predictive model
    from eden.model import ActiveLearningBinaryClassificationModel
    model = ActiveLearningBinaryClassificationModel( pre_processor, estimator=estimator, vectorizer=vectorizer )

    #optimize hyperparameters and fit model

    pre_processor_parameters={'k':randint(1, 10,size=n_iter),
                             'converter':[converter],
                             'model_type':['default']} 
    
    #print "pre processor parameters: " + str(pre_processor_parameters)
    vectorizer_parameters={'complexity':[2,3,4],  
                           'discretization_size':randint(3, 100,size=n_iter),                     
                           'discretization_dimension':randint(3, 100,size=n_iter)}
    
    estimator_parameters={'n_iter':randint(5, 100, size=n_iter),
                          'penalty':['l1','l2','elasticnet'],
                          'l1_ratio':uniform(0.1,0.9, size=n_iter), 
                          'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                          'power_t':uniform(0.1, size=n_iter),
                          'alpha': [10**x for x in range(-8,-2)],
                          'eta0': [10**x for x in range(-4,-1)],
                          'learning_rate': ["invscaling", "constant", "optimal"]}

    print "calling optimizer.."
    model.optimize(iterable_pos_train, iterable_neg_train, 
                   model_name=model_fname,
                   fit_vectorizer=True,
                   n_active_learning_iterations=n_active_learning_iterations,
                   size_positive=-1,
                   size_negative=active_set_size,
                   n_iter=n_iter, cv=3, n_jobs=1, verbose=verbose,
                   pre_processor_parameters=pre_processor_parameters, 
                   vectorizer_parameters=vectorizer_parameters, 
                   estimator_parameters=estimator_parameters)
  
    #estimate predictive performance
    #model.estimate( iterable_pos_test, iterable_neg_test, cv=5 )
    # Had to change this call, estimate has no cv parameter
    model.estimate( iterable_pos_test, iterable_neg_test )
    
    return model

def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)
    
    predictions= model.decision_function( iterable )
        
    return predictions

Train the models

**3D model**
---

In [45]:
#%%time
pos_iterator=make_iterable(active_fname) #this is a SMILES file
neg_iterator=make_iterable(inactive_fname) #this is a SMILES file 
model_fname=DATA_DIR + '/AID%s.model3d'%AID
model = train_obabel_model(pos_iterator, neg_iterator,
                           data_dir=DATA_DIR,
                           model_type = "3d",
                           model_fname=model_fname,
                           n_iter=5, 
                           active_set_size=500, 
                           n_active_learning_iterations=0, 
                           threshold=1, 
                           train_test_split=0.5, 
                           verbose=2)


# positives: 5  # negatives: 5 (0.0 sec 0:00:00.000305)
calling optimizer..
--------------------------------------------------------------------------------
Parameters range:
Pre_processor:
{'converter': [<eden.converter.molecule.obabel.OBabelConverter object at 0x10b0bd9d0>],
 'k': array([7, 2, 6, 2, 3]),
 'model_type': ['default']}
Vectorizer:
{'complexity': [2, 3, 4],
 'discretization_dimension': array([89, 46, 66, 19, 73]),
 'discretization_size': array([68, 64, 27, 55, 76])}
Estimator:
{'alpha': [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001],
 'eta0': [0.0001, 0.001, 0.01],
 'l1_ratio': array([ 0.4833102 ,  0.23601488,  0.6898897 ,  0.73678389,  0.18946831]),
 'learning_rate': ['invscaling', 'constant', 'optimal'],
 'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
 'n_iter': array([40, 16, 78, 94, 69]),
 'penalty': ['l1', 'l2', 'elasticnet'],
 'power_t': array([ 0.45764911,  0.59500274,  0.47449233,  0.25741428,  0.63303516])}
-------------------------------

AttributeError: 'NoneType' object has no attribute 'set_params'

**Original model**
---

In [None]:
%%time
model_fname='AID%s.modeld'%AID
model = train_obabel_model(active_fname, inactive_fname,
                           model_type = "default",
                           model_fname=model_fname,
                           n_iter=40, 
                           active_set_size=500, 
                           n_active_learning_iterations=0, 
                           threshold=1, 
                           train_test_split=0.8, 
                           verbose=1)



Test the models:

In [None]:
%matplotli

In [None]:
def pre_processor(data, model_type="default", converter=None, **kwargs):

        #### Use the model_type variable from outside (?) ####
        # model_type = kwargs.get('mode', 'default')

        if model_type == "default":
            iterable = converter.obabel_to_eden(data, **kwargs)
        elif model_type == "3d":
            iterable = converter.obabel_to_eden3d(data, **kwargs)
        return iterable



In [None]:
iterable_pos=make_iterable(active_fname) #this is a SMILES file
iterable_neg=make_iterable(inactive_fname) #this is a SMILES file 
from itertools import tee
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

import time
start = time.time()
#print('# positives: %d  # negatives: %d (%.1f sec %s)'%(sum(1 for x in iterable_pos_), sum(1 for x in iterable_neg_), time.time() - start, str(datetime.timedelta(seconds=(time.time() - start)))))

iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

#split train/test
from eden.util import random_bipartition_iter
iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=.5)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=.5)


In [None]:
%matplotlib inline
from eden.converter.molecule import obabel
from eden.util.display import draw_graph
from itertools import islice
converter = obabel.OBabelConverter()
iterable_pos = pre_processor(iterable_pos_train, converter=converter, k=3, model_type="3d")
for G in islice(iterable_pos, 3):
    draw_graph(G, vertex_label="atom_type")