In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2


In [2]:
from eden.converter.molecule import obabel
import networkx as nx
import pybel
import requests
import os.path
from itertools import tee
from numpy.random import randint
from numpy.random import uniform
from eden.graph import Vectorizer
from sklearn.linear_model import SGDClassifier
import datetime, time
from eden.util import random_bipartition_iter
from eden.model import ActiveLearningBinaryClassificationModel

from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=2)

In [3]:
def make_iterable(filename, file_format):
    """Lazily yield molecule records from ``filename``, one string per record.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    file_format : str
        ``'sdf'``: lines are accumulated until a line that is exactly
        ``$$$$`` (ignoring surrounding whitespace); the accumulated text,
        including that terminator line, is yielded as one record. Trailing
        lines that are not followed by a ``$$$$`` terminator are not yielded.
        ``'smi'``: every line of the file is yielded unchanged.

    Yields
    ------
    str
        One record (``'sdf'``) or one line (``'smi'``).

    Raises
    ------
    ValueError
        If ``file_format`` is neither ``'sdf'`` nor ``'smi'``. Because this
        is a generator, the error surfaces when iteration starts.
    """
    if file_format == 'sdf':
        with open(filename) as f:
            # Collect lines in a list and join once per record instead of
            # growing a string line by line.
            record_lines = []
            for line in f:
                record_lines.append(line)
                if line.strip() == '$$$$':
                    yield ''.join(record_lines)
                    record_lines = []
    elif file_format == 'smi':
        with open(filename) as f:
            for line in f:
                yield line
    else:
        # Previously an unknown format produced an empty iterator, which
        # looked like an empty data set further down the pipeline.
        raise ValueError("unsupported file_format %r (expected 'sdf' or 'smi')"
                         % (file_format,))

This is where the data sets are defined:

In [4]:
# Assay identifier interpolated into the data file names below.
AID = 720577
#AID=2401
# Directory holding the AID<id>_active.sdf / AID<id>_inactive.sdf files.
# The default is the original author's checkout; set the EDEN_DATA_DIR
# environment variable to use another location without editing this cell.
DATA_DIR = os.environ.get(
    'EDEN_DATA_DIR',
    '/home/liconj/proj/thesis/EDeN/examples/model_comparison/data')
active_fname = DATA_DIR + '/AID%s_active.sdf' % AID
inactive_fname = DATA_DIR + '/AID%s_inactive.sdf' % AID

___
___


### Original model

Functions for training and testing the model

In [5]:
# --- Settings for the "default" model ---------------------------------
model_type = "default"                          # selects the converter used by pre_processor
model_fname = DATA_DIR + '/AID%s.model' % AID   # passed to model.optimize as model_name
n_conf = 10                                     # forwarded to the pre-processor as 'n_conf'
n_iter = 50                                     # number of sampled values per randomised parameter
active_set_size = 5                             # passed to model.optimize as size_negative
n_active_learning_iterations = 0                # active-learning iterations setting (0 in this run)
threshold = 1                                   # NOTE(review): not referenced by any visible cell
train_test_split = 0.8                          # relative_size given to random_bipartition_iter

# Candidate values for the pre-processor; 'k' and 'threshold' are drawn at
# random (n_iter draws each), the other two are fixed single-element lists.
pre_processor_parameters = dict(
    k=randint(1, 10, size=n_iter),
    threshold=randint(3, 10, size=n_iter),
    model_type=[model_type],
    n_conf=[n_conf],
)

def pre_processor(data, model_type="3d", **kwargs):
    """Convert ``data`` with the obabel converter selected by ``model_type``.

    Parameters
    ----------
    data
        Passed unchanged as the first argument of the converter.
    model_type : str
        ``"default"`` calls ``obabel.obabel_to_eden``; ``"3d"`` calls
        ``obabel.obabel_to_eden3d``.
    **kwargs
        Forwarded unchanged to the selected converter.

    Returns
    -------
    Whatever the selected converter returns.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the two supported values (the
        original fell through and failed with an unbound local variable).
    """
    if model_type == "default":
        return obabel.obabel_to_eden(data, **kwargs)
    if model_type == "3d":
        return obabel.obabel_to_eden3d(data, **kwargs)
    raise ValueError("unknown model_type %r (expected 'default' or '3d')"
                     % (model_type,))



# Building blocks of the predictive model.
# NOTE(review): newer scikit-learn releases replaced class_weight='auto'
# with 'balanced' - confirm against the installed version before upgrading.
estimator = SGDClassifier(shuffle=True, class_weight='auto')
vectorizer = Vectorizer()

# Assemble the predictive model around the pre-processing function.
model = ActiveLearningBinaryClassificationModel(
    pre_processor,
    vectorizer=vectorizer,
    estimator=estimator,
    fit_vectorizer=True,
    n_blocks=10,
    n_jobs=2,
    pre_processor_n_jobs=2,
)

In [6]:
########
# Create iterables from files
########

# Read the files configured in the data-set cell (active_fname /
# inactive_fname) rather than hard-coded names, so that changing AID or
# DATA_DIR there actually changes the data loaded here.
iterable_pos = make_iterable(active_fname, 'sdf')
iterable_neg = make_iterable(inactive_fname, 'sdf')

# Counting exhausts an iterator, so count on tee() copies and keep the
# other copies for the train/test split below.
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

start = time.time()
n_pos = sum(1 for mol in iterable_pos_)
n_neg = sum(1 for mol in iterable_neg_)
elapsed = time.time() - start  # measured once so both printed values agree
print('# positives: %d  # negatives: %d (%.1f sec %s)' % (
    n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))

# Split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

# positives: 80  # negatives: 146 (0.2 sec 0:00:00.166845)


In [7]:
%%time
# Optimize hyperparameters and fit model

# Candidate values for the vectorizer.
vectorizer_parameters = {'complexity': [4, 5, 6], 'n': [2, 3, 4]}

# Candidate values for the SGD estimator. Entries built with randint/uniform
# hold n_iter random draws; uniform(0.1, size=n_iter) draws from [0.1, 1.0)
# because only the lower bound is overridden.
estimator_parameters = {'n_iter': randint(5, 100, size=n_iter),
                        'penalty': ['l1', 'l2', 'elasticnet'],
                        'l1_ratio': uniform(0.1, 0.9, size=n_iter),
                        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
                        'power_t': uniform(0.1, size=n_iter),
                        'alpha': [10**x for x in range(-8, -2)],
                        'eta0': [10**x for x in range(-4, -1)],
                        'learning_rate': ["invscaling", "constant", "optimal"]}

# Use the configured n_active_learning_iterations instead of repeating its
# value as a literal, so the settings cell stays the single source of truth.
model.optimize(iterable_pos_train, iterable_neg_train,
               model_name=model_fname,
               n_active_learning_iterations=n_active_learning_iterations,
               size_positive=-1,
               size_negative=active_set_size,
               n_iter=n_iter, cv=3,
               pre_processor_parameters=pre_processor_parameters,
               vectorizer_parameters=vectorizer_parameters,
               estimator_parameters=estimator_parameters)



	Parameters range:

Pre_processor:
         k: [1 2 5 7 2 4 6 2 8 1 9 7 4 1 1 7 5 7 4 9 8 7 7 5 7 3 8 2 4 5 9 1 8 8 7 3 3
 1 3 3 8 7 7 5 6 8 5 3 6 5]
model_type: ['default']
    n_conf: [10]
 threshold: [4 9 3 9 9 5 4 8 4 6 8 6 8 8 7 9 4 8 9 7 5 3 7 9 3 7 8 3 3 4 7 6 8 6 4 7 9
 4 7 5 7 8 4 4 5 3 7 5 7 9]

Vectorizer:
complexity: [4, 5, 6]
         n: [2, 3, 4]

Estimator:
     alpha: [1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001]
      eta0: [0.0001, 0.001, 0.01]
  l1_ratio: [ 0.16031039  0.19845281  0.81871     0.66751793  0.4009681   0.23889493
  0.71036693  0.50040598  0.15835753  0.6508024   0.36833993  0.13104658
  0.36660095  0.46036753  0.33050416  0.62682886  0.14074584  0.56182112
  0.34261588  0.36530662  0.85425686  0.7306414   0.60846963  0.42691566
  0.78952623  0.3694253   0.5744091   0.40837884  0.14373039  0.83631338
  0.52421769  0.24527844  0.88137749  0.65667606  0.14469994  0.75664721
  0.31348643  0.57295542  0.84086094  0.17308704  0.17949613  0.22120594
  0.584010

In [8]:
%%time
# Evaluate the fitted model on the held-out positive/negative test iterables;
# the call's return value is left as the cell output.
model.estimate(iterable_pos_test, iterable_neg_test)


Classifier:
SGDClassifier(alpha=0.001, average=False, class_weight='auto', epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.76752244980748252,
       learning_rate='optimal', loss='hinge', n_iter=67, n_jobs=1,
       penalty='l1', power_t=0.75168972356655483, random_state=None,
       shuffle=True, verbose=0, warm_start=False)

Data:
Instances: 46 ; Features: 1048577 with an avg of 598 features per instance

Predictive performace estimate:
             precision    recall  f1-score   support

         -1       0.71      0.57      0.63        30
          1       0.41      0.56      0.47        16

avg / total       0.60      0.57      0.58        46

APR: 0.493
ROC: 0.617
CPU times: user 219 ms, sys: 89.9 ms, total: 309 ms
Wall time: 1.05 s


(0.49254906850739988, 0.61666666666666659)

_____________
_____________
### 3D model

In [13]:
# --- Settings for the "3d" model --------------------------------------
model_type = "3d"                                 # selects the 3D converter in pre_processor
model_fname = DATA_DIR + '/AID%s.model3d' % AID   # passed to model3d.optimize as model_name
n_conf = 10                                       # forwarded to the pre-processor as 'n_conf'
n_iter = 200                                      # number of sampled values per randomised parameter
# NOTE(review): the visible model3d.optimize call does not pass the next two
# settings - confirm whether that is intended.
active_set_size = 5
n_active_learning_iterations = 0
train_test_split = 0.8                            # relative_size given to random_bipartition_iter

# Candidate values for the pre-processor; 'k' and 'threshold' are drawn at
# random (n_iter draws each), the other two are fixed single-element lists.
pre_processor_parameters = dict(
    k=randint(1, 10, size=n_iter),
    threshold=randint(3, 10, size=n_iter),
    model_type=[model_type],
    n_conf=[n_conf],
)

def pre_processor(data, model_type="3d", **kwargs):
    """Run the obabel converter chosen by ``model_type`` on ``data``.

    Parameters
    ----------
    data
        Handed unchanged to the converter as its first argument.
    model_type : str
        ``"default"`` selects ``obabel.obabel_to_eden``; ``"3d"`` selects
        ``obabel.obabel_to_eden3d``.
    **kwargs
        Extra keyword arguments, forwarded unchanged to the converter.

    Returns
    -------
    The value returned by the selected converter.

    Raises
    ------
    ValueError
        For any other ``model_type``; previously this case fell through and
        failed on an unbound local variable.
    """
    if model_type == "default":
        return obabel.obabel_to_eden(data, **kwargs)
    if model_type == "3d":
        return obabel.obabel_to_eden3d(data, **kwargs)
    raise ValueError("unknown model_type %r (expected 'default' or '3d')"
                     % (model_type,))



# Building blocks of the 3D predictive model.
# NOTE(review): newer scikit-learn releases replaced class_weight='auto'
# with 'balanced' - confirm against the installed version before upgrading.
estimator = SGDClassifier(shuffle=True, class_weight='auto')
vectorizer = Vectorizer()

# Assemble the 3D model; unlike the 2D model above it uses a single job for
# both the model and the pre-processor.
model3d = ActiveLearningBinaryClassificationModel(
    pre_processor,
    vectorizer=vectorizer,
    estimator=estimator,
    fit_vectorizer=True,
    n_blocks=10,
    n_jobs=1,
    pre_processor_n_jobs=1,
)

In [14]:
########
# Create iterables from files
########

# Load the files named by active_fname / inactive_fname (set in the data-set
# cell) instead of hard-coded file names, so the data follows AID/DATA_DIR.
iterable_pos = make_iterable(active_fname, 'sdf')
iterable_neg = make_iterable(inactive_fname, 'sdf')

# Count on tee() copies: counting consumes its iterator, and the remaining
# copies are still needed for the train/test split.
iterable_pos, iterable_pos_ = tee(iterable_pos)
iterable_neg, iterable_neg_ = tee(iterable_neg)

start = time.time()
n_pos = sum(1 for mol in iterable_pos_)
n_neg = sum(1 for mol in iterable_neg_)
elapsed = time.time() - start  # single measurement shared by both fields
print('# positives: %d  # negatives: %d (%.1f sec %s)' % (
    n_pos, n_neg, elapsed, str(datetime.timedelta(seconds=elapsed))))

# Split train/test
iterable_pos_train, iterable_pos_test = random_bipartition_iter(
    iterable_pos, relative_size=train_test_split)
iterable_neg_train, iterable_neg_test = random_bipartition_iter(
    iterable_neg, relative_size=train_test_split)

# positives: 80  # negatives: 146 (0.0 sec 0:00:00.037743)


In [15]:
# Inspect the fit_vectorizer attribute of model3d (constructed above with
# fit_vectorizer=True); the value is shown as the cell output.
model3d.fit_vectorizer

True

In [16]:
#%%time
# Hyperparameter search and fit for the 3D model.
# NOTE(review): the recorded run of this cell ended with KeyError: 'hlabel'.
# NOTE(review): unlike the 2D optimize call, this one passes no
# n_active_learning_iterations / size_positive / size_negative - confirm
# that relying on the defaults is intended.

# Vectorizer candidates: a single complexity, three values of n.
vectorizer_parameters = dict(complexity=[6], n=[2, 3, 4])

# Estimator candidates; randint/uniform entries hold n_iter random draws.
estimator_parameters = dict(
    n_iter=randint(5, 100, size=n_iter),
    penalty=['l1', 'l2', 'elasticnet'],
    l1_ratio=uniform(0.1, 0.9, size=n_iter),
    loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    power_t=uniform(0.1, size=n_iter),
    alpha=[10**exponent for exponent in range(-8, -2)],
    eta0=[10**exponent for exponent in range(-4, -1)],
    learning_rate=["invscaling", "constant", "optimal"],
)

model3d.optimize(
    iterable_pos_train,
    iterable_neg_train,
    model_name=model_fname,
    n_iter=n_iter,
    cv=3,
    pre_processor_parameters=pre_processor_parameters,
    vectorizer_parameters=vectorizer_parameters,
    estimator_parameters=estimator_parameters,
)



	Parameters range:

Pre_processor:
         k: [1 8 9 9 1 7 5 1 1 7 5 3 1 9 1 1 4 7 3 7 4 5 9 5 4 9 7 3 5 7 4 7 4 7 9 6 4
 3 3 9 1 4 9 1 8 2 3 1 2 8 8 1 2 8 4 9 6 2 8 1 4 2 3 3 6 2 4 4 4 7 9 2 4 6
 5 8 5 2 9 9 5 9 1 1 9 8 9 1 6 1 9 5 1 4 3 7 1 9 7 2 2 4 7 8 5 9 6 3 7 4 3
 9 4 7 6 8 6 4 6 6 2 9 5 2 4 3 7 2 3 4 9 4 9 1 1 6 2 4 5 3 4 3 9 2 9 4 3 6
 6 7 8 1 9 5 5 2 1 4 4 4 9 6 3 1 9 2 2 4 7 1 1 2 2 4 2 2 1 2 7 7 7 4 4 7 2
 1 5 8 9 4 8 4 5 9 1 5 8 1 4 7]
model_type: ['3d']
    n_conf: [10]
 threshold: [4 6 3 8 4 5 7 8 7 4 3 6 7 4 5 5 3 5 8 3 8 3 3 7 3 7 3 5 7 9 5 4 3 7 7 8 7
 9 7 3 4 3 7 7 8 6 3 6 5 5 7 3 5 7 8 5 3 9 3 8 3 6 5 9 6 8 9 6 7 9 9 4 3 3
 8 7 7 8 4 8 5 8 5 4 6 7 7 9 3 8 4 4 3 7 8 5 6 3 6 4 6 9 6 9 7 7 4 6 9 8 9
 6 7 3 8 4 9 9 6 5 7 6 3 3 4 9 6 6 6 3 4 4 5 7 9 9 7 7 9 9 9 7 6 4 6 4 5 6
 5 7 8 5 9 4 8 9 8 6 3 9 7 4 9 3 7 4 3 9 6 4 6 5 4 5 4 7 5 5 5 3 3 7 6 5 4
 3 8 7 7 3 8 5 4 7 8 7 4 8 3 9]

Vectorizer:
complexity: [6]
         n: [2, 3, 4]

Estimator:
     alpha: [1e-08, 1e-07,

KeyError: 'hlabel'

In [None]:
%%time
# Evaluate the 3D model on the held-out positive/negative test iterables;
# the call's return value is left as the cell output.
model3d.estimate(iterable_pos_test, iterable_neg_test)

In [None]:
def test_obabel_model(fname, model_type = "default", model_fname=None):
    from eden.model import ActiveLearningBinaryClassificationModel

    model = ActiveLearningBinaryClassificationModel()
    model.load(model_fname)

    #create iterable from files
    from eden.converter.molecule import obabel
    if model_type == "default":
        iterable=obabel.obabel_to_eden(fname)
    elif model_type == "3d":
        iterable=obabel.obabel_to_eden3d(fname)

    predictions= model.decision_function( iterable )

    return predictions