In [267]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload edited modules automatically
# (mode 2), so changes to imported source files are picked up live.
%load_ext autoreload
%autoreload 2
import logging
from eden.util import configure_logging
# Configure the root logger through the eden helper with verbosity level 2.
configure_logging(logging.getLogger(), verbosity=2)
from IPython.core.display import HTML
# Widen the notebook container to 95% of the page. The HTML object is the
# last expression of the cell, so it is what the cell displays.
HTML('<style>.container { width:95% !important; }</style>')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Code

In [268]:
from GArDen.interfaces import convert, transform, model, predict
from itertools import izip, islice

In [269]:
from sklearn import metrics
import numpy as np
def _collect_predictions(graphs, true_label, fit_predictor):
    """Run the fitted predictor on `graphs` and flatten its output.

    `predict` is iterated as a mapping from predicted label to a collection
    of elements, each of which exposes a 'score' entry.

    Returns three parallel lists: the true label (repeated), the predicted
    label and the score of every predicted element.
    """
    y_true, y_pred, y_score = [], [], []
    partition = predict(graphs, program=fit_predictor)
    for predicted_label in partition:
        for element in partition[predicted_label]:
            y_true.append(true_label)
            y_pred.append(predicted_label)
            y_score.append(element['score'])
    return y_true, y_pred, y_score


def evaluate(pos_test, neg_test, fit_predictor):
    """Print classification quality of `fit_predictor` on a test set.

    Parameters
    ----------
    pos_test : instances whose true label is 1.
    neg_test : instances whose true label is -1.
    fit_predictor : fitted program handed to `predict`.

    Prints the confusion matrix, the scikit-learn classification report and
    the ROC AUC computed from the per-element scores. Returns None.

    Side effect: sets the global numpy print precision to 2.
    """
    # Positives are predicted before negatives, as in the original ordering.
    pos_true, pos_pred, pos_score = _collect_predictions(pos_test, 1, fit_predictor)
    neg_true, neg_pred, neg_score = _collect_predictions(neg_test, -1, fit_predictor)

    y_test = np.array(pos_true + neg_true)
    y_pred = np.array(pos_pred + neg_pred)
    y_score = np.array(pos_score + neg_score)

    # confusion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    # Single-argument print(...) behaves the same as the print statement
    # under Python 2 and keeps the function's output calls uniform.
    print('Confusion matrix:')
    print(cm)

    # classification
    print(metrics.classification_report(y_test, y_pred))

    # roc
    print('ROC: %.3f' % metrics.roc_auc_score(y_test, y_score))

In [270]:
def preprocess(fname):
    """Convert the molecules in `fname` to a list of graphs.

    Only the MoleculeToGraph conversion is applied.

    NOTE: a later cell defines `preprocess` again; once that cell has run,
    its definition replaces this one.
    """
    from GArDen.convert.molecular_graph import MoleculeToGraph
    # Materialise the transform output so the result can be reused.
    return list(transform(fname, program=MoleculeToGraph()))

In [271]:
def preprocess(fname):
    """Convert the molecules in `fname` to a list of contracted graphs.

    Steps, applied in order through `transform`:
      1. MoleculeToGraph: molecules -> graphs.
      2. AnnotateMinimalCycles.
      3. Minor: contraction, configured with the attribute modifiers below.
    """
    from GArDen.convert.molecular_graph import MoleculeToGraph
    from GArDen.transform.minimal_cycle_annotation import AnnotateMinimalCycles
    from GArDen.transform.contraction import Minor, contraction_modifier

    molecule_graphs = transform(fname, program=MoleculeToGraph())
    annotated_graphs = transform(molecule_graphs, program=AnnotateMinimalCycles())

    modifiers = [
        # 'part_name' of the contracted nodes is reduced with 'set_categorical'
        # and written to the 'label' attribute of the resulting graph.
        contraction_modifier(attribute_in='part_name',
                             attribute_out='label',
                             reduction='set_categorical'),
        # 'weight' of the contracted nodes is summed and written to the
        # 'weight' attribute of the resulting graph.
        contraction_modifier(attribute_in='weight',
                             attribute_out='weight',
                             reduction='sum'),
    ]
    # NOTE(review): the modifier list is deliberately wrapped in another list;
    # presumably list-valued priors are treated as a set of candidate values,
    # so the single candidate here is the whole modifier list. Confirm.
    priors = dict(nesting=True, modifiers=[modifiers], weight_scaling_factor=1)
    contracted_graphs = transform(annotated_graphs, program=Minor(), parameters_priors=priors)
    return list(contracted_graphs)

In [272]:
# display of graphs
from eden.util.display import draw_graph_set
#draw_graph_set(graphs[:6], n_graphs_per_line=3, size=9, title_key='info', prog='neato', node_border=1, node_size=400, colormap='Set3', edge_color='_label_',edge_alpha=.3, vertex_label='label', vertex_color='_label_', ignore_for_layout='nesting')

# Experiment

In [273]:
# Input data: a directory given relative to the notebook location, and the two
# .sdf files read by the experiment (active / inactive, per the file names).
folder = '../../../DATA/'
active_fname = folder + 'AID2401_active.sdf'
inactive_fname = folder + 'AID2401_inactive.sdf'

In [274]:
%%time
active_graphs = preprocess(active_fname)
inactive_graphs = preprocess(inactive_fname)

from eden.util import random_bipartition_iter
relative_size = 0.8
pos_train, pos_test = random_bipartition_iter(active_graphs, relative_size=relative_size, random_state=1)
neg_train, neg_test = random_bipartition_iter(inactive_graphs, relative_size=relative_size, random_state=1)
train_graphs_tuple = (pos_train, neg_train)

CPU times: user 11.2 s, sys: 684 ms, total: 11.9 s
Wall time: 11.5 s


In [275]:
%%time
from sklearn.linear_model import SGDClassifier
parameters_priors=dict(average=True, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, n_iter=5, shuffle=True, verbose=0, epsilon=0.1, n_jobs=-1, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False)
parameters_priors.update(dict(vectorizer__r=3,vectorizer__d=5, vectorize__n_jobs=-1, vectorize__fit_flag=False, vectorize__n_blocks=5, vectorize__block_size=None))
predictor = SGDClassifier()
fit_predictor = model(train_graphs_tuple, program=predictor, parameters_priors=parameters_priors)

Positive data: Instances: 424 ; Features: 1048577 with an avg of 942 features per instance
Negative data: Instances: 1364 ; Features: 1048577 with an avg of 839 features per instance
CPU times: user 12.4 s, sys: 3.25 s, total: 15.7 s
Wall time: 28.4 s


In [276]:
%%time
# Evaluate the fitted model on the test parts of the positive and negative
# splits; `evaluate` prints the confusion matrix, the classification report
# and the ROC AUC.
evaluate(pos_test, neg_test, fit_predictor)

Confusion matrix:
[[334   6]
 [ 61  45]]
             precision    recall  f1-score   support

         -1       0.85      0.98      0.91       340
          1       0.88      0.42      0.57       106

avg / total       0.85      0.85      0.83       446

ROC: 0.909
CPU times: user 3.2 s, sys: 1.09 s, total: 4.28 s
Wall time: 7.41 s


---