In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
%matplotlib inline
import matplotlib.pyplot as plt

In [53]:
from eden.util import configure_logging
import logging
# Configure the root logger (logging.getLogger() with no name) through eden's
# helper, passing verbosity=1.
# NOTE(review): what verbosity level 1 maps to is defined in eden.util - confirm there.
configure_logging(logging.getLogger(),verbosity=1)

In [54]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time

In [55]:
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,estimate
from eden.graph import Vectorizer

In [56]:
from eden.converter.graph.gspan import gspan_to_eden
def get_graphs(dataset_fname, size=200):
    """Return an iterator over at most `size` graphs read from `dataset_fname`.

    Args:
        dataset_fname: dataset location, handed unchanged to `gspan_to_eden`.
        size: upper bound on the number of graphs yielded. `None` means no
            bound, since `islice` treats a `None` stop value as "until exhausted".

    Returns:
        An `itertools.islice` iterator wrapping the output of `gspan_to_eden`.
    """
    all_graphs = gspan_to_eden(dataset_fname)
    leading_graphs = islice(all_graphs, size)
    return leading_graphs

In [57]:
def fit_sample(graphs):
    """Fit a GraphLearnSampler on `graphs`, then sample using the same graphs as input.

    The incoming iterator is duplicated with `tee` because it is consumed
    twice: once by `sampler.fit` and once by `sampler.sample`.

    Args:
        graphs: iterable of graphs.

    Returns:
        Whatever `GraphLearnSampler.sample` returns for the options below
        (it is called with generator_mode=True).
    """
    training_graphs, seed_graphs = tee(graphs)

    sampler = GraphLearnSampler(radius_list=[0, 1],
                                thickness_list=[1, 2],
                                min_cip_count=2,
                                min_interface_count=2)
    sampler.fit(training_graphs, n_jobs=-1)

    # Sampling options collected in one place so they are easy to scan and tune.
    sampling_options = dict(
        same_radius=False,
        same_core_size=True,
        n_samples=2,
        n_steps=20,
        n_jobs=-1,
        accept_annealing_factor=1.5,
        probabilistic_core_choice=True,
        generator_mode=True,
        select_cip_max_tries=400,
        keep_duplicates=True,
    )
    return sampler.sample(seed_graphs, **sampling_options)

In [58]:
def fit_and_evaluate(pos_original, neg_original,
                     pos_augmented, neg_augmented,
                     pos_test, neg_test):
    """Train on three variants of the training data and score each on the test set.

    The variants, evaluated in this order, are:
      'original'         -- pos_original / neg_original
      'original+sample'  -- the *_augmented graphs followed by the originals
      'sample'           -- the *_augmented graphs alone

    Args:
        pos_original, neg_original: iterables of original training graphs.
        pos_augmented, neg_augmented: iterables of sampled graphs.
        pos_test, neg_test: iterables of held-out test graphs.

    Returns:
        A list with one entry per variant, in the order above: the second
        value returned by `estimate` (unpacked as `roc`), or 0 when either
        class of that variant's training set is empty.
    """
    # Every input is consumed more than once, so each iterator is duplicated.
    pos_orig, pos_orig_extra = tee(pos_original)
    neg_orig, neg_orig_extra = tee(neg_original)
    pos_generated, pos_sampled = tee(pos_augmented)
    neg_generated, neg_sampled = tee(neg_augmented)

    training_variants = [
        ('original', pos_orig, neg_orig),
        ('original+sample',
         chain(pos_generated, pos_orig_extra),
         chain(neg_generated, neg_orig_extra)),
        ('sample', pos_sampled, neg_sampled),
    ]

    predictive_performances = []
    for desc, pos_train, neg_train in training_variants:
        # Count the training graphs on a tee copy so the originals stay usable.
        pos_train, pos_counter = tee(pos_train)
        neg_train, neg_counter = tee(neg_train)
        pos_size = sum(1 for _ in pos_counter)
        neg_size = sum(1 for _ in neg_counter)

        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("-" * 80)
        print('working on %s' % (desc))
        print('training set sizes: #pos: %d #neg: %d' % (pos_size, neg_size))

        if pos_size == 0 or neg_size == 0:
            # Nothing to train on for this variant: record 0 and move on.
            print('WARNING: empty dataset')
            predictive_performances.append(0)
            continue

        start = time()
        # The test iterators are reused by later variants: keep one copy each.
        pos_test, pos_test_ = tee(pos_test)
        neg_test, neg_test_ = tee(neg_test)
        local_estimator = fit(pos_train, neg_train, Vectorizer(2), n_jobs=-1, n_iter_search=1)
        # Only the second value returned by estimate is reported.
        _, roc = estimate(pos_test_, neg_test_, local_estimator, Vectorizer(2))
        predictive_performances.append(roc)
        print('elapsed: %.1f sec' % (time() - start))
    return predictive_performances

In [None]:
def evaluate(pos_fname, neg_fname, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    """Run the sampling experiment over several training-set fractions.

    Graphs are loaded with `get_graphs` and split once into a global train
    and test part. For every value in `percentages`, and `n_repetitions`
    times each, a fraction of the training part is drawn, new graphs are
    sampled from it per class with `fit_sample`, and `fit_and_evaluate`
    scores the three training variants on the global test part.

    Args:
        pos_fname, neg_fname: dataset locations passed to `get_graphs`.
        size: maximum number of graphs read per class (passed to `get_graphs`).
        percentages: iterable of fractions passed to `random_bipartition_iter`
            when drawing from the training part.
        n_repetitions: number of repetitions per percentage.
        train_test_split: fraction passed to `random_bipartition_iter` for the
            global train/test split.

    Returns:
        A tuple `(original, original+sample, sample)`. Each element is a list
        with one inner list per percentage, holding one score per repetition.
    """
    from eden.util import random_bipartition_iter

    # initializing
    graphs_pos = get_graphs(pos_fname, size=size)
    graphs_neg = get_graphs(neg_fname, size=size)

    # train/test split
    pos_train_iter, pos_test_iter = random_bipartition_iter(graphs_pos, train_test_split)
    neg_train_iter, neg_test_iter = random_bipartition_iter(graphs_neg, train_test_split)

    # The splits are reused in every repetition. Materialise them once instead
    # of re-wrapping the same iterators in tee() on each pass, which builds an
    # ever-deeper chain of tee objects.
    pos_train_global = list(pos_train_iter)
    neg_train_global = list(neg_train_iter)
    pos_test_global = list(pos_test_iter)
    neg_test_global = list(neg_test_iter)

    original_repetitions = []
    original_sample_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        # Start the clock once per percentage so the report below covers all
        # repetitions, sampling included, and is defined even when
        # n_repetitions is 0.
        start = time()
        originals = []
        originals_samples = []
        samples = []
        for repetition in range(n_repetitions):
            # draw the subset of the training part used in this repetition
            # NOTE(review): confirm random_bipartition_iter draws a different
            # split on each call; with a fixed default seed every repetition
            # of a percentage would see the same subset.
            pos, _ = random_bipartition_iter(iter(pos_train_global), percentage)
            pos, pos_ = tee(pos)
            neg, _ = random_bipartition_iter(iter(neg_train_global), percentage)
            neg, neg_ = tee(neg)

            # sample independently from the 2 classes
            sampled_pos = fit_sample(pos)
            sampled_neg = fit_sample(neg)

            # evaluate the predictive performance on held out test set
            print("=" * 80)
            print('repetition: %d/%d' % (repetition + 1, n_repetitions))
            print("training percentage:" + str(percentage))
            perf_orig, perf_orig_samp, perf_samp = fit_and_evaluate(pos_, neg_,
                                                                    sampled_pos, sampled_neg,
                                                                    iter(pos_test_global),
                                                                    iter(neg_test_global))
            originals.append(perf_orig)
            originals_samples.append(perf_orig_samp)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        original_sample_repetitions.append(originals_samples)
        sample_repetitions.append(samples)
        print('Time elapsed for perc: %.2f : %.1f sec' % (percentage, (time() - start)))

    return original_repetitions, original_sample_repetitions, sample_repetitions

In [None]:
%time
#experiment
percentages=[.05,.1,.15,.2,.25,.3]
original_repetitions, original_sample_repetitions, sample_repetitions = evaluate('bursi.pos.gspan',
                                                                                 'bursi.neg.gspan', 
                                                                                 size=300, 
                                                                                 percentages=percentages, 
                                                                                 n_repetitions=5, 
                                                                                 train_test_split=0.7)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 5.01 µs


In [None]:
# plot
os = np.mean(original_sample_repetitions, axis=1)
o = np.mean(original_repetitions, axis=1)
s = np.mean(sample_repetitions, axis=1)
plt.figure(figsize=(12,5))
plt.grid()
plt.plot(os, color='g', marker='o', markersize=6, markeredgecolor='g', markerfacecolor='w', label='original+sample')
plt.plot(o, color='r', marker='o', markersize=6, markeredgecolor='r', markerfacecolor='w', label='original')
plt.plot(s, color='b', marker='o', markersize=6, markeredgecolor='b', markerfacecolor='w', label='sample')
plt.legend(loc='lower right')
plt.savefig('plot_predictive_performance_of_samples.pdf')

In [None]:
# plot
from graphlearn.utils.draw import draw_learning_curve
draw_learning_curve(data_first=original_repetitions,
                    data_second=original_sample_repetitions,
                    x_axis=percentages,
                    measure='roc',
                    delta=0.005,scaling=50)