In [16]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,estimate
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from itertools import islice
def get_graphs(dataset_fname, size=100):
    """Return a lazy iterator over the first `size` items read from a gSpan file.

    Args:
        dataset_fname: path handed unchanged to `gspan_to_eden`.
        size: maximum number of items to yield; `None` means no limit
            (standard `itertools.islice` behaviour).

    Returns:
        An `itertools.islice` iterator over the output of `gspan_to_eden`
        (presumably graph objects -- confirm against the eden converter).
    """
    all_graphs = gspan_to_eden(dataset_fname)
    return islice(all_graphs, size)

In [18]:
def fit_sample(graphs):
    """Fit a `GraphLearnSampler` on `graphs`, then sample from the same graphs.

    The input iterator is duplicated with `tee`: one copy is used to fit the
    sampler, the other is passed to `sampler.sample` as the starting graphs.
    The grammar sizes reported by the fitted sampler are printed in between.

    Args:
        graphs: iterable of graphs; it is consumed by this function.

    Returns:
        The result of `sampler.sample(...)`, requested with
        `generator_mode=True`.
    """
    training_graphs, seed_graphs = tee(graphs)

    sampler = GraphLearnSampler(radius_list=[0, 1],
                                thickness_list=[1],
                                min_cip_count=2,
                                min_interface_count=2,
                                vectorizer=Vectorizer(5))
    sampler.fit(training_graphs, nu=0.5, n_jobs=-1)

    # Report how large the induced grammar is before sampling from it.
    print('graph grammar stats:')
    interface_counts, core_counts, cip_counts = sampler.grammar().size()
    print('#interfaces: %d   #cores: %d   #core-interface-pairs: %d' % (interface_counts, core_counts, cip_counts))

    # All sampling knobs in one place; values are unchanged from the call site.
    sampling_options = dict(n_steps=5,
                            n_samples=4,
                            target_orig_cip=True,
                            probabilistic_core_choice=False,
                            score_core_choice=True,
                            max_core_size_diff=0,
                            improving_threshold=0.5,
                            accept_static_penalty=0,
                            generator_mode=True,
                            n_jobs=-1,
                            burnin=1,
                            select_cip_max_tries=200,
                            keep_duplicates=True)
    return sampler.sample(seed_graphs, **sampling_options)

In [19]:
def fit_and_evaluate(pos_original, neg_original,
                     pos_sampled, neg_sampled,
                     pos_test, neg_test):
    """Train on three training sets and score each on the same test set.

    The three training sets are, in order: the original graphs, the sampled
    graphs, and the concatenation of both ('original+sample'). For each one
    an estimator is built with `fit` and scored with `estimate` on
    `pos_test` / `neg_test`.

    Args:
        pos_original, neg_original: iterables of original positive/negative graphs.
        pos_sampled, neg_sampled: iterables of sampled positive/negative graphs.
        pos_test, neg_test: iterables of held-out positive/negative graphs.

    Returns:
        A list with three entries (original, sample, original+sample). Each
        entry is the second value returned by `estimate` (bound to `roc`
        here), or 0 when either class of that training set is empty.
    """
    # create graph sets...orig augmented and sampled
    # Every source is needed twice: once on its own and once inside the
    # augmented set, hence the tee() duplication.
    pos_orig, pos_orig_ = tee(pos_original)
    neg_orig, neg_orig_ = tee(neg_original)

    pos_sampled, pos_sampled_ = tee(pos_sampled)
    neg_sampled, neg_sampled_ = tee(neg_sampled)

    pos_augmented = chain(pos_orig_, pos_sampled_)
    neg_augmented = chain(neg_orig_, neg_sampled_)

    training_sets = [('original', pos_orig, neg_orig),
                     ('sample', pos_sampled, neg_sampled),
                     ('original+sample', pos_augmented, neg_augmented)]

    predictive_performances = []
    for desc, pos_train, neg_train in training_sets:
        # Count the instances on a duplicate so the training iterators stay intact.
        pos_train, pos_train_ = tee(pos_train)
        neg_train, neg_train_ = tee(neg_train)
        pos_size = sum(1 for _ in pos_train_)
        neg_size = sum(1 for _ in neg_train_)

        start = time()
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('-' * 80)
        print('working on %s' % (desc))
        print('training set sizes: #pos: %d #neg: %d' % (pos_size, neg_size))

        if pos_size == 0 or neg_size == 0:
            # Nothing to train on for at least one class: record a 0 score.
            print('WARNING: empty dataset')
            predictive_performances.append(0)
            continue

        # Re-duplicate the test iterators so later iterations can reuse them.
        pos_test, pos_test_ = tee(pos_test)
        neg_test, neg_test_ = tee(neg_test)
        local_estimator = fit(pos_train, neg_train, Vectorizer(4), n_jobs=-1, n_iter_search=1)
        _, roc = estimate(pos_test_, neg_test_, local_estimator, Vectorizer(4))
        predictive_performances.append(roc)
        print('elapsed: %.1f sec' % (time() - start))
    return predictive_performances

In [20]:
def evaluate(pos_fname, neg_fname, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    """Run the sampling experiment over several training-set fractions.

    The positive and negative graphs are loaded with `get_graphs` and split
    once into train and test parts. For every value in `percentages` and
    every repetition, a subset of the training part is drawn, new graphs are
    sampled from it independently per class with `fit_sample`, and
    `fit_and_evaluate` scores the original, sampled and combined training
    sets on the held-out test part.

    Args:
        pos_fname, neg_fname: paths of the positive/negative gSpan files.
        size: maximum number of graphs read per class (passed to `get_graphs`).
        percentages: iterable of values passed to `random_bipartition_iter`
            to draw the training subset (presumably fractions in (0, 1) --
            confirm against eden.util). Must be supplied despite the `None`
            default, since it is iterated over.
        n_repetitions: number of repetitions per percentage; must be an
            integer despite the `None` default, since it is passed to `range`.
        train_test_split: value passed to `random_bipartition_iter` for the
            initial train/test split.

    Returns:
        A tuple `(original, original+sample, sample)`. Each element is a list
        with one entry per percentage, and each entry is a list with one
        score per repetition.
    """
    # initializing
    graphs_pos = get_graphs(pos_fname, size=size)
    graphs_neg = get_graphs(neg_fname, size=size)

    # train/test split
    from eden.util import random_bipartition_iter
    pos_train_global, pos_test_global = random_bipartition_iter(graphs_pos, train_test_split)
    neg_train_global, neg_test_global = random_bipartition_iter(graphs_neg, train_test_split)

    original_repetitions = []
    original_sample_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        originals = []
        originals_samples = []
        samples = []
        for repetition in range(n_repetitions):
            # Duplicate the global iterators so each repetition gets a fresh
            # copy while the originals remain available for the next one.
            pos_train_global, pos_train_global_ = tee(pos_train_global)
            neg_train_global, neg_train_global_ = tee(neg_train_global)
            pos_test_global, pos_test_global_ = tee(pos_test_global)
            neg_test_global, neg_test_global_ = tee(neg_test_global)

            # use shuffled list to create test and sample set
            # The second part of each bipartition is not used.
            pos, _ = random_bipartition_iter(pos_train_global_, percentage)
            pos, pos_ = tee(pos)
            neg, _ = random_bipartition_iter(neg_train_global_, percentage)
            neg, neg_ = tee(neg)

            # sample independently from the 2 classes
            print('Positive')
            sampled_pos = fit_sample(pos_)
            print('Negative')
            sampled_neg = fit_sample(neg_)

            # evaluate the predictive performance on held out test set
            start = time()
            print('=' * 80)
            print('repetition: %d/%d' % (repetition + 1, n_repetitions))
            print('training percentage:' + str(percentage))
            perf_orig, perf_samp, perf_orig_samp = fit_and_evaluate(pos, neg,
                                                                    sampled_pos, sampled_neg,
                                                                    pos_test_global_, neg_test_global_)
            print('Time elapsed: %.1f sec' % (time() - start))
            originals.append(perf_orig)
            originals_samples.append(perf_orig_samp)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        original_sample_repetitions.append(originals_samples)
        sample_repetitions.append(samples)

    return original_repetitions, original_sample_repetitions, sample_repetitions

In [21]:
%%time
#experiment

# NCI60/names is read as one dataset name per line. Reading it with plain
# Python (instead of a shell `cat`) fails loudly if the file is missing.
with open('NCI60/names') as names_file:
    dataset_names = names_file.read().splitlines()
dataset = dataset_names[6]
print('Working with dataset: %s' % dataset)
pos_dataset_fname = 'NCI60/' + dataset + '_orig_pos.gspan'
neg_dataset_fname = 'NCI60/' + dataset + '_orig_neg.gspan'

#pos_dataset_fname = 'bursi.pos.gspan'
#neg_dataset_fname = 'bursi.neg.gspan'
# The log file name needs a '%s' conversion; a bare '%_' is an invalid format
# specifier and raises ValueError before logging is configured.
configure_logging(logging.getLogger(), verbosity=1, filename='%s_predictive_performance_of_samples.log' % dataset)


# Values of the training-set percentage to evaluate.
percentages = [.05, .2, .4, .6, .8, .95]

original_repetitions,\
original_sample_repetitions,\
sample_repetitions = evaluate(pos_dataset_fname, neg_dataset_fname,
                              size=600,
                              percentages=percentages,
                              n_repetitions=3,
                              train_test_split=0.7)

Working with dataset: IGROV1_t
Positive
graph grammar stats:
#interfaces: 39   #cores: 40   #core-interface-pairs: 134
Negative
graph grammar stats:
#interfaces: 26   #cores: 37   #core-interface-pairs: 99
repetition: 1/5
training percentage:0.05
--------------------------------------------------------------------------------
working on original
training set sizes: #pos: 28 #neg: 28
Test set
Instances: 480 ; Features: 1048577 with an avg of 437 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.54      0.48      0.51       240
          1       0.53      0.60      0.56       240

avg / total       0.54      0.54      0.54       480

APR: 0.578
ROC: 0.539
Cross-validated estimate
            accuracy: 0.658 +- 0.033
           precision: 0.664 +- 0.045
              recall: 0.650 +- 0.031
                  f1: 0.656 +- 0.026
   average_precision: 0.

  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  TEP_minus_T1P = P * (T * E - T1)


graph grammar stats:
#interfaces: 104   #cores: 103   #core-interface-pairs: 543
Negative
graph grammar stats:
#interfaces: 96   #cores: 102   #core-interface-pairs: 481
repetition: 4/5
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #pos: 224 #neg: 224
Test set
Instances: 480 ; Features: 1048577 with an avg of 437 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.68      0.74      0.71       240
          1       0.71      0.66      0.69       240

avg / total       0.70      0.70      0.70       480

APR: 0.715
ROC: 0.757
Cross-validated estimate
            accuracy: 0.675 +- 0.040
           precision: 0.686 +- 0.060
              recall: 0.662 +- 0.028
                  f1: 0.672 +- 0.024
   average_precision: 0.693 +- 0.038
             roc_auc: 

KeyboardInterrupt: 

In [None]:
# plot
# One entry per curve, in drawing order: (legend label, scores, colour).
# `scores` holds one list of per-repetition scores for each percentage.
curves = [('original+sample', original_sample_repetitions, 'g'),
          ('original', original_repetitions, 'r'),
          ('sample', sample_repetitions, 'b')]
box_width = 0.02

plt.figure(figsize=(18,8))
plt.grid()
for curve_label, scores, colour in curves:
    # The same colour is applied to every part of the box plot.
    box_style = {'color': colour}
    plt.boxplot(scores, positions=percentages, widths=box_width, capprops=box_style, medianprops=box_style, boxprops=box_style, whiskerprops=box_style, flierprops=box_style)
    # Overlay the mean over repetitions for each percentage.
    mean_scores = np.mean(scores, axis=1)
    plt.plot(percentages, mean_scores, color=colour, marker='o', markeredgewidth=1, markersize=7, markeredgecolor=colour, markerfacecolor='w', label=curve_label)

plt.xlim(percentages[0]-.1,percentages[-1]+.1)
plt.ylabel('ROC AUC',fontsize=16)
plt.xlabel('Dataset size (fraction)',fontsize=16)
plt.legend(loc='lower right')
plt.savefig('%s_plot_predictive_performance_of_samples.pdf' % dataset)

.