In [48]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
from eden.util import configure_logging
import logging

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
from itertools import tee, chain, islice
import numpy as np
import random
from time import time
from graphlearn.graphlearn import GraphLearnSampler
from eden.util import fit,predict
from eden.graph import Vectorizer
# get data
from eden.converter.graph.gspan import gspan_to_eden
from itertools import islice
def get_graphs(dataset_fname, size=100):
    """Lazily read at most `size` graphs from the gSpan file `dataset_fname`.

    The file is parsed with `gspan_to_eden` and the resulting iterator is
    truncated with `islice`; passing `size=None` leaves it untruncated.
    """
    all_graphs = gspan_to_eden(dataset_fname)
    return islice(all_graphs, size)

In [50]:
def fit_sample(graphs):
    """Fit a GraphLearnSampler on `graphs` and sample from those same graphs.

    The input iterator is duplicated with `tee`: one copy is used to fit the
    sampler, the other is passed to `sampler.sample` as the starting graphs.
    The grammar sizes reported by the fitted sampler are printed.

    Returns whatever `sampler.sample` returns (it is called with
    `generator_mode=True`).
    """
    graphs_for_fit, graphs_for_sampling = tee(graphs)

    sampler = GraphLearnSampler(radius_list=[0, 1],
                                thickness_list=[1],
                                min_cip_count=2,
                                min_interface_count=2,
                                vectorizer=Vectorizer(5))
    sampler.fit(graphs_for_fit, nu=0.3, n_jobs=-1)

    # Report the size of the induced grammar.
    print('graph grammar stats:')
    interface_counts, core_counts, cip_counts = sampler.grammar().size()
    print('#interfaces: %d   #cores: %d   #core-interface-pairs: %d' % (interface_counts, core_counts, cip_counts))

    # Sampling configuration, kept together so it can be read at a glance.
    sampling_options = dict(n_steps=5,
                            n_samples=4,
                            target_orig_cip=True,
                            probabilistic_core_choice=False,
                            score_core_choice=True,
                            max_core_size_diff=0,
                            generator_mode=True,
                            improving_threshold=0.3,
                            accept_static_penalty=0,
                            n_jobs=-1,
                            burnin=1,
                            select_cip_max_tries=200,
                            keep_duplicates=True)
    return sampler.sample(graphs_for_sampling, **sampling_options)

In [51]:
def fit_and_evaluate(original, sampled, local_estimator):
    """Score the original and the sampled graphs with `local_estimator`.

    For each of the two graph iterables (in the order original, sample) the
    number of graphs is printed, `predict` is run in 'predict_proba' mode and
    the mean of column 1 of the returned predictions is taken as the score.
    An empty iterable is reported with a warning and scored as 0.

    Returns a two-element list: [score for `original`, score for `sampled`].
    """
    def _average_score(desc, graphs):
        # Duplicate the iterator so it can be counted without being consumed.
        graphs, graphs_for_count = tee(graphs)
        size = len(list(graphs_for_count))
        print("-" * 80)
        print('working on %s' % desc)
        print('training set sizes: #: %d' % size)
        if size == 0:
            print('WARNING: empty dataset')
            return 0

        start = time()
        predictions = predict(graphs,
                              estimator=local_estimator,
                              vectorizer=Vectorizer(4),
                              mode='predict_proba',
                              n_jobs=-1)
        avg_score = np.mean(predictions[:, 1])
        print('avg score: %.5f' % avg_score)
        print('elapsed: %.1f sec' % (time() - start))
        return avg_score

    score_original = _average_score('original', original)
    score_sample = _average_score('sample', sampled)
    return [score_original, score_sample]

In [52]:
def evaluate(data_fname, size=None, percentages=None, n_repetitions=None, train_test_split=None):
    """Compare estimator scores of original graphs and of graphs sampled from them.

    The graphs read from `data_fname` (at most `size`) are split in two parts
    with `random_bipartition_iter(graphs, train_test_split)`. For every value
    in `percentages` and every repetition:

    1. an SGDClassifier is fitted (via `fit`) on the second part of the split;
    2. a `percentage` share of the first part is drawn with
       `random_bipartition_iter`;
    3. `fit_sample` produces new graphs from that share;
    4. `fit_and_evaluate` scores both the share and the sampled graphs with
       the estimator from step 1.

    Parameters
    ----------
    data_fname : path passed to `get_graphs`.
    size : maximum number of graphs to read (passed to `get_graphs`).
    percentages : iterable of fractions; must be provided (the default None
        is not iterable).
    n_repetitions : number of repetitions per percentage; must be provided.
    train_test_split : second argument of the initial `random_bipartition_iter`
        call.

    Returns
    -------
    (original_repetitions, sample_repetitions) : two lists with one entry per
        percentage, each entry being the list of `n_repetitions` scores.
    """
    # Function-scope imports, done once instead of once per repetition.
    from eden.util import random_bipartition_iter
    from sklearn.linear_model import SGDClassifier

    graphs = get_graphs(data_fname, size=size)

    # global split: the first part feeds the sampler, the second part is used
    # to fit the scoring estimator
    train_global, test_global = random_bipartition_iter(graphs, train_test_split)

    original_repetitions = []
    sample_repetitions = []

    for percentage in percentages:
        originals = []
        samples = []
        for repetition in range(n_repetitions):
            # deterministic seed per (percentage, repetition) pair
            random.seed(int(313379*percentage+repetition))

            # keep one copy of each iterator for the next repetition
            train_global, train_global_ = tee(train_global)
            test_global, test_global_ = tee(test_global)

            # NOTE(review): class_weight='auto' and loss='log' were renamed in
            # later scikit-learn releases -- confirm against the installed version.
            estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True, loss='log', n_jobs=-1)
            local_estimator = fit(test_global_,
                                  iterable_neg=None,
                                  vectorizer=Vectorizer(4),
                                  estimator=estimator, n_jobs=-1, n_iter_search=1)

            # draw the requested share of the first part; one copy is scored
            # as-is, the other is handed to the sampler
            train, train_reminder = random_bipartition_iter(train_global_, percentage)
            train, train_ = tee(train)
            sampled = fit_sample(train_)

            # score the drawn share and its samples with the fitted estimator
            start = time()
            print("=" * 80)
            print('repetition: %d/%d' % (repetition+1, n_repetitions))
            print("training percentage:" + str(percentage))
            perf_orig, perf_samp = fit_and_evaluate(train, sampled, local_estimator)
            print('Time elapsed: %.1f sec' % ((time()-start)))
            originals.append(perf_orig)
            samples.append(perf_samp)

        original_repetitions.append(originals)
        sample_repetitions.append(samples)

    return original_repetitions, sample_repetitions

In [62]:
def plot(dataset, percentages, original_repetitions, sample_repetitions):
    """Draw and save the score comparison between original and sampled graphs.

    For each of the two series a box plot (one box per entry of `percentages`)
    and a line through the per-percentage means are drawn: red for the
    original graphs, blue for the samples. The figure is saved as
    '<dataset>_plot_probability_of_samples.pdf'.

    Parameters
    ----------
    dataset : string used as the figure title and as the output file prefix.
    percentages : sequence of x positions; its first and last elements set
        the x-axis limits.
    original_repetitions, sample_repetitions : one sequence of scores per
        entry of `percentages`.
    """
    box_width = 0.02
    # (legend label, colour, raw scores, mean score per percentage)
    series = [
        ('original', 'r', original_repetitions, np.mean(original_repetitions, axis=1)),
        ('sample', 'b', sample_repetitions, np.mean(sample_repetitions, axis=1)),
    ]

    plt.figure(figsize=(18, 8))
    plt.grid()

    for label, color, repetitions, means in series:
        # the same colour is applied to every element of the box plot
        props = {'color': color}
        plt.boxplot(repetitions, positions=percentages, widths=box_width,
                    capprops=props, medianprops=props, boxprops=props,
                    whiskerprops=props, flierprops=props)
        plt.plot(percentages, means, color=color, marker='o', markeredgewidth=1,
                 markersize=7, markeredgecolor=color, markerfacecolor='w', label=label)

    plt.xlim(percentages[0]-.05, percentages[-1]+.05)
    plt.title(dataset+'\n', fontsize=17)
    plt.legend(loc='upper right', fontsize=16)
    plt.ylabel('Likelihood', fontsize=16)
    plt.xlabel('Dataset size (fraction)', fontsize=16)
    plt.savefig('%s_plot_probability_of_samples.pdf' % dataset)

In [53]:
%%time
#experiment

dataset_names = !cat NCI60/names
for dataset in dataset_names:
    print 'Working with dataset: %s' % dataset 
    dataset_fname = 'NCI60/' + dataset + '_orig_pos.gspan'
    #dataset_fname = 'bursi.pos.gspan'

    configure_logging(logging.getLogger(),verbosity=1, filename='%s_probability_of_samples.log'%dataset)

    percentages=[.05,.2,.4,.6,.8,.95]

    original_repetitions,\
    sample_repetitions = evaluate(dataset_fname,
                                  size=600,
                                  percentages=percentages,
                                  n_repetitions=5,
                                  train_test_split=0.7)
    plot(dataset, percentages, original_repetitions, sample_repetitions)

Working with dataset: IGROV1_t
graph grammar stats:
#interfaces: 64   #cores: 67   #core-interface-pairs: 292
repetition: 1/7
training percentage:0.2
--------------------------------------------------------------------------------
working on original
training set sizes: #: 84
avg score: 0.99557
elapsed: 1.4 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 252
avg score: 0.99696
elapsed: 4.6 sec
Time elapsed: 25.3 sec
graph grammar stats:
#interfaces: 64   #cores: 67   #core-interface-pairs: 292
repetition: 2/7
training percentage:0.2
--------------------------------------------------------------------------------
working on original
training set sizes: #: 84
avg score: 0.99582
elapsed: 1.2 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 252
avg score: 0.99739
elapsed: 5.7 sec
Time elapsed: 30.1 sec
graph grammar stats:
#interfaces: 6

Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 224


avg score: 0.99738
elapsed: 4.3 sec
Time elapsed: 29.0 sec
graph grammar stats:
#interfaces: 84   #cores: 76   #core-interface-pairs: 438
repetition: 1/7
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 168
avg score: 0.99628
elapsed: 2.6 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 504
avg score: 0.99713
elapsed: 686.2 sec
Time elapsed: 4929.6 sec
graph grammar stats:
#interfaces: 84   #cores: 76   #core-interface-pairs: 438
repetition: 2/7
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 168
avg score: 0.99617
elapsed: 2.0 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 504


Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 572


avg score: 0.99706
elapsed: 8.1 sec
Time elapsed: 44.3 sec
graph grammar stats:
#interfaces: 84   #cores: 76   #core-interface-pairs: 438
repetition: 3/7
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 168
avg score: 0.99644
elapsed: 2.3 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 504
avg score: 0.99734
elapsed: 7.7 sec
Time elapsed: 49.5 sec
graph grammar stats:
#interfaces: 84   #cores: 76   #core-interface-pairs: 438
repetition: 4/7
training percentage:0.4
--------------------------------------------------------------------------------
working on original
training set sizes: #: 168
avg score: 0.99600
elapsed: 1.8 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 504
avg score: 0.99707
elapsed: 7.4 sec
Time elapsed: 44.8 sec
graph

Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 572


avg score: 0.99626
elapsed: 21.7 sec
Time elapsed: 78.9 sec
graph grammar stats:
#interfaces: 112   #cores: 100   #core-interface-pairs: 583
repetition: 7/7
training percentage:0.6
--------------------------------------------------------------------------------
working on original
training set sizes: #: 252
avg score: 0.99618
elapsed: 4.3 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 756
avg score: 0.99721
elapsed: 11.4 sec
Time elapsed: 66.2 sec
graph grammar stats:
#interfaces: 126   #cores: 113   #core-interface-pairs: 677
repetition: 1/7
training percentage:0.8
--------------------------------------------------------------------------------
working on original
training set sizes: #: 336
avg score: 0.99611
elapsed: 3.2 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1008
avg score: 0.99730
elapsed: 24.8 sec
Time elapsed: 93.5 s

Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 572
Logged from file graphlearn.py, line 572


avg score: 0.99706
elapsed: 25.7 sec
Time elapsed: 94.0 sec
graph grammar stats:
#interfaces: 126   #cores: 113   #core-interface-pairs: 677
repetition: 4/7
training percentage:0.8
--------------------------------------------------------------------------------
working on original
training set sizes: #: 336
avg score: 0.99536
elapsed: 6.7 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1008
avg score: 0.99622
elapsed: 15.3 sec
Time elapsed: 101.7 sec
graph grammar stats:
#interfaces: 126   #cores: 113   #core-interface-pairs: 677
repetition: 5/7
training percentage:0.8
--------------------------------------------------------------------------------
working on original
training set sizes: #: 336
avg score: 0.99546
elapsed: 3.6 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1008


Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 572


avg score: 0.99662
elapsed: 15.8 sec
Time elapsed: 82.8 sec
graph grammar stats:
#interfaces: 126   #cores: 113   #core-interface-pairs: 677
repetition: 6/7
training percentage:0.8
--------------------------------------------------------------------------------
working on original
training set sizes: #: 336
avg score: 0.99576
elapsed: 6.4 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1008
avg score: 0.99608
elapsed: 24.8 sec
Time elapsed: 118.0 sec
graph grammar stats:
#interfaces: 126   #cores: 113   #core-interface-pairs: 677
repetition: 7/7
training percentage:0.8
--------------------------------------------------------------------------------
working on original
training set sizes: #: 336
avg score: 0.99578
elapsed: 3.7 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1008
avg score: 0.99670
elapsed: 16.3 sec
Time elapsed: 82.3

Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
    os.rename(sfn, dfn)
OSError: [Errno 2] No such file or directory
Logged from file graphlearn.py, line 572


avg score: 0.99723
elapsed: 17.0 sec
Time elapsed: 94.3 sec
graph grammar stats:
#interfaces: 132   #cores: 118   #core-interface-pairs: 731
repetition: 3/7
training percentage:0.95
--------------------------------------------------------------------------------
working on original
training set sizes: #: 399
avg score: 0.99536
elapsed: 5.8 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1197
avg score: 0.99597
elapsed: 22.9 sec
Time elapsed: 119.9 sec
graph grammar stats:
#interfaces: 132   #cores: 118   #core-interface-pairs: 731
repetition: 4/7
training percentage:0.95
--------------------------------------------------------------------------------
working on original
training set sizes: #: 399
avg score: 0.99550
elapsed: 6.4 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1197


Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 77, in emit
    self.doRollover()
    self.doRollover()
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 136, in doRollover
  File "/Users/costa/anaconda/lib/python2.7/logging/handlers.py", line 135, in doRollover
    os.rename(sfn, dfn)
    os.remove(dfn)
OSError: [Errno 2] No such file or directory
OSError: [Errno 2] No such file or directory: '/Users/costa/Desktop/BTSync/Projects/graphlearn/example/IGROV1_t_probability_of_samples.log.10'
Logged from file graphlearn.py, line 572
Logged from file graphlearn.py, line 572


avg score: 0.99711
elapsed: 17.5 sec
Time elapsed: 95.6 sec
graph grammar stats:
#interfaces: 132   #cores: 118   #core-interface-pairs: 731
repetition: 5/7
training percentage:0.95
--------------------------------------------------------------------------------
working on original
training set sizes: #: 399
avg score: 0.99561
elapsed: 3.4 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1197
avg score: 0.99699
elapsed: 22.1 sec
Time elapsed: 100.9 sec
graph grammar stats:
#interfaces: 132   #cores: 118   #core-interface-pairs: 731
repetition: 6/7
training percentage:0.95
--------------------------------------------------------------------------------
working on original
training set sizes: #: 399
avg score: 0.99563
elapsed: 7.2 sec
--------------------------------------------------------------------------------
working on sample
training set sizes: #: 1197
avg score: 0.99647
elapsed: 31.0 sec
Time elapsed: 14

.