# Classification

Install the library 

    pip install git+https://github.com/fabriziocosta/EDeN.git --user

In [1]:
from eden.util import configure_logging
import logging
configure_logging(logging.getLogger(),verbosity=1)

In [2]:
from eden.graph import Vectorizer
vectorizer = Vectorizer(4, triangular_decomposition=True)

In [3]:
def experiment_run(n_iter_search=None, dataset_names=None):
    from time import time
    train_test_split=.7
    results=[]
    for i,dataset in enumerate(dataset_names):
        start=time()
        print
        print '-'*80
        print 'Working with dataset: %s [%d/%d]' % (dataset,i+1,len(dataset_names)) 
        pos_dataset_fname = 'NCI60/' + dataset + '_orig_pos.gspan'
        neg_dataset_fname = 'NCI60/' + dataset + '_orig_neg.gspan'

        from eden.converter.graph.gspan import gspan_to_eden
        iterable_pos = gspan_to_eden(pos_dataset_fname)
        iterable_neg = gspan_to_eden(neg_dataset_fname)

        #split train/test
        from eden.util import random_bipartition_iter
        iterable_pos_train, iterable_pos_test = random_bipartition_iter(iterable_pos, relative_size=train_test_split)
        iterable_neg_train, iterable_neg_test = random_bipartition_iter(iterable_neg, relative_size=train_test_split)

        from eden.util import fit
        estimator = fit(iterable_pos_train, 
                        iterable_neg_train, 
                        vectorizer, 
                        fit_flag=False, 
                        n_jobs=-1, 
                        cv=10, 
                        n_iter_search=n_iter_search, 
                        random_state=1, 
                        block_size=100)

        from eden.util import estimate
        apr, roc = estimate(iterable_pos_test,
                            iterable_neg_test,  
                            estimator, 
                            vectorizer, 
                            block_size=100, 
                            n_jobs=-1)
        delta_time=(time()-start)
        print 'Time elapsed: %.1f sec'%(delta_time)
        results.append((dataset,apr, roc,delta_time))
    return results

### default estimator configuration

In [5]:
dataset_names = !cat NCI60/names
dataset_names = dataset_names[0:2]

In [6]:
%%time
results_deafault = experiment_run(n_iter_search=1, dataset_names=dataset_names)


--------------------------------------------------------------------------------
Working with dataset: HCT_15_t [1/2]
Test set
Instances: 1089 ; Features: 1048577 with an avg of 473 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.72      0.70      0.71       510
          1       0.74      0.76      0.75       579

avg / total       0.73      0.73      0.73      1089

APR: 0.787
ROC: 0.786
Cross-validated estimate
            accuracy: 0.681 +- 0.018
           precision: 0.707 +- 0.015
              recall: 0.688 +- 0.072
                  f1: 0.695 +- 0.032
   average_precision: 0.748 +- 0.033
             roc_auc: 0.732 +- 0.029
Time elapsed: 27.9 sec

--------------------------------------------------------------------------------
Working with dataset: HL_60_TB_t [2/2]
Test set
Instances: 988 ; Features: 1048577 with an avg of 492 features 

### hyper parameters optimization

In [8]:
%%time
results_opt = experiment_run(n_iter_search=10, dataset_names=dataset_names)


--------------------------------------------------------------------------------
Working with dataset: HCT_15_t [1/2]
Test set
Instances: 1089 ; Features: 1048577 with an avg of 473 features per instance
--------------------------------------------------------------------------------
Test Estimate
             precision    recall  f1-score   support

         -1       0.70      0.66      0.68       510
          1       0.72      0.75      0.74       579

avg / total       0.71      0.71      0.71      1089

APR: 0.773
ROC: 0.767
Cross-validated estimate
            accuracy: 0.685 +- 0.018
           precision: 0.715 +- 0.025
              recall: 0.682 +- 0.053
                  f1: 0.696 +- 0.024
   average_precision: 0.748 +- 0.020
             roc_auc: 0.736 +- 0.026
Time elapsed: 249.1 sec

--------------------------------------------------------------------------------
Working with dataset: HL_60_TB_t [2/2]
Test set
Instances: 988 ; Features: 1048577 with an avg of 492 features

In [9]:
for dataset, apr, roc, delta_time in results_deafault:
    print datset

NameError: name 'datset' is not defined

In [None]:
%matplotlib inline
import pylab as plt
d_dat=[roc for dataset, apr, roc, delta_time in results_deafault]
o_dat=[roc for dataset, apr, roc, delta_time in results_opt]
my_xticks = [dataset for dataset, apr, roc, delta_time in results_opt]
x = range(len(results_deafault))

plt.figure(figsize=(12,6))
plt.title("Comparison predictive performance w/o hyper parameter optimization")
plt.xlabel("Dataset")
plt.ylabel("ROC")
plt.grid()
plt.plot(x,d_dat,label='default')
plt.xticks(x, my_xticks,rotation=45)
plt.plot(x,o_dat,label='optimised')
plt.legend(loc='upper center', shadow=True)
plt.show()

.