In [1]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [3]:
import sys, os
# Put the parent directory on the import path so the project packages imported
# below (get_data, models, misc, experiments, ...) can be found.
sys.path.append('..')

import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# NOTE(review): `sklearn.grid_search` only exists in old scikit-learn releases;
# newer ones expose GridSearchCV from `sklearn.model_selection`.
from sklearn.grid_search import GridSearchCV
from matplotlib import pyplot as plt

from get_data import get_data, get_splitted_data
from models.active_model import ActiveLearningExperiment
# NOTE(review): star imports hide where names used later in the notebook
# (e.g. get_best, plot_monitors, calc_auc) come from.
from models.strategy import *
from models.utils import ObstructedY

# Project configuration mapping; only the data directory entry is read here.
from misc.config import c
data_dir = c["DATA_DIR"]

import kaggle_ninja

from experiments.experiment_runner import run_experiment, run_experiment_grid
import experiments
from experiments import experiment_runner, fit_active_learning, fit_grid
from experiments.utils import *
# Explicit import of one name from a module that is also star-imported above.
from models.strategy import cosine_distance_normalized

from kaggle_ninja import *

In [None]:
# Settings shared by the SVM experiments launched from this cell.
protein, fingerprint, seed = '5ht6', "ExtFP", 666

# "fit_grid" run over "fit_active_learning" with the "random_query" strategy.
# The result object is read later through `random.experiments`.
# NOTE(review): `random` is also the name of a stdlib module; the name is kept
# because another cell refers to it.
random = run_experiment(
    "fit_grid",
    experiment_detailed_name="fit_svm_passive_{0}_{1}".format(protein, fingerprint),
    base_experiment="fit_active_learning",
    base_experiment_kwargs=dict(
        strategy="random_query",
        loader_function="get_splitted_data",
        loader_args=dict(n_folds=2, seed=seed),
        base_model="LinearSVC",
        base_model_kwargs=dict(loss='hinge'),
        # Seven C values, log-spaced between 1e-3 and 1e4.
        param_grid=dict(C=list(np.logspace(-3, 4, 7))),
        batch_size=20),
    seed=seed,
    n_jobs=8,
    recalculate_experiments=False)


# "fit_grid" run over "fit_active_learning" with the uncertainty-sampling
# strategy. The strategy name was spelled "uncertanity_sampling" here while
# every other cell in this notebook passes "uncertainty_sampling"; it is now
# spelled the same way everywhere.
# NOTE(review): the variable name `uncertian` is kept as-is because a later
# cell reads `uncertian.experiments`.
# `seed` is the value defined at the top of this cell, so the experiment seed
# and the loader seed cannot drift apart.
uncertian = run_experiment("fit_grid",
                   recalculate_experiments=False,
                   n_jobs = 8,
                   experiment_detailed_name="fit_svm_uncertain_%s_%s" % (protein, fingerprint),
                   base_experiment="fit_active_learning",
                   seed=seed,
                   base_experiment_kwargs={"strategy": "uncertainty_sampling",
                                           "loader_function": "get_splitted_data",
                                           "batch_size": 20,
                                           "base_model": "LinearSVC",
                                           "loader_args": {"n_folds": 2,
                                                           "seed": seed},
                                           # Ten C values, log-spaced between 1e-5 and 1e5.
                                           "param_grid": {'C': list(np.logspace(-5,5,10))},
                                           "base_model_kwargs": { "loss": 'hinge'}})

In [None]:
# Pick one experiment out of each grid run, scored on "mean_mcc_valid".
# NOTE(review): the exact selection rule lives in get_best (not shown in this
# notebook) — confirm it maximises the metric.
svm_selection_metric = "mean_mcc_valid"
best_random_exp = get_best(random.experiments, svm_selection_metric)
best_uncertain_exp = get_best(uncertian.experiments, svm_selection_metric)

In [None]:
# Plot the monitors of the selected random-query and uncertainty-sampling
# experiments together, with folds="mean".
svm_runs_to_plot = [best_random_exp, best_uncertain_exp]
plot_monitors(svm_runs_to_plot, folds="mean")

In [None]:
# Run calc_auc on the same two selected experiments, with folds="mean".
svm_runs_to_score = [best_random_exp, best_uncertain_exp]
calc_auc(svm_runs_to_score, folds="mean")

In [None]:
protein = '5ht6'
fingerprints = ["ExtFP"]
seed = 666

# `fingerprints` is a list, so formatting it with %s used to put the list repr
# into the experiment name ("fit_svm_greedy_5ht6_['ExtFP']"). Join the entries
# instead, so the name follows the "<prefix>_<protein>_<fingerprint>" pattern
# used by the other cells.
# NOTE(review): if earlier results were stored under the old name, they will
# not be matched by the new one — confirm whether anything needs renaming.
fingerprint_tag = "_".join(fingerprints)

# "fit_grid" run over "fit_active_learning" with the "quasi_greedy_batch"
# strategy. The grid covers the model's C, its loss, and the strategy
# parameter c (nine values from 0.1 to 0.9).
# `seed` is used for both the experiment and the loader so the two stay in sync.
grid_result_greedy = run_experiment("fit_grid",
                                        recalculate_experiments=False,
                                        n_jobs = 2,
                                        experiment_detailed_name="fit_svm_greedy_%s_%s" % (protein, fingerprint_tag),
                                        base_experiment="fit_active_learning",
                                        seed=seed,
                                        grid_params = {"base_model_kwargs:C": list(np.logspace(-5,5,10)),
                                                       "base_model_kwargs:loss": ['hinge'],
                                                       "strategy_kwargs:c": list(np.linspace(0.1, 0.9, 9))},
                                        base_experiment_kwargs={"strategy": "quasi_greedy_batch",
                                                           "loader_function": "get_splitted_data",
                                                           "batch_size": 20,
                                                           "base_model": "LinearSVC",
                                                           "loader_args": {"n_folds": 2,
                                                                           "seed": seed}})

In [None]:
# Pick one experiment from the quasi-greedy grid, scored on "mean_mcc_valid".
greedy_grid_runs = grid_result_greedy.experiments
best_greedy_exp = get_best(greedy_grid_runs, "mean_mcc_valid")

In [None]:
# Compare the three query strategies side by side, with folds="mean".
# The passive (random-query) baseline is stored in `best_random_exp`; the
# previously used name `best_passive_exp` is never assigned in this notebook
# and raised a NameError on a fresh kernel.
plot_monitors([best_random_exp, best_uncertain_exp, best_greedy_exp], folds="mean")

# Balanced Models

In [4]:
# Settings shared by the balanced-model experiments below.
protein, fingerprint, seed = "5ht6", "ExtFP", 666

In [5]:
# "fit_grid" run over "fit_active_learning" with the TWELM base model and the
# uncertainty-sampling strategy.
# NOTE(review): the output recorded under this cell ends with
# `AssertionError: Please pass preprocess_fncs`, and the logged sub-experiment
# config shows preprocess_fncs, protein and fingerprint all left at 0. They
# probably have to be supplied in base_experiment_kwargs — confirm the
# expected values against fit_active_learning.
twelm_uncertain = run_experiment(
    "fit_grid",
    experiment_detailed_name="fit_TWELM_uncertain_{0}_{1}".format(protein, fingerprint),
    base_experiment="fit_active_learning",
    base_experiment_kwargs=dict(
        strategy="uncertainty_sampling",
        loader_function="get_splitted_data",
        loader_args=dict(n_folds=2, seed=seed),
        base_model="TWELM",
        # Seven C values, log-spaced between 1e-3 and 1e4.
        param_grid=dict(C=list(np.logspace(-3, 4, 7))),
        batch_size=20),
    seed=666,
    n_jobs=8,
    recalculate_experiments=False)

fit_grid: 2015-06-07 20:55:31,967 - Added new config entry: "base_experiment_kwargs.base_model"
fit_grid: 2015-06-07 20:55:31,991 - Added new config entry: "base_experiment_kwargs.batch_size"
fit_grid: 2015-06-07 20:55:31,992 - Added new config entry: "base_experiment_kwargs.loader_args"
fit_grid: 2015-06-07 20:55:31,992 - Added new config entry: "base_experiment_kwargs.loader_args.n_folds"
fit_grid: 2015-06-07 20:55:31,992 - Added new config entry: "base_experiment_kwargs.loader_args.seed"
fit_grid: 2015-06-07 20:55:31,993 - Added new config entry: "base_experiment_kwargs.loader_function"
fit_grid: 2015-06-07 20:55:31,993 - Added new config entry: "base_experiment_kwargs.param_grid"
fit_grid: 2015-06-07 20:55:31,993 - Added new config entry: "base_experiment_kwargs.param_grid.C"
fit_grid: 2015-06-07 20:55:31,994 - Added new config entry: "base_experiment_kwargs.strategy"
fit_grid: 2015-06-07 20:55:31,994 - Running command 'main'
fit_grid: 2015-06-07 20:55:31,994 - Started
fit_TWELM_un

Loading  {'grid_params': {}, 'base_experiment': 'fit_active_learning', 'experiment_detailed_name': 'fit_TWELM_uncertain_5ht6_ExtFP', 'base_experiment_kwargs': {u'loader_function': 'get_splitted_data', u'batch_size': 20, u'strategy': 'uncertainty_sampling', u'loader_args': {u'n_folds': 2, u'seed': 666}, u'base_model': 'TWELM', u'param_grid': {u'C': [0.001, 0.014677992676220698, 0.21544346900318845, 3.1622776601683795, 46.415888336127821, 681.29206905796218, 10000.0]}}, 'seed': 666, 'timeout': -1, 'single_fit_timeout': -1}


fit_TWELM_uncertain_5ht6_ExtFP: 2015-06-07 20:55:32,000 - Fitting fit_active_learning for 1 parameters combinations
fit_TWELM_uncertain_5ht6_ExtFP: 2015-06-07 20:55:32,283 - Traceback (most recent call last):
  File "../experiments/fit_grid.py", line 78, in main
    result = run()
  File "/usr/local/lib/python2.7/dist-packages/sacred/config/captured_function.py", line 45, in captured_function
    result = wrapped(*args, **kwargs)
  File "../experiments/fit_grid.py", line 52, in run
    n_jobs=n_jobs, grid_params=grid_params, **base_experiment_kwargs)
  File "../experiments/experiment_runner.py", line 168, in run_experiment_grid
    t.get(10) # First to fail will throw TiemoutErro()
  File "/usr/lib/python2.7/multiprocessing/pool.py", line 558, in get
    raise self._value
AssertionError: Please pass preprocess_fncs

fit_active_learning: 2015-06-07 20:55:32,186 - Added new config entry: "loader_args.n_folds"
fit_active_learning: 2015-06-07 20:55:32,206 - Added new config entry: "loader_

{u'loader_function': 'get_splitted_data', 'name': 'fit_active_learning', 'experiment_detailed_name': 'fit_TWELM_uncertain_5ht6_ExtFP_subfit', u'batch_size': 20, u'strategy': 'uncertainty_sampling', u'loader_args': {u'n_folds': 2, u'seed': 666}, 'seed': 666, 'timeout': -1, u'base_model': 'TWELM', u'param_grid': {u'C': [0.001, 0.014677992676220698, 0.21544346900318845, 3.1622776601683795, 46.415888336127821, 681.29206905796218, 10000.0]}, 'force_reload': False}
Loading  {'loader_function': 'get_splitted_data', 'preprocess_fncs': 0, 'experiment_detailed_name': 'fit_TWELM_uncertain_5ht6_ExtFP_subfit', 'batch_size': 20, 'loader_args': {u'n_folds': 2, u'seed': 666}, 'strategy_projection_h': 0, 'seed': 666, 'fingerprint': 0, 'protein': 0, 'param_grid': {u'C': [0.001, 0.014677992676220698, 0.21544346900318845, 3.1622776601683795, 46.415888336127821, 681.29206905796218, 10000.0]}, 'base_model_kwargs': {}, 'warm_start_percentage': 0, 'strategy': 'uncertainty_sampling', 'base_model': 'TWELM', 'ti

AssertionError: Please pass preprocess_fncs

In [6]:
# "fit_grid" run over "fit_active_learning" with the EEM base model and the
# uncertainty-sampling strategy.
# NOTE(review): the output recorded under this cell is
# `AssertionError: Please pass preprocess_fncs`; that argument is not among
# the kwargs passed here — confirm what fit_active_learning expects.
eem_uncertainty = run_experiment(
    "fit_grid",
    experiment_detailed_name="fit_EEM_uncertainty_{0}_{1}".format(protein, fingerprint),
    base_experiment="fit_active_learning",
    base_experiment_kwargs=dict(
        strategy="uncertainty_sampling",
        loader_function="get_splitted_data",
        loader_args=dict(n_folds=2, seed=seed),
        base_model="EEM",
        # Seven C values, log-spaced between 1e-3 and 1e4.
        param_grid=dict(C=list(np.logspace(-3, 4, 7))),
        batch_size=20),
    seed=666,
    n_jobs=8,
    recalculate_experiments=False)


AssertionError: Please pass preprocess_fncs

In [None]:
# "fit_grid" run over "fit_active_learning" with the SVMTAN base model and the
# uncertainty-sampling strategy.
# NOTE(review): this cell has no recorded output. The TWELM and EEM cells of
# this notebook pass the same set of keys and recorded
# `AssertionError: Please pass preprocess_fncs` — this call presumably needs
# that argument too; confirm.
svmtan_uncertainty = run_experiment(
    "fit_grid",
    experiment_detailed_name="fit_SVMTAN_uncertainty_{0}_{1}".format(protein, fingerprint),
    base_experiment="fit_active_learning",
    base_experiment_kwargs=dict(
        strategy="uncertainty_sampling",
        loader_function="get_splitted_data",
        loader_args=dict(n_folds=2, seed=seed),
        base_model="SVMTAN",
        # Seven C values, log-spaced between 1e-3 and 1e4.
        param_grid=dict(C=list(np.logspace(-3, 4, 7))),
        batch_size=20),
    seed=666,
    n_jobs=8,
    recalculate_experiments=False)

In [None]:
# "fit_grid" run over "fit_active_learning" with the RandomNB base model and
# the uncertainty-sampling strategy.
# NOTE(review): this cell has no recorded output. The TWELM and EEM cells of
# this notebook pass the same set of keys and recorded
# `AssertionError: Please pass preprocess_fncs` — this call presumably needs
# that argument too; confirm.
nb_uncertainty = run_experiment(
    "fit_grid",
    experiment_detailed_name="fit_NB_uncertainty_{0}_{1}".format(protein, fingerprint),
    base_experiment="fit_active_learning",
    base_experiment_kwargs=dict(
        strategy="uncertainty_sampling",
        loader_function="get_splitted_data",
        loader_args=dict(n_folds=2, seed=seed),
        base_model="RandomNB",
        # Five evenly spaced h values from 100 to 500. np.linspace yields
        # floats; NOTE(review): confirm RandomNB accepts a float `h`.
        param_grid=dict(h=list(np.linspace(100, 500, 5))),
        batch_size=20),
    seed=666,
    n_jobs=8,
    recalculate_experiments=False)

In [None]:
# Pick one experiment per balanced base model, scored on "auc".
# NOTE(review): the exact selection rule lives in get_best (not shown in this
# notebook) — confirm it maximises the metric.
balanced_selection_metric = "auc"
best_twelm_exp = get_best(twelm_uncertain.experiments, balanced_selection_metric)
best_eem_exp = get_best(eem_uncertainty.experiments, balanced_selection_metric)
best_svmtan_exp = get_best(svmtan_uncertainty.experiments, balanced_selection_metric)
best_nb_exp = get_best(nb_uncertainty.experiments, balanced_selection_metric)

In [None]:
# Plot the monitors of the four selected balanced-model experiments together,
# with folds="mean".
balanced_runs_to_plot = [best_twelm_exp, best_eem_exp, best_svmtan_exp, best_nb_exp]
plot_monitors(balanced_runs_to_plot, folds="mean")

In [None]:
# Re-select the TWELM experiment scored on "auc", so the inspection cells
# below can be run on their own.
twelm_grid_runs = twelm_uncertain.experiments
best_twelm_exp = get_best(twelm_grid_runs, "auc")

In [None]:
# Plot the selected TWELM experiment with keys="metrics" and folds="all".
plot_monitors(best_twelm_exp, keys="metrics", folds="all")

In [None]:
# Plot the selected TWELM experiment with keys="times" and folds="mean".
plot_monitors(best_twelm_exp, keys="times", folds="mean")

In [None]:
# List the keys of the first entry in the selected TWELM experiment's monitors.
first_twelm_monitor = best_twelm_exp.monitors[0]
first_twelm_monitor.keys()

In [None]:
# Run calc_auc on the selected TWELM experiment, with folds="all".
calc_auc(best_twelm_exp, folds="all")