## This script fits simple cross validated SVM using single fit and alpy

## It should be straightforward to go from this script to running experiments on any fingerprint/compound pair

In [None]:
#TODO: - sanity checks for results:
    # * CV not picked on the borders in more than 5% of cases
    # * performance is lower than 98% on labeled in 50% of time

In [21]:
% load_ext autoreload
% autoreload 2
import matplotlib.pylab as plt
% matplotlib inline
import sys
import gzip
import seaborn
from os import path
sys.path.insert(0, "../mlls2015")
sys.path.insert(0, "..")
import training_data
from models.cv import AdaptiveGridSearchCV
from training_data.datasets import CVBaseChemDataset
import bunch
import numpy as np
from bunch import Bunch
import os
import logging
import json
from experiments.utils import wac_score
from misc.config import RESULTS_DIR
from misc.utils import config_log_to_file

from misc.config import BASE_DIR
import cPickle
import gzip

import copy
from models.balanced_models import *
import cPickle
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from misc.utils import get_run_properties

import alpy
import alpy_addons
from alpy_addons.active import ActiveLearner
from alpy.datasets.generate import get_2d_gaussians
from alpy_addons.monitors import *
from alpy.oracle import SimulatedOracle
from six import iteritems

from alpy_addons.strategy import UncertaintySampling, PassiveStrategy
from alpy.utils import mask_unknowns

from sklearn.metrics import auc

from training_data.datasets import calculate_jaccard_kernel

from sklearn.grid_search import GridSearchCV





opts_uncert = Bunch({"C_min": -6, 
              "C_max": 5,
              "internal_cv": 4,
              "max_iter": 10000000,
              "n_folds": 5, 
              "preprocess": "clip01", #"max_abs", 
              "fold": 3, 
              "d": 1,
              "output_dir": "test_fit_svm_al",
              "warm_start": 20, # TODO: add cluster-dependent warm_start
              "strategy_kwargs": None,
              "strategy": "UncertaintySampling",
              "compound": "beta2", 
              "representation": "MACCS", 
              "jaccard": 1, 
              "rng": 777, 
              "name": "uncertainty",
              "batch_size": 50})



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [51]:
config_log_to_file(fname=opts.name + ".log", clear_log_file=True)
logger = logging.getLogger("fit_svm_al")

logger.info(opts.__dict__)
logger.info(opts.name)
logger.info("Loading data..")

data = CVBaseChemDataset(compound=opts.compound, representation=opts.representation, n_folds=opts.n_folds,
                         rng=opts.rng,
                         preprocess=opts.preprocess)
(X_train, y_train), (X_valid, y_valid) = data.get_data(fold=opts.fold)
if opts.jaccard:
    X_train, X_valid = calculate_jaccard_kernel(data=data, fold=opts.fold)

y_train_masked = mask_unknowns(y_train,
                               np.random.RandomState(opts.rng).choice(X_train.shape[0],
                                                size=X_train.shape[0] - opts.warm_start, replace=False))

kernel = "precomputed" if opts.jaccard else "linear"

estimator = AdaptiveGridSearchCV(d=opts.d,
                                 estimator=SVC(random_state=opts.rng, kernel=kernel, max_iter=opts.max_iter),
                                 param_grid=
                                 {
                                     "C": [10 ** c for c in range(opts.C_min, opts.C_max + 1)]},
                                 cv=opts.internal_cv,
                                 scoring=wac_scoring,
                                 error_score=0.)

strategy = UncertaintySampling()

al = ActiveLearner(strategy=strategy,
                   random_state=opts.rng,
                   batch_size=opts.batch_size,
                   oracle=SimulatedOracle(sample_budget=np.inf),
                   estimator=estimator)

# TODO: add cluster monitors
monitors = []

monitors.append(ExtendedMetricMonitor(name="wac_score",
                                      short_name="wac_score",
                                      function=wac_score,
                                      ids="all",
                                      frequency=1))

al.fit(X_train, y_train_masked, monitors=monitors)

2016-01-10 10:05:52,612 - fit_svm_al - INFO - {}
2016-01-10 10:05:52,613 - fit_svm_al - INFO - uncertainty
2016-01-10 10:05:52,614 - fit_svm_al - INFO - Loading data..
