## This script fits a simple cross-validated SVM using a single fit and using alpy

## It should be straightforward to go from this script to running experiments on any fingerprint/compound pair

In [21]:
% load_ext autoreload
% autoreload 2
import matplotlib.pylab as plt
% matplotlib inline
import sys
import gzip
import seaborn
from os import path
sys.path.insert(0, "../mlls2015")
sys.path.insert(0, "..")
import training_data
from models.cv import AdaptiveGridSearchCV
from training_data.datasets import CVBaseChemDataset
import bunch
import numpy as np
from bunch import Bunch
import os
import logging
import json
from experiments.utils import wac_score
from misc.config import RESULTS_DIR
from misc.utils import config_log_to_file

from misc.config import BASE_DIR
import cPickle
import gzip

import copy
from models.balanced_models import *
import cPickle
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from misc.utils import get_run_properties

import alpy
import alpy_addons
from alpy_addons.active import ActiveLearner
from alpy.datasets.generate import get_2d_gaussians
from alpy_addons.monitors import *
from alpy.oracle import SimulatedOracle
from six import iteritems

from alpy_addons.strategy import UncertaintySampling, PassiveStrategy
from alpy.utils import mask_unknowns

from sklearn.metrics import auc

from training_data.datasets import calculate_jaccard_kernel

from sklearn.grid_search import GridSearchCV





def _make_job_opts(strategy, name):
    """Build the option Bunch for one run of the fit_svm_al script.

    Every job in this notebook shares the same settings; only the query
    strategy and the run name (used for the result file names) differ.
    """
    return Bunch({
        "C_min": -6,
        "C_max": 5,
        "internal_cv": 4,
        "max_iter": 10000000,
        "n_folds": 5,
        "preprocess": "clip01",  # "max_abs",
        "fold": 3,
        "d": 1,
        "output_dir": "test_fit_svm_al",
        "warm_start": 20,  # TODO: add cluster-dependent warm_start
        "strategy_kwargs": None,
        "strategy": strategy,
        "compound": "beta2",
        "representation": "MACCS",
        "jaccard": 1,
        "rng": 777,
        "name": name,
        "batch_size": 50,
    })


# Two runs per strategy: the "_2" copies differ only by name, so their
# results can be compared against the first run.
opts_uncert = _make_job_opts("UncertaintySampling", "uncertainty")
opts_uncert_2 = Bunch(opts_uncert)
opts_uncert_2['name'] = 'uncertainty_2'

opts_passive = _make_job_opts("PassiveStrategy", "passive")
opts_passive_2 = Bunch(opts_passive)
opts_passive_2['name'] = 'passive_2'

jobs = [opts_uncert, opts_uncert_2, opts_passive, opts_passive_2]

# Run jobs
# Each job is executed as a separate process from the repository root.
for job in jobs:
    # Build the command BEFORE printing it: the original printed `cmd` first,
    # which raised NameError on the first iteration (or showed a stale command
    # left over from a previous cell execution).
    # Options are passed as "--key value"; falsy values (e.g. the None
    # strategy_kwargs) are omitted from the command line.
    cmd = "./scripts/fit_svm_al.py " + " ".join(
        "--{} {}".format(k, v) for k, v in job.iteritems() if v)
    print("Running " + cmd)
    # os.system returns the exit status of the shell command; print it so a
    # failed run is visible in the cell output.
    print(os.system("cd ..;" + cmd))

# Load results and compare/plot
def _load_result_json(name):
    """Return the parsed ``<name>.json`` result file of the test_fit_svm_al runs.

    The file is opened in a ``with`` block so the handle is closed after reading.
    """
    result_path = path.join(BASE_DIR, "results/test_fit_svm_al/{}.json".format(name))
    with open(result_path) as f:
        return json.load(f)


p1 = _load_result_json("passive")
# Must be the second passive run: loading "passive" twice would make the
# replicability check on the passive runs compare a file with itself.
p2 = _load_result_json("passive_2")
u1 = _load_result_json("uncertainty")
u2 = _load_result_json("uncertainty_2")

# Scores are replicable given rng
# Compare every non-timing score of the first run against its repeated run,
# uncertainty runs first, then passive runs.
for first_run, second_run in ((u1, u2), (p1, p2)):
    for k in first_run['scores']:
        # Entries with "time" in their key are skipped.
        if "time" in k:
            continue
        assert first_run['scores'][k] == second_run['scores'][k], k + " should be replicable"

# NOTE(review): wac_scoring is not used directly below; presumably it must be
# importable for the pickled monitors to load - confirm before removing.
from scripts.fit_svm_al import wac_scoring


def _load_monitors(name):
    """Return the unpickled ``<name>.pkl.gz`` monitors of the test_fit_svm_al runs.

    The gzip handle is opened in a ``with`` block so it is closed after
    loading instead of being leaked.
    """
    monitors_path = path.join(BASE_DIR, "results/test_fit_svm_al/{}.pkl.gz".format(name))
    with gzip.open(monitors_path) as f:
        return cPickle.load(f)


p1_mon = _load_monitors("passive")
u1_mon = _load_monitors("uncertainty")

# Converge to same score
# For every monitored score series (timing entries excluded), the last
# recorded value of the passive run must equal that of the uncertainty run.
for k in u1_mon:
    is_score_series = "score" in k and "time" not in k
    if not is_score_series:
        continue
    passive_last = p1_mon[k][-1]
    uncertainty_last = u1_mon[k][-1]
    assert passive_last == uncertainty_last, "Last score for " + k + " should be the same"

# The uncertainty run must have a strictly higher 'wac_score_valid_auc'
# than the passive run.
uncertainty_auc = u1['scores']['wac_score_valid_auc']
passive_auc = p1['scores']['wac_score_valid_auc']
assert uncertainty_auc > passive_auc

# Check that the last WAC score of the uncertainty run matches (within 0.01)
# the score of a single grid-searched SVM fitted on the training fold
opts = jobs[0]

data = CVBaseChemDataset(
    compound=opts.compound,
    representation=opts.representation,
    n_folds=opts.n_folds,
    rng=opts.rng,
    preprocess=opts.preprocess,
)
train_split, valid_split = data.get_data(fold=opts.fold)
X_train, y_train = train_split
X_valid, y_valid = valid_split

# NOTE(review): K_train / K_valid are only bound when opts.jaccard is truthy;
# the fit below needs them, so this check only works for jaccard jobs.
if opts.jaccard:
    K_train, K_valid = calculate_jaccard_kernel(data=data, fold=opts.fold)

# Grid over C = 10**C_min ... 10**C_max (inclusive).
C_range = range(opts.C_min, opts.C_max+1)
param_grid = {"C": [10**i for i in C_range]}

# NOTE(review): opts.internal_cv is not passed here, so GridSearchCV runs
# with its default cv and scoring - confirm this is intended.
svc = SVC(kernel="precomputed", class_weight="balanced", random_state=opts.rng)
m = GridSearchCV(param_grid=param_grid, estimator=svc)
m.fit(K_train, y_train)

single_fit_wac = wac_score(y_valid, m.predict(K_valid))
assert abs(u1_mon['wac_score_valid'][-1] - single_fit_wac) < 0.01

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# TODO: write test that will:
# - check repeatability of whole AL experiment
# - check that uncertainty is better than passive
# - check that final accuracy is SAME as fitting just single estimator
# - check that we have a classifier that doesn't overfit!
# - sanity checks for results:
    # * CV not picked on the borders in more than 5% of cases
    # * performance is lower than 98% on labeled in 50% of time