## This script fits a simple cross-validated SVM using single fit alpy

## It should be straightforward to go from this script to running experiments on any fingerprint/compound pair

In [None]:
% load_ext autoreload
% autoreload 2
import sys
sys.path.insert(0, "../mlls2015")
sys.path.insert(0, "..")
import training_data
from training_data.datasets import CVBaseChemDataset
import bunch
import numpy as np
from bunch import Bunch
import os
import logging
from experiments.utils import wac_score
from misc.utils import config_log_to_file

In [None]:
# Experiment configuration shared by the cells below.
opts = Bunch(
    # Exponents of 10 bounding the grid of C values (both ends inclusive).
    C_min=-6,
    C_max=3,
    # Dataset options forwarded to CVBaseChemDataset / get_data.
    n_folds=5,
    preprocess="max_abs",
    fold=1,
    compound="beta2",
    representation="MACCS",
    # Truthy -> SVC on a precomputed Jaccard kernel; falsy -> LinearSVC.
    jaccard=0,
    # Seed passed to both the dataset and the classifiers.
    rng=777,
    # The log file is written to <output_dir>/<name>.log.
    name="test_svm",
    output_dir="/Users/kudkudak/code/mlls2015/",
)

In [None]:
# Route log output to <output_dir>/<name>.log using the project helper.
log_path = os.path.join(opts.output_dir, opts.name + ".log")
config_log_to_file(log_path, clear_log_file=True)
logger = logging.getLogger("fit_svm")

# Record the configuration of this run. Bunch is a dict subclass that keeps
# its entries as dict items, so opts.__dict__ is the (empty) instance
# attribute dict; dict(opts) logs the actual option values.
logger.info(dict(opts))
logger.info(opts.name)

In [None]:
# Build the cross-validation dataset for the configured compound and
# fingerprint representation.
data = CVBaseChemDataset(
    compound=opts.compound,
    representation=opts.representation,
    n_folds=opts.n_folds,
    rng=opts.rng,
    preprocess=opts.preprocess,
)

# get_data is unpacked as a (train, validation) pair of (X, y) pairs for the
# selected fold.
train_split, valid_split = data.get_data(fold=opts.fold)
X_train, y_train = train_split
X_valid, y_valid = valid_split

if opts.jaccard:
    # Kernel matrices are only needed for the precomputed-kernel SVC path.
    # NOTE(review): calculate_jaccard_kernel is not imported in the visible
    # cells - confirm which module provides it.
    kernels = calculate_jaccard_kernel(data=data, fold=opts.fold)
    K_train, K_valid = kernels

In [None]:
# Grid search over C: fit one classifier per candidate value and score it on
# the validation split. Results are keyed by str(params).
# NOTE(review): SVC and LinearSVC are not imported in the visible cells -
# presumably sklearn.svm; confirm.
results = {}
C_range = range(opts.C_min, opts.C_max+1)
grid = [{"C": 10**exponent} for exponent in C_range]
for params in grid:
    logger.info("Testing " + str(params))

    # Pick the estimator together with the inputs it is trained/evaluated on.
    if not opts.jaccard:
        m = LinearSVC(loss="hinge", class_weight="balanced", random_state=opts.rng, **params)
        fit_inputs, predict_inputs = X_train, X_valid
    else:
        m = SVC(kernel="precomputed", class_weight="balanced", random_state=opts.rng, **params)
        fit_inputs, predict_inputs = K_train, K_valid

    m.fit(fit_inputs, y_train)
    y_pred = m.predict(predict_inputs)
    results[str(params)] = {
        "y_pred": y_pred,
        "wac": wac_score(y_valid, y_pred),
        "clf": m,
    }

In [None]:
# Collect the validation WAC of every grid point, in grid order, and keep the
# entry belonging to the highest score (np.argmax returns the first maximum).
wac_scores = [results[str(candidate)]["wac"] for candidate in grid]
best_index = np.argmax(wac_scores)
logger.info("Max WAC=" + str(np.max(wac_scores)))
best_results = results[str(grid[best_index])]