# Kernel methods for biological sequence classification

MVA 2019 - Kernel methods for machine learning

*Éloïse Berthier, Guillaume Dalle, Clément Mantoux*

In [1]:
import os
import cProfile, pstats
import tqdm
import itertools

import numpy as np
import scipy.sparse as sp
import pandas as pd

import cvxpy as cp
import cvxopt
from qpsolvers import solve_qp
import osqp

import ray
ray.init()

import matplotlib.pyplot as plt

from vector_kernels import *

Academic license - for non-commercial use only


2019-02-13 13:30:27,804	INFO node.py:278 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-02-13_13-30-27_1511/logs.
2019-02-13 13:30:27,911	INFO services.py:396 -- Waiting for redis server at 127.0.0.1:53950 to respond...
2019-02-13 13:30:28,041	INFO services.py:396 -- Waiting for redis server at 127.0.0.1:17694 to respond...
2019-02-13 13:30:28,049	INFO services.py:798 -- Starting Redis shard with 10.0 GB max memory.
2019-02-13 13:30:28,076	INFO services.py:1360 -- Starting the Plasma object store with 6.871947672999999 GB memory using /tmp.



View the web UI at http://localhost:8889/notebooks/ray_ui.ipynb?token=d8e8c17128e92b20e9b7287ab7b7f3d404e5e2e01acbafe5



Reading data

In [2]:
def read_data_mat100(dataset="tr0", folder="kernel-methods-for-machine-learning-2018-2019"):
    """Load the ``mat100`` features (and, for training splits, the labels).

    Parameters
    ----------
    dataset : str
        Split identifier used to build the file names, e.g. ``"tr0"`` or
        ``"te2"``. A name containing ``"te"`` is handled as a test split;
        otherwise a name containing ``"tr"`` is handled as a training split.
    folder : str
        Directory that contains ``X<dataset>_mat100.csv`` and
        ``Y<dataset>.csv``.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        ``X`` alone for a test split, ``(X, Y)`` for a training split, where
        ``Y`` is the first label column rescaled as ``2 * y - 1`` (so 0/1
        labels become -1/+1).

    Raises
    ------
    ValueError
        If ``dataset`` contains neither ``"te"`` nor ``"tr"``.
    """
    is_test = "te" in dataset
    # Fail loudly instead of silently returning None for an unknown split.
    if not is_test and "tr" not in dataset:
        raise ValueError(
            "dataset must contain 'tr' or 'te', got {!r}".format(dataset)
        )

    features_file = "X" + dataset + "_mat100.csv"
    X = pd.read_csv(
        os.path.join(folder, features_file),
        sep=" ",
        header=None
    )
    if is_test:
        return np.array(X)

    labels_file = "Y" + dataset + ".csv"
    Y = pd.read_csv(
        os.path.join(folder, labels_file),
        sep=",",
        index_col=0,
    )
    return np.array(X), 2 * np.array(Y.iloc[:, 0]) - 1

In [3]:
def read_data_spectr(dataset="tr0", length = 3,
                     folder="kernel-methods-for-machine-learning-2018-2019/"):
    """Load the ``spectr<length>`` features (and, for training splits, the labels).

    Parameters
    ----------
    dataset : str
        Split identifier used to build the file names, e.g. ``"tr0"`` or
        ``"te2"``. A name containing ``"te"`` is handled as a test split;
        otherwise a name containing ``"tr"`` is handled as a training split.
    length : int
        Value inserted in the feature file name
        (``X<dataset>_spectr<length>.csv``).
    folder : str
        Directory that contains the feature file and ``Y<dataset>.csv``.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        ``X`` alone for a test split, ``(X, Y)`` for a training split, where
        ``Y`` is the first label column rescaled as ``2 * y - 1`` (so 0/1
        labels become -1/+1).

    Raises
    ------
    ValueError
        If ``dataset`` contains neither ``"te"`` nor ``"tr"``.
    """
    is_test = "te" in dataset
    # Fail loudly instead of silently returning None for an unknown split.
    if not is_test and "tr" not in dataset:
        raise ValueError(
            "dataset must contain 'tr' or 'te', got {!r}".format(dataset)
        )

    features_file = "X" + dataset + '_spectr'+str(length)+'.csv'
    X = pd.read_csv(
        os.path.join(folder, features_file),
        sep=" ",
        header=None
    )
    if is_test:
        return np.array(X)

    labels_file = "Y" + dataset + ".csv"
    Y = pd.read_csv(
        os.path.join(folder, labels_file),
        sep=",",
        index_col=0,
    )
    return np.array(X), 2 * np.array(Y.iloc[:, 0]) - 1

In [4]:
# One [Xtr, Ytr, Xte] triple per dataset index, built from the spectrum
# features with length=4. To use the mat100 features instead, swap in
# read_data_mat100("tr" + str(k)) / read_data_mat100("te" + str(k)).
dataset = []
for k in range(3):
    Xtr, Ytr = read_data_spectr("tr" + str(k), length=4)
    Xte = read_data_spectr("te" + str(k), length=4)
    dataset += [[Xtr, Ytr, Xte]]

## 1. SVM backend & cross-validation

Quadratic Program optimization for kernel SVM with ridge penalty

In [5]:
def compute_predictor(Xtr, Ytr, kernel, lambd, method="qpsolvers"):
    """Fit a kernel SVM on standardized features and return its decision rule.

    The features are centered and scaled column-wise, the Gram matrix is
    computed with ``kernel`` and the box-constrained quadratic program

        minimize    1/2 * alpha' K alpha - Ytr' alpha
        subject to  0 <= Ytr_i * alpha_i <= 1 / (2 * lambd * n)

    is solved for ``alpha`` (the cvxpy branch minimizes twice this objective,
    which has the same minimizer).

    Parameters
    ----------
    Xtr : numpy.ndarray
        Training features, one row per sample.
    Ytr : numpy.ndarray
        Training labels, one entry per row of ``Xtr``.
        NOTE(review): presumably in {-1, +1}, as produced by the readers
        above -- confirm for other callers.
    kernel : callable
        ``kernel(A, B)`` is called on two feature matrices; with
        ``A = B = Xtr`` (standardized) it must return an ``n x n`` matrix.
    lambd : float
        Regularization strength; it sets the upper bound of the box
        constraint.
    method : {"qpsolvers", "cvxpy"}
        Backend used to solve the quadratic program.

    Returns
    -------
    callable
        ``f(x_new)`` standardizes ``x_new`` with the training statistics and
        returns ``np.sign`` of the decision values.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported backends.
    RuntimeError
        If the solver does not return a solution.
    """
    if method not in ("cvxpy", "qpsolvers"):
        raise ValueError(
            "method must be 'cvxpy' or 'qpsolvers', got {!r}".format(method)
        )

    m = Xtr.mean(axis=0)
    s = Xtr.std(axis=0)
    # Constant columns have zero spread; dividing by it would fill the
    # standardized data (and hence the Gram matrix) with NaN.
    s = np.where(s > 0, s, 1.0)
    Xc = (Xtr - m)/s

    n = len(Xc)

    # Small diagonal jitter added to the Gram matrix before solving.
    K = kernel(Xc, Xc) + 1e-9*np.eye(n)
    upper = np.ones(n) / (2 * lambd * n)

    if method == "cvxpy":

        alpha = cp.Variable(n)

        constraints = [
            cp.multiply(Ytr, alpha) >= np.zeros(n),
            cp.multiply(Ytr, alpha) <= upper
        ]

        objective = cp.Minimize(
            - 2 * (Ytr @ alpha)
            + cp.quad_form(alpha, K)
        )

        prob = cp.Problem(objective, constraints)
        prob.solve(solver=cp.OSQP, verbose=False)
        alpha_opt = alpha.value

    else:

        q = - Ytr.astype(float)
        # Sparse G: rows [-diag(Y); diag(Y)] encode 0 <= Y_i * alpha_i <= upper_i
        G = sp.vstack([
            -sp.diags(Ytr),
            sp.diags(Ytr)
        ]).tocsc().astype(float)
        h = np.hstack([
            np.zeros(n),
            upper
        ]).astype(float)

        alpha_opt = solve_qp(P=K, q=q, G=G, h=h, solver="cvxopt")

    # Surface a failed solve here rather than as an AttributeError on the
    # first prediction.
    if alpha_opt is None:
        raise RuntimeError(
            "QP solver ({}) did not return a solution".format(method)
        )

    return lambda x_new: np.sign(alpha_opt.dot(kernel(Xc, (x_new - m)/s)))

Profiling both optimization methods

In [6]:
Xtr, Ytr, _ = dataset[0]

# cProfile.run writes the stats file itself and fails if the target
# directory does not exist, e.g. on a fresh checkout.
os.makedirs("profiling", exist_ok=True)

# Uncomment to profile the cvxpy backend as well:
#cProfile.run("f = compute_predictor(Xtr, Ytr, gauss(3), 1, method='cvxpy')", "profiling/stats1")
#pstats.Stats("profiling/stats1").strip_dirs().sort_stats("tottime").print_stats(20)

cProfile.run("f = compute_predictor(Xtr, Ytr, gauss(3), 1, method='qpsolvers')", "profiling/stats2")
pstats.Stats("profiling/stats2").strip_dirs().sort_stats("tottime").print_stats(20)

Wed Feb 13 13:30:32 2019    profiling/stats2

         2848 function calls (2828 primitive calls) in 0.498 seconds

   Ordered by: internal time
   List reduced from 277 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       16    0.253    0.016    0.253    0.016 {built-in method cvxopt.lapack.potrf}
        1    0.052    0.052    0.052    0.052 {method 'dot' of 'numpy.ndarray' objects}
        1    0.047    0.047    0.102    0.102 vector_kernels.py:9(kernel)
        8    0.036    0.004    0.297    0.037 misc.py:1389(factor)
        4    0.023    0.006    0.023    0.006 cvxopt_.py:29(cvxopt_matrix)
        1    0.018    0.018    0.494    0.494 <ipython-input-5-cdfb343cdacc>:1(compute_predictor)
       30    0.016    0.001    0.016    0.001 {built-in method cvxopt.blas.trsv}
        8    0.008    0.001    0.008    0.001 {built-in method cvxopt.base.syrk}
        1    0.008    0.008    0.008    0.008 twodim_base.py:140(eye)
        1

<pstats.Stats at 0x10e859080>

K-fold cross-validation

In [7]:
kfold = 5

def cross_validate(X, Y, kernel, lambd, shuffle=True, kfold=kfold):
    acc_train, acc_val = np.zeros(kfold), np.zeros(kfold)
    
    # jointly shuffle input datasets X, Y
    n = X.shape[0]
    if shuffle:
        perm = np.random.permutation(n)
        X, Y = X[perm], Y[perm]
    idx = np.arange(n)
    for k in range(kfold):
        # split the datasets
        val_idx = idx[k::kfold]
        train_idx = np.delete(idx, val_idx)
        n_train = len(train_idx)
        n_val = n - n_train
        
        X_train = X[train_idx]
        Y_train = Y[train_idx]
        X_val = X[val_idx]
        Y_val = Y[val_idx]
        
        # fit the predictor
        f = compute_predictor(X_train, Y_train, kernel, lambd)

        Yte_train = f(X_train).reshape(-1)
        Ypred_train = ((Yte_train + 1) / 2).astype(int)

        Yte_val = f(X_val).reshape(-1)
        Ypred_val = ((Yte_val + 1) / 2).astype(int)
        
        Y_train = ((Y_train + 1) / 2).astype(int)
        Y_val = ((Y_val + 1) / 2).astype(int)
        
        # compute metrics
        acc_train[k] = np.mean(Y_train == Ypred_train)
        acc_val[k] = np.mean(Y_val == Ypred_val)
    return acc_train, acc_val

Tool for pretty plots with mean + std

In [8]:
def lineplotCI(x, y, low, up, c, log=False):
    """Draw a curve with a shaded band on the current matplotlib axes.

    Parameters
    ----------
    x, y : array-like
        Abscissae and values of the solid curve.
    low, up : array-like
        The two boundaries of the band filled around the curve.
    c : matplotlib color
        Color shared by the curve and the band.
    log : bool
        When True, the x-axis is switched to a logarithmic scale.
    """
    if log:
        plt.xscale("log")
    # Opaque curve on top of a translucent band of the same color.
    plt.plot(x, y, color=c, lw=2, alpha=1)
    plt.fill_between(x, low, up, color=c, alpha=0.2)

## 2. Parameter tuning

### 2.1. Grid search on lambda for the linear kernel

In [None]:
# 100 log-spaced candidate values for lambda, from 10**-1.5 to 10**3.
lambd_range = np.logspace(-1.5, 3, 100)

# One accuracy per (dataset, lambda, fold).
acc_train = np.zeros((3, len(lambd_range), kfold))
acc_val = np.zeros_like(acc_train)

for d, data in enumerate(dataset):
    Xtr, Ytr, _ = data
    progress = tqdm.tqdm(lambd_range, desc="Testing lambda for dataset {}".format(d+1))
    for i, lambd in enumerate(progress):
        acc_train[d, i], acc_val[d, i] = cross_validate(
            Xtr, Ytr, linear(), lambd, shuffle=True, kfold=kfold
        )

In [None]:
# Index of the dataset whose cross-validation curves are shown.
d = 0
plt.figure()
# Train curve in red, validation curve in green, each with a mean +/- std band.
for acc, color in ((acc_train, 'r'), (acc_val, 'g')):
    mean = acc[d].mean(axis=1)
    std = acc[d].std(axis=1)
    # lineplotCI expects the lower bound first, then the upper bound.
    lineplotCI(
        lambd_range,
        mean,
        mean - std,
        mean + std,
        c=color,
        log=True
    )
plt.legend(['train', 'val'])
plt.xlabel('Value of lambda')
plt.ylabel('Accuracy')
plt.title('Cross-validation on dataset {}'.format(d))
plt.show()

In [None]:
# For each dataset, keep the lambda with the highest mean validation
# accuracy across folds.
best_lambd = [
    lambd_range[fold_scores.mean(axis=1).argmax()]
    for fold_scores in acc_val
]
print(best_lambd)

### 2.2. Grid search on lambda & sigma for the Gaussian kernel

In [13]:
# 3 folds for the joint search over a 20 x 20 (lambda, sigma) grid.
kfold = 3
lambd_range = np.logspace(-1.5, 1, num=20)
sigma_range = np.linspace(3, 10, num=20)
# Every (lambda, sigma) pair, with lambda varying slowest.
tuple_range = [(lambd, sigma) for lambd in lambd_range for sigma in sigma_range]

# One accuracy per (dataset, grid point, fold).
acc_train = np.zeros((3, len(tuple_range), kfold))
acc_val = np.zeros_like(acc_train)

In [14]:
@ray.remote
def joint_crossval(d, data):
    """Cross-validate every (lambda, sigma) pair of ``tuple_range`` on one dataset.

    Ray runs this task in a worker process, so writes to module-level arrays
    made here are not visible to the notebook kernel. The scores are
    therefore collected in local arrays and returned to the caller.

    Parameters
    ----------
    d : int
        Dataset index, used for the progress bar and the final message.
    data : sequence
        ``(Xtr, Ytr, Xte)`` triple; the third entry is ignored.

    Returns
    -------
    tuple of numpy.ndarray
        ``(acc_train_d, acc_val_d)``, each of shape
        ``(len(tuple_range), kfold)``.
    """
    Xtr, Ytr, _ = data
    acc_train_d = np.zeros((len(tuple_range), kfold))
    acc_val_d = np.zeros((len(tuple_range), kfold))
    for i in tqdm.trange(len(tuple_range), desc="Testing (lambd, sigma) for dataset {}".format(d+1)):
        lambd, sigma = tuple_range[i]
        acc_train_d[i], acc_val_d[i] = cross_validate(
            Xtr, Ytr,
            gauss(sigma), lambd,
            shuffle=True, kfold=kfold
        )
    print("Finished dataset {}".format(d))
    return acc_train_d, acc_val_d

In [None]:
# Launch one remote task per dataset; they run concurrently in Ray workers.
tasks = [joint_crossval.remote(d, data) for d, data in enumerate(dataset)]

# Workers live in separate processes, so any scores they return have to be
# copied into the driver-side arrays here. A task that returns nothing
# leaves its slice of acc_train / acc_val untouched.
for d, t in enumerate(tasks):
    result = ray.get(t)
    if result is None:
        print("Dataset {}: task returned no scores, acc_train/acc_val not updated".format(d))
        continue
    acc_train[d], acc_val[d] = result

In [None]:
# For each dataset, keep the (lambda, sigma) pair with the highest mean
# validation accuracy across folds, then split the pairs into two lists.
best_tuple = [
    tuple_range[fold_scores.mean(axis=1).argmax()]
    for fold_scores in acc_val
]
best_lambd, best_sigma = (list(values) for values in zip(*best_tuple))

In [None]:
# Fold-averaged accuracy at every grid point: the three training rows first
# (datasets 0, 1, 2), then the three validation rows, then the grid itself.
print(
    *(fold_scores.mean(axis=1) for fold_scores in (*acc_train, *acc_val)),
    sep="\n"
)
print(tuple_range)

The Gaussian kernel works quite well on **datasets 2 and 3 (not 1!)**, but it tends to overfit...

## 3. Final predictions

In [None]:
# Predicted 0/1 labels for the three test sets, concatenated in dataset order.
test_labels = []

for k in [0, 1, 2]:
    print("\nDATASET {}\n".format(k))

    Xtr, Ytr, Xte = dataset[k]

    # Refit on the full training set with the hyper-parameters selected above.
    f = compute_predictor(Xtr, Ytr, gauss(best_sigma[k]), best_lambd[k])
    # Linear-kernel alternative:
    #f = compute_predictor(Xtr, Ytr, linear(), best_lambd[k])

    # Training accuracy of the refitted predictor.
    print(np.mean(Ytr == f(Xtr)))
    Yte = f(Xte)

    # Map the sign predictions through (y + 1) / 2 before writing them out.
    test_labels.extend(list(((Yte + 1) / 2).astype(int)))

Ypred = pd.Series(
    index=np.arange(len(test_labels)),
    data=test_labels
)
Ypred.index.name = "Id"
Ypred.name = "Bound"
Ypred.to_csv("Ypred.csv", header=True)