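Run 20-fold cross-validation of a `gpm.GPRegressor` (Matern 5/2 kernel) on every embedding for each task, and write one row of scores per embedding to a results file.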

In [1]:
import os
import pickle

import numpy as np
from scipy import stats
from sklearn import metrics
import pandas as pd
from sklearn import model_selection

import gpm
import gpk

# Fail fast unless these exact numpy / pandas versions are installed
# (presumably the versions the recorded outputs below were produced with).
assert np.__version__ == '1.13.1'
assert pd.__version__ == '0.20.3'

In [2]:
def select_X_and_Y(df, all_X, y_column):
    """Pair the non-missing targets in ``df`` with their rows of ``all_X``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'name'`` column and the ``y_column`` column.
    all_X : pandas.DataFrame
        Feature table; rows are looked up with ``.loc`` using the values of
        ``df['name']``.
    y_column : str
        Column of ``df`` holding the target; rows where it is null are
        dropped.

    Returns
    -------
    (Xs, Ys)
        ``Ys`` is the target column re-indexed by ``'name'``; ``Xs`` is
        ``all_X`` restricted (and ordered) to that same index.
    """
    has_target = pd.notnull(df[y_column])
    # drop=False keeps every column in place; only the index is swapped
    # for the 'name' values of the surviving rows.
    labelled = df.loc[has_target].set_index('name', drop=False)
    Ys = labelled[y_column]
    Xs = all_X.loc[Ys.index]
    return Xs, Ys

def score(Y, pred_Y, pred_var):
    """Summarise how well predictions ``pred_Y`` (with variances
    ``pred_var``) match the observed values ``Y``.

    Returns a dict with these keys:

    * ``'kendalltau'`` -- Kendall's tau computed on the ranks of ``Y`` and
      ``pred_Y``.
    * ``'R2'`` -- ``sklearn.metrics.r2_score``.
    * ``'SE'`` -- ``sklearn.metrics.mean_squared_error`` (note: a *mean*
      squared error, despite the key name).
    * ``'R'`` -- Pearson correlation taken from ``np.corrcoef``.
    * ``'log_loss'`` -- negative sum over points of the Gaussian log density
      of ``Y`` under mean ``pred_Y`` and variance ``pred_var``.
    """
    rank_true = stats.rankdata(Y)
    rank_pred = stats.rankdata(pred_Y)
    tau = stats.kendalltau(rank_true, rank_pred).correlation
    r_squared = metrics.r2_score(Y, pred_Y)
    mse = metrics.mean_squared_error(Y, pred_Y)
    pearson_r = np.corrcoef(Y, pred_Y)[0, 1]

    # Per-point Gaussian log density; the constant term is applied last.
    log_density = -0.5 * np.log(pred_var) - (pred_Y - Y)**2 / 2 / pred_var
    log_density = log_density - 0.5 * np.log(2 * np.pi)

    return {
        'kendalltau': tau,
        'R2': r_squared,
        'SE': mse,
        'R': pearson_r,
        'log_loss': -np.sum(log_density),
    }

def _gp_cv_predictions(X, y, n_splits, random_state):
    """Collect out-of-fold predictions from a Matern 5/2 GP regressor.

    ``X`` and ``y`` are arrays indexed by row. Returns the held-out targets,
    the predicted means and the predicted variances, each concatenated over
    the folds in the order ``KFold`` yields them.
    """
    kf = model_selection.KFold(n_splits=n_splits, shuffle=True,
                               random_state=random_state)
    y_actual, mu_val, var_val = [], [], []
    for i_train, i_val in kf.split(X):
        k = gpk.MaternKernel('5/2')
        # NOTE(review): `gueses` looks like a typo for `guesses`; if
        # GPRegressor accepts arbitrary keyword arguments, this value may be
        # silently ignored. Left unchanged so results stay comparable --
        # confirm against the gpm API before renaming.
        clf = gpm.GPRegressor(k, gueses=(10, 100))
        clf.fit(X[i_train], y[i_train])
        mu, var = clf.predict(X[i_val])
        y_actual.append(y[i_val])
        mu_val.append(mu)
        # Presumably `var` is a full covariance matrix; only its diagonal
        # (per-point variance) is kept -- TODO confirm.
        var_val.append(np.diag(var))
    return (np.concatenate(y_actual), np.concatenate(mu_val),
            np.concatenate(var_val))


def cross_validate(y_col, df, e_dir, fname, n_splits=20, random_state=10):
    """Cross-validate a GP regressor on every embedding file in ``e_dir``.

    For each file in ``e_dir`` whose name starts with ``'X'``, the pickled
    embedding table is loaded, aligned with the non-missing ``y_col`` values
    of ``df`` via ``select_X_and_Y``, and evaluated with shuffled K-fold
    cross-validation. One comma-separated row of scores per embedding is
    appended to ``fname``.

    Parameters
    ----------
    y_col : str
        Target column in ``df``; also written as the ``task`` field.
    df : pandas.DataFrame
        Must contain ``'name'`` and ``y_col`` columns.
    e_dir : str
        Directory holding the pickled embeddings. A trailing separator is
        optional.
    fname : str
        Results file. It is truncated and given a header row at the start
        of every call.
    n_splits : int, optional
        Number of cross-validation folds (default 20).
    random_state : int, optional
        Seed for the fold shuffle (default 10).

    Returns
    -------
    None
    """
    kernel = 'Matern5/2'
    with open(fname, 'w') as f:
        f.write('task,embedding,kernel,R,R2,kendalltau,log_loss,SE\n')
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the results file reproducible.
    for embed in sorted(os.listdir(e_dir)):
        if not embed.startswith('X'):
            continue
        # SECURITY: pickle.load can execute arbitrary code from the file;
        # only point e_dir at trusted data.
        with open(os.path.join(e_dir, embed), 'rb') as f:
            e_X = pickle.load(f)
        # Some pickles apparently hold a 2-item container whose first item
        # is the embedding table. NOTE(review): a table with exactly two
        # rows would also match this check -- confirm the pickle layout.
        if len(e_X) == 2:
            e_X = e_X[0]
        X, y = select_X_and_Y(df, e_X, y_col)
        y_actual, mu, var = _gp_cv_predictions(X.values, y.values,
                                               n_splits, random_state)
        scores = score(y_actual, mu, var)
        # Re-open in append mode for each row so finished embeddings are
        # already on disk if a later one fails.
        with open(fname, 'a') as f:
            f.write(','.join([y_col, embed, kernel, str(scores['R']),
                              str(scores['R2']), str(scores['kendalltau']),
                              str(scores['log_loss']), str(scores['SE'])]))
            f.write('\n')

## Localization

In [12]:
# Localization task: the regression target is the `log_GFP` column.
y_col = 'log_GFP'
dataset = '../inputs/localization.txt'
# Directory scanned for pickled embeddings (files whose name starts with 'X').
e_dir = '../outputs/localization_embeddings/'
# Results file; cross_validate overwrites it on every run.
fname = '../outputs/cv_localization.txt'

df = pd.read_csv(dataset)
    
# Only rows flagged `is_train` are cross-validated; %time reports the run time.
%time cross_validate(y_col, df[df.is_train], e_dir, fname)

CPU times: user 1h 15min 6s, sys: 1min 21s, total: 1h 16min 27s
Wall time: 20min 8s


## Absorption

In [4]:
# Absorption task: the regression target is the `peak` column.
y_col = 'peak'
dataset = '../inputs/absorption.txt'
# Directory scanned for pickled embeddings (files whose name starts with 'X').
e_dir = '../outputs/absorption_embeddings/'
# Results file; cross_validate overwrites it on every run.
fname = '../outputs/cv_absorption.txt'

df = pd.read_csv(dataset)
    
# Only rows flagged `is_train` are cross-validated; %time reports the run time.
%time cross_validate(y_col, df[df.is_train], e_dir, fname)

CPU times: user 5min 54s, sys: 3.55 s, total: 5min 57s
Wall time: 4min 32s


## T50

In [5]:
# T50 task: the regression target is the `T50` column.
y_col = 'T50'
dataset = '../inputs/T50.txt'
# Directory scanned for pickled embeddings (files whose name starts with 'X').
e_dir = '../outputs/T50_embeddings/'
# Results file; cross_validate overwrites it on every run.
fname = '../outputs/cv_T50.txt'

df = pd.read_csv(dataset)
    
# Only rows flagged `is_train` are cross-validated; %time reports the run time.
%time cross_validate(y_col, df[df.is_train], e_dir, fname)

CPU times: user 1h 38min 28s, sys: 1min 38s, total: 1h 40min 7s
Wall time: 26min 21s


## Enantioselectivity

In [6]:
# Enantioselectivity task: the regression target is the `e-value` column.
y_col = 'e-value'
dataset = '../inputs/enantioselectivity.txt'
# Directory scanned for pickled embeddings (files whose name starts with 'X').
e_dir = '../outputs/enantioselectivity_embeddings/'
# Results file; cross_validate overwrites it on every run.
fname = '../outputs/cv_enantioselectivity.txt'

df = pd.read_csv(dataset)
    
# Only rows flagged `is_train` are cross-validated; %time reports the run time.
%time cross_validate(y_col, df[df.is_train], e_dir, fname)

CPU times: user 22min 14s, sys: 22.6 s, total: 22min 36s
Wall time: 5min 50s
