### Example imports

In [1]:
import numpy as np

import matplotlib.pyplot as plt

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, ConstantKernel as C
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve

from gp_extras.kernels import ManifoldKernel

# Seed NumPy's global random number generator so that anything drawing from
# np.random is repeatable between runs. The train/test split further down is
# seeded separately through the SEED constant.
np.random.seed(0)

### My imports

In [2]:
from sklearn.model_selection import train_test_split
import sklearn.metrics
import scipy.stats
import sys
import os
import pickle
import pandas as pd
import time

sys.path.append('../../Utils')
from metrics import compute_metrics

# Locate the shared Data directory relative to the kernel's working directory.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, '../../../Data/')

# Pickle with the RNA/protein embedding tables. data_dir already ends with a
# separator, so joining yields exactly data_dir followed by the relative path.
RNA_PROT_EMBED = os.path.join(
    data_dir,
    'ProcessedData/protein_embeddings/rna_protein_u64embeddings_python2.pkl')

### Data import and formatting

In [3]:
# Load the pickled embedding container from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source. The handle is not called `file` to avoid shadowing the
# Python 2 builtin of that name.
with open(RNA_PROT_EMBED, 'rb') as embed_fh:
    rna_prot_embed = pickle.load(embed_fh)

In [4]:
# Take the 'AM_04M_F0' entry without its 'AvgChrs' column. drop() returns a new
# frame, so the table stored in rna_prot_embed is left untouched.
data = rna_prot_embed['AM_04M_F0'].drop(columns='AvgChrs')

In [5]:
# ----------------------------------------------------------------------
# Code parameters
# ----------------------------------------------------------------------
cols_drop = ['ProteinAUC']  # columns removed from the frames to build the feature matrices
MRNA_THRESH = 0             # not referenced in the cells shown here -- TODO confirm use
ZSCORE = True               # standardise train/test with training-set statistics
LOG_TRANS = True            # apply log2(x + 1) to mRNA_TMM, ProteinAUC and ProteinLength
BATCH = 32                  # not referenced in the cells shown here -- TODO confirm use
SAVE = False                # not referenced in the cells shown here -- TODO confirm use
SEED = 10                   # random_state passed to train_test_split
# ----------------------------------------------------------------------

In [7]:
if LOG_TRANS:
    # Apply log2(x + 1) to the mRNA, protein and protein-length columns. The
    # original note describes these as log-normally distributed, in which case
    # the log puts them on a roughly normal scale.
    for log_col in ('mRNA_TMM', 'ProteinAUC', 'ProteinLength'):
        data[log_col] = np.log2(data[log_col] + 1)

# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on 30%.
# Confirm this is intentional (e.g. to keep the GP fit tractable).
train, test = train_test_split(data, test_size=0.7, random_state=SEED)

def zscore(train_df):
    """Standardise every column of ``train_df`` to zero mean and unit std.

    Columns with zero standard deviation (constant columns) are centred but
    not rescaled: their std is replaced by 1.0 so the division yields 0
    instead of NaN. The same substituted value is returned, so reusing the
    returned statistics on another frame (e.g. a test set) stays finite too.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame whose columns are standardised independently.

    Returns
    -------
    zscored : pandas.DataFrame
        ``(train_df - means) / stds`` computed column-wise.
    means : pandas.Series
        Per-column means of ``train_df``.
    stds : pandas.Series
        Per-column standard deviations (pandas default, ddof=1), with exact
        zeros replaced by 1.0.

    Raises
    ------
    AssertionError
        If ``train_df`` is not a pandas DataFrame.
    """
    assert isinstance(train_df, pd.DataFrame)
    means = train_df.mean(axis=0)
    stds = train_df.std(axis=0)
    # Keep non-zero stds as they are; swap exact zeros for 1.0 to avoid 0/0.
    stds = stds.where(stds != 0, 1.0)
    zscored = (train_df - means) / stds
    return zscored, means, stds

if ZSCORE:
    print('Data is z-scored')
    # Compute the scaling statistics from the training rows only.
    train, train_mean, train_std = zscore(train)
    # Scale the test rows with the training mean/std.
    # A validation split, if one is added, should be scaled the same way:
    #   val = (val - train_mean) / train_std
    test = (test - train_mean) / train_std

Data is z-scored


In [8]:
# Features are all columns except those listed in cols_drop; the regression
# target is the ProteinAUC column. Both are converted to NumPy arrays.
x_train, y_train = train.drop(columns=cols_drop).values, train['ProteinAUC'].values
x_test, y_test = test.drop(columns=cols_drop).values, test['ProteinAUC'].values

### Manifold kernel definition

In [9]:
# x_train is the 2-D feature array: rows are samples, columns are features.
n_samples, n_features = x_train.shape
n_dim_manifold = 8  # last entry of the architecture tuple given to ManifoldKernel
n_hidden = 16       # middle entry of the architecture tuple given to ManifoldKernel

In [11]:
# Layer sizes handed to ManifoldKernel: (n_features, n_hidden, n_dim_manifold).
architecture = ((n_features, n_hidden, n_dim_manifold),)

# NOTE(review): the RBF start value (0.1) lies outside its bounds (1.0, 100.0).
# Kept as in the original; confirm this is intended.
manifold_part = ManifoldKernel.construct(
    base_kernel=RBF(0.1, (1.0, 100.0)),
    architecture=architecture,
    transfer_fct="tanh",
    max_nn_weight=1.0)
signal_scale = C(1.0, (1e-10, 100))
noise_part = WhiteKernel(1e-3, (1e-10, 1e-1))

# Full kernel: constant-scaled manifold kernel plus a white-noise term.
kernel_nn = signal_scale * manifold_part + noise_part

# alpha=0 adds nothing to the kernel diagonal; presumably the noise is meant to
# be captured by the WhiteKernel term instead.
gp_nn = GaussianProcessRegressor(
    kernel=kernel_nn,
    alpha=0,
    n_restarts_optimizer=3,
    copy_X_train=False)

In [12]:
# Time the fit together with the log-marginal-likelihood evaluations and the
# reporting that follows it.
start = time.time()

gp_nn.fit(x_train, y_train)

# Kernel as specified (before fitting) and its log-marginal-likelihood.
print("Initial kernel: %s" % gp_nn.kernel)
print("Log-marginal-likelihood: %s"
      % gp_nn.log_marginal_likelihood(gp_nn.kernel.theta))

# Kernel after fitting (kernel_) and its log-marginal-likelihood.
print("Learned kernel: %s" % gp_nn.kernel_)
print("Log-marginal-likelihood: %s"
      % gp_nn.log_marginal_likelihood(gp_nn.kernel_.theta))

runtime = time.time() - start
print("runtime is: %f" % runtime)

KeyboardInterrupt: 