# Regression on RBS sequences - Escherichia coli

This notebook demonstrates regression on RBS (ribosome binding site) sequences.

The implementation consists of the following parts:

- Reading the dataset
- Embedding
    - Label
    - One-hot embedding
    - K-mer 
    - PWM (position weight matrix; to be added)
    - Unsupervised (to be added)
- Kernels
    - Spectrum
    - Mixed Spectrum
    - Weighted Degree
    - Weighted Degree with Shifting
- Evaluate
    - R2 score
    - Plot

In [1]:
# Put the parent directory on the import path (presumably so that the
# project-local `codes` package imported in the next cell can be found).
import os
import sys

module_path = os.path.abspath('..')
# Guard against duplicate entries when this cell is re-run.
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
import numpy as np

import pandas
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict

from codes.embedding import Embedding
from codes.environment import Rewards_env
from codes.ucb import GPUCB
from codes.evaluations import evaluate, plot_eva
from codes.regression import Regression
from codes.kernels_pairwise import spectrum_kernel_pw, mixed_spectrum_kernel_pw, WD_kernel_pw, WD_shift_kernel_pw

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel, DotProduct, RBF
from sklearn.kernel_ridge import KernelRidge

from ipywidgets import IntProgress
from IPython.display import display
import warnings
%matplotlib inline

## Reading the Dataset

In [3]:
# Data source: https://github.com/synbiochem/opt-mva
# Paper:       https://pubs.acs.org/doi/abs/10.1021/acssynbio.8b00398

# Location of the comma-separated training set, relative to this notebook.
Path = '../data/trainset.rbs1.v2.csv'

# Load the whole table and preview the first five rows.
df = pandas.read_csv(Path, sep=',')
df.head(5)

Unnamed: 0,Construct,Barcode,rbs1_1_A,rbs1_1_C,rbs1_1_G,rbs1_1_T,rbs1_2_A,rbs1_2_C,rbs1_2_G,rbs1_2_T,...,rbs2_3_A,rbs2_3_C,rbs2_3_G,rbs2_3_T,Group,ODind,ODhar,FC,rbs1,rbs2
0,RBS629,77DD32,0,0,1,0,0,0,1,0,...,0,0,1,0,Top,0.41,4.523,1.437982,GGG,GGG
1,RBS678,77DD39,0,0,1,0,0,0,1,0,...,0,0,1,0,Top,0.359,2.678,1.277175,GGG,GGG
2,RBS353,77DD18,0,0,1,0,0,0,1,0,...,0,0,1,0,Top,0.334,1.745,1.545141,GGC,GGG
3,RBS403,77DD20,0,0,1,0,0,0,1,0,...,1,0,0,0,Top,0.39,2.166,1.670098,GGC,GGA
4,RBS500,77DD24,0,0,1,0,0,0,1,0,...,0,1,0,0,Top,0.375,2.348,1.62123,GGC,GGC


In [4]:
# Join the two sequence columns into a single 'RBS' string per row.
rbs1_seq, rbs2_seq = df['rbs1'], df['rbs2']
df['RBS'] = rbs1_seq + rbs2_seq

# Two-column array handed to the regression wrapper:
# column 0 holds the 'RBS' string, column 1 the 'FC' value.
data = df[['RBS', 'FC']].values
data.shape

(60, 2)

## Regression with cross validation

- Kernel Ridge Regression
- Gaussian Process Regression

Leave one out cross validation with shuffle.   
k = 5   
test size = 0.2

In [5]:
# Shared settings forwarded to every `reg.evaluate(...)` call below.
cross_val_flag, plot_flag = True, False

# Passed to `reg.evaluate` as `k`; the saved outputs list k scores per run.
k = 5

# String kernels compared in the kernel ridge regression section.
kernels = [
    spectrum_kernel_pw,
    mixed_spectrum_kernel_pw,
    WD_kernel_pw,
    WD_shift_kernel_pw,
]

### Kernel Ridge Regression 

In [6]:
# Baseline: kernel ridge regression on the one-hot embedding,
# using a DotProduct kernel.
krr_onehot = KernelRidge(kernel=DotProduct())

reg = Regression(krr_onehot, data, 'onehot')
reg.train()
reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=DotProduct(sigma_0=1), kernel_params=None)
[0.25814044 0.37372984 0.36235549 0.32939539 0.21600309]
RMSE : 0.31 (+/- 0.12)


In [7]:
# Kernel ridge regression on the k-mer embedding (l = 3),
# again with a DotProduct kernel.
krr_kmer = KernelRidge(kernel=DotProduct())

reg = Regression(krr_kmer, data, 'kmer')
reg.train()
reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=DotProduct(sigma_0=1), kernel_params=None)
[0.28775011 0.46127207 0.34926709 0.33935598 0.19080109]
RMSE : 0.33 (+/- 0.18)


In [8]:
# String kernel performance (l = 3) with kernel ridge regression.
# Sequences use the 'label' embedding; each kernel in `kernels` is
# trained and evaluated in turn.

for kernel in kernels:
    krr = KernelRidge(kernel=kernel)
    reg = Regression(krr, data, embedding_method='label')
    reg.train()
    reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)
    print()  # blank line between the reports of consecutive kernels

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=<function spectrum_kernel_pw at 0x7efec227e2f0>,
            kernel_params=None)
[0.28684508 0.47594838 0.34444124 0.34223795 0.19474536]
RMSE : 0.33 (+/- 0.18)

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=<function mixed_spectrum_kernel_pw at 0x7efec227e378>,
            kernel_params=None)
[0.37274496 0.50538574 0.39857689 0.31013285 0.26965814]
RMSE : 0.37 (+/- 0.16)

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=<function WD_kernel_pw at 0x7efec227e400>,
            kernel_params=None)
[0.27849425 0.43765123 0.40209041 0.38070551 0.21781228]
RMSE : 0.34 (+/- 0.16)

Model:  KernelRidge(alpha=1, coef0=1, degree=3, gamma=None,
            kernel=<function WD_shift_kernel_pw at 0x7efec227e488>,
            kernel_params=None)
[0.28078538 0.43859451 0.3954927  0.37929358 0.21589545]
RMSE : 0.34 (+/- 0.16)



### Gaussian Process Regression

In [9]:
# Baseline: Gaussian process regression on the one-hot embedding,
# using a DotProduct kernel.
gpr_onehot = GaussianProcessRegressor(kernel=DotProduct())

reg = Regression(gpr_onehot, data, 'onehot')
reg.train()
reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)

Model:  GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=DotProduct(sigma_0=1), n_restarts_optimizer=0,
                         normalize_y=False, optimizer='fmin_l_bfgs_b',
                         random_state=None)
[0.26045948 0.37864411 0.36652631 0.33416442 0.23448767]
RMSE : 0.31 (+/- 0.11)


In [10]:
# Gaussian process regression on the k-mer embedding (l = 3),
# with a DotProduct kernel.
gpr_kmer = GaussianProcessRegressor(kernel=DotProduct())

reg = Regression(gpr_kmer, data, 'kmer')
reg.train()
reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)

Model:  GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=DotProduct(sigma_0=1), n_restarts_optimizer=0,
                         normalize_y=False, optimizer='fmin_l_bfgs_b',
                         random_state=None)
[0.27693065 0.49052313 0.43820171 0.4088913  0.27202406]
RMSE : 0.38 (+/- 0.18)


In [None]:
# String kernel performance (l = 3) with Gaussian process regression.
# Sequences use the 'label' embedding.

# NOTE: this rebinds the notebook-level `kernels` list; compared with the
# list in the configuration cell, WD_shift_kernel_pw is left out here.
kernels = [spectrum_kernel_pw, mixed_spectrum_kernel_pw, WD_kernel_pw]

for kernel in kernels:
    # Wrap the pairwise string kernel function in a PairwiseKernel before
    # handing it to the Gaussian process regressor.
    gp_kernel = PairwiseKernel(metric=kernel)
    gpr = GaussianProcessRegressor(kernel=gp_kernel)
    reg = Regression(gpr, data, embedding_method='label')
    reg.train()
    reg.evaluate(k=k, cross_val_flag=cross_val_flag, plot_flag=plot_flag)
    print()  # blank line between the reports of consecutive kernels

Model:  GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=PairwiseKernel(gamma=1.0, metric=<function spectrum_kernel_pw at 0x7efec227e2f0>),
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)
[0.27693254 0.49052647 0.43820379 0.40889041 0.27202653]
RMSE : 0.38 (+/- 0.18)

Model:  GaussianProcessRegressor(alpha=1e-10, copy_X_train=True,
                         kernel=PairwiseKernel(gamma=1.0, metric=<function mixed_spectrum_kernel_pw at 0x7efec227e378>),
                         n_restarts_optimizer=0, normalize_y=False,
                         optimizer='fmin_l_bfgs_b', random_state=None)
[0.27693407 0.490508   0.4382023  0.40888562 0.27201237]
RMSE : 0.38 (+/- 0.18)

