# GFP Sequence to Function Model

Code adapted from [David Brookes' CbAS](https://github.com/MauriceR71/CbAS-Restored)

In [None]:
# Source: https://github.com/igemto-drylab

In [27]:
# Fetch the lecture repository (presumably the source of gfp_data.csv, which is
# read right after this cell) and make it the working directory.
# NOTE: `%cd lec6` is relative, so re-running this cell in the same session
# clones and enters a nested copy (the recorded output shows /content/lec6/lec6).
!git clone https://github.com/igemto-drylab/lec6.git
%cd lec6

Cloning into 'lec6'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.
/content/lec6/lec6


In [28]:
import keras
from keras.layers import Input, Dense, Reshape, Flatten
from keras import layers, initializers
from keras.models import Model, load_model
import keras.backend as K
import numpy as np
from keras.callbacks import EarlyStopping
import pandas as pd
import random

In [29]:
# Load the GFP dataset for inspection; the path is relative to the current
# working directory (get_data() re-reads the same file on its own).
data_df = pd.read_csv("gfp_data.csv")

In [30]:
# Preview the first five rows of the dataset.
data_df.head(5)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,nucSequence,numNucMutations,numAAMutations,uniqueBarcodes,medianBrightness,std,aaSequence
0,0,0,0,,0,0,2444,3.718386,0.111561,skgeelftgvvpilveldgdvnghkfsvsgegegdatygkltlkfi...
1,1,1,1,,1,0,2,3.622869,0.145991,skgeelftgvvpilveldgdvnghkfsvsgegegdatygkltlkfi...
2,2,2,2,,1,0,25,3.722241,0.112935,skgeelftgvvpilveldgdvnghkfsvsgegegdatygkltlkfi...
3,3,3,3,,2,0,1,3.697823,0.0,skgeelftgvvpilveldgdvnghkfsvsgegegdatygkltlkfi...
4,4,4,4,,2,0,1,3.804003,0.0,skgeelftgvvpilveldgdvnghkfsvsgegegdatygkltlkfi...


In [31]:
# Show per-column dtypes and non-null counts.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58417 entries, 0 to 58416
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        58417 non-null  int64  
 1   Unnamed: 0.1      58417 non-null  int64  
 2   Unnamed: 0.1.1    58417 non-null  int64  
 3   nucSequence       0 non-null      float64
 4   numNucMutations   58417 non-null  int64  
 5   numAAMutations    58417 non-null  int64  
 6   uniqueBarcodes    58417 non-null  int64  
 7   medianBrightness  58417 non-null  float64
 8   std               58417 non-null  float64
 9   aaSequence        58417 non-null  object 
dtypes: float64(3), int64(6), object(1)
memory usage: 4.5+ MB


In [32]:
# Seed Python's RNG so the shuffle performed while loading data is repeatable.
random.seed(1)

# Residue alphabet: 21 lowercase one-letter codes, in the column order used by
# the one-hot encoding ('x' is the last symbol).
AA = list("arndcqeghilkmfpstwyvx")
AA_upper = [residue.upper() for residue in AA]

# Uppercase residue letter -> column index in the one-hot matrix.
AA_IDX = {residue: column for column, residue in enumerate(AA_upper)}

def one_hot_encode_aa(aa_str):
    """One-hot encode an amino-acid string.

    Args:
        aa_str: sequence of one-letter residue codes (case-insensitive).

    Returns:
        Integer array of shape (len(aa_str), 21); row ``p`` has a single 1 in
        the column given by ``AA_IDX`` for the residue at position ``p``.

    Raises:
        KeyError: if a character is not a key of ``AA_IDX``.
    """
    encoded = np.zeros((len(aa_str), 21), dtype=int)
    for position, residue in enumerate(aa_str):
        column = AA_IDX[residue.upper()]
        encoded[position, column] = 1
    return encoded

In [33]:
def get_data(train_size, ignore_stops=True):
    """Load a shuffled, one-hot encoded training subset from ``gfp_data.csv``.

    The file is read from the current working directory and must provide the
    columns ``aaSequence`` and ``medianBrightness``.

    Args:
        train_size: number of shuffled samples to return; used as a slice
            bound, so a value larger than the dataset returns every row.
        ignore_stops: when True, drop sequences containing ``'!'`` before
            sampling.

    Returns:
        Tuple ``(X, y)``: ``X`` is a float array of shape (n, M, 21) where M is
        the length of the first retained sequence, and ``y`` holds the
        matching ``medianBrightness`` values.

    Raises:
        ValueError: if no sequences remain after filtering.
    """
    data_df = pd.read_csv("gfp_data.csv")

    if ignore_stops:
        has_stop = data_df['aaSequence'].str.contains('!', regex=False)
        data_df = data_df.loc[~has_stop]

    # Work on plain arrays so every lookup below is positional. Indexing the
    # filtered Series with seqs[0] is a *label* lookup and raises KeyError
    # whenever row 0 was removed by the stop filter.
    seqs = data_df['aaSequence'].to_numpy()
    brightness = data_df['medianBrightness'].to_numpy()

    N = len(seqs)
    if N == 0:
        raise ValueError("gfp_data.csv contains no usable sequences")
    M = len(seqs[0])

    # Shuffle positions instead of the encoded rows: random.shuffle permutes a
    # list of length N the same way whatever it contains, so the selected
    # samples are unchanged for a given RNG state, but only the rows that are
    # actually returned get one-hot encoded.
    order = list(range(N))
    random.shuffle(order)
    chosen = order[:train_size]

    X = np.zeros((len(chosen), M, 21))
    for row, position in enumerate(chosen):
        X[row] = one_hot_encode_aa(seqs[position])
    y = np.array(brightness[chosen])

    return X, y

In [34]:
def build_model(M):
    """Build the (uncompiled) oracle network for sequences of length ``M``.

    Architecture: input of shape (M, 21) -> Flatten -> Dense(50, ELU) ->
    Dense(2) with no activation.

    Args:
        M: number of residues per input sequence.

    Returns:
        A Keras ``Model`` mapping a one-hot sequence to two raw outputs.
    """
    seq_input = Input(shape=(M, 21,))
    flattened = Flatten()(seq_input)
    hidden = Dense(50, activation='elu')(flattened)
    raw_outputs = Dense(2)(hidden)
    return Model(inputs=seq_input, outputs=raw_outputs)

In [35]:
def neg_log_likelihood(y_true, y_pred):
    """Gaussian negative log-likelihood used as the training loss.

    Column 0 of ``y_pred`` is taken as the predicted mean; column 1 is passed
    through softplus (plus 1e-6) to obtain a strictly positive variance.

    Args:
        y_true: 2-D tensor of targets; only column 0 is used.
        y_pred: 2-D tensor with at least two columns (mean, raw variance).

    Returns:
        0.5 * mean(log(var)) + 0.5 * mean((y - mean)^2 / var) + 0.5 * log(2*pi)
    """
    target = y_true[:, 0]
    mean = y_pred[:, 0]
    # The small offset keeps log() and the division away from zero.
    variance = K.softplus(y_pred[:, 1]) + 1e-6

    log_variance_term = 0.5 * K.mean(K.log(variance), axis=-1)
    squared_error_term = 0.5 * K.mean(K.square(target - mean) / variance, axis=-1)
    constant_term = 0.5 * K.log(2 * np.pi)
    return log_variance_term + squared_error_term + constant_term

In [36]:
def train_oracles_helper(X_train, y_train, num_models, batch_size=25,
                         epochs=10, save_dir="/content/lec6"):
    """Train ``num_models`` independent oracle networks and save each to disk.

    Every model is built with ``build_model``, compiled with Adam and
    ``neg_log_likelihood``, and fitted on the same data with 10% of it held
    out for validation. Model ``i`` is written to
    ``<save_dir>/oracle_model_<i>.h5``, replacing any existing file.

    Args:
        X_train: one-hot encoded sequences; ``X_train.shape[1]`` is used as the
            sequence length of the network input.
        y_train: regression targets aligned with ``X_train``.
        num_models: number of models to train and save.
        batch_size: mini-batch size passed to ``fit``.
        epochs: maximum number of epochs per model.
        save_dir: directory that receives the ``.h5`` files; the default keeps
            the location the notebook has always written to.
    """
    for i in range(num_models):
        model = build_model(X_train.shape[1])
        model.compile(optimizer='adam', loss=neg_log_likelihood)

        # Stop once val_loss has failed to improve for 5 consecutive epochs.
        early_stop = EarlyStopping(monitor='val_loss',
                                   min_delta=0,
                                   patience=5,
                                   verbose=1)
        model.fit(X_train, y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_split=0.1,
                  callbacks=[early_stop],
                  verbose=2)
        model.save("%s/oracle_model_%i.h5" % (save_dir, i))

In [37]:
def train_oracles(num_models=5, train_size=500, batch_size=64):
    """Load a training subset and train the oracle ensemble on it.

    ``train_oracles_helper`` already trains and saves ``num_models`` models per
    call, so it is invoked exactly once. Calling it inside another loop over
    ``num_models`` would train ``num_models ** 2`` networks and overwrite the
    same model files on every pass.

    Args:
        num_models: number of ensemble members to train.
        train_size: number of shuffled samples requested from ``get_data``.
        batch_size: mini-batch size forwarded to ``train_oracles_helper``.
    """
    X_train, y_train = get_data(train_size=train_size)
    train_oracles_helper(X_train, y_train, num_models, batch_size=batch_size)

In [44]:

def make_prediction(s, num_models=5, model_dir="/content/lec6"):
    """Run every saved oracle on one amino-acid sequence.

    Args:
        s: amino-acid sequence as one-letter codes; its length must match the
            sequence length the saved models were built for.
        num_models: number of saved models to query.
        model_dir: directory holding ``oracle_model_<i>.h5``.

    Returns:
        List with one ``model.predict`` result per model. The list is also
        printed, as before. Each result is the raw two-column network output.

    Raises:
        ValueError: if the sequence length differs from a model's input length.
    """
    # The encoding does not depend on the model, so compute it once.
    encoded = np.array([one_hot_encode_aa(s)])

    predictions = []
    for i in range(num_models):
        # compile=False: only inference is needed here, and it avoids having
        # to deserialize the custom training loss stored with the model, which
        # load_model cannot resolve without custom_objects.
        model = load_model("%s/oracle_model_%i.h5" % (model_dir, i), compile=False)

        # Fail early with an explicit message when the sequence length does
        # not match the network input.
        expected_shape = getattr(model, "input_shape", None)
        if isinstance(expected_shape, tuple) and len(expected_shape) == 3:
            expected_len = expected_shape[1]
            if expected_len is not None and expected_len != encoded.shape[1]:
                raise ValueError(
                    "sequence has %d residues but oracle_model_%i expects %d"
                    % (encoded.shape[1], i, expected_len))

        predictions.append(model.predict(encoded))

    print(predictions)
    return predictions

In [39]:
# Train the oracle ensemble; the models are written as oracle_model_<i>.h5
# under /content/lec6 by train_oracles_helper.
train_oracles()

Epoch 1/10
8/8 - 2s - loss: 2.6175 - val_loss: 2.2619 - 2s/epoch - 259ms/step
Epoch 2/10
8/8 - 0s - loss: 1.9975 - val_loss: 1.9332 - 110ms/epoch - 14ms/step
Epoch 3/10
8/8 - 0s - loss: 1.9715 - val_loss: 1.9444 - 126ms/epoch - 16ms/step
Epoch 4/10
8/8 - 0s - loss: 1.9182 - val_loss: 1.9083 - 105ms/epoch - 13ms/step
Epoch 5/10
8/8 - 0s - loss: 1.8566 - val_loss: 1.8347 - 131ms/epoch - 16ms/step
Epoch 6/10
8/8 - 0s - loss: 1.7733 - val_loss: 1.7320 - 106ms/epoch - 13ms/step
Epoch 7/10
8/8 - 0s - loss: 1.6600 - val_loss: 1.6300 - 92ms/epoch - 12ms/step
Epoch 8/10
8/8 - 0s - loss: 1.4964 - val_loss: 1.4476 - 101ms/epoch - 13ms/step
Epoch 9/10
8/8 - 0s - loss: 1.3328 - val_loss: 1.5118 - 141ms/epoch - 18ms/step
Epoch 10/10
8/8 - 0s - loss: 1.2604 - val_loss: 1.4882 - 175ms/epoch - 22ms/step
Epoch 1/10
8/8 - 2s - loss: 2.3896 - val_loss: 2.2702 - 2s/epoch - 202ms/step
Epoch 2/10
8/8 - 0s - loss: 2.0966 - val_loss: 2.0107 - 196ms/epoch - 25ms/step
Epoch 3/10
8/8 - 0s - loss: 1.9510 - val_los

In [45]:
# seq must have the same number of amino acids as the sequences in the training set;
# missing amino acids may be denoted by 'x'.
# NOTE(review): this example has 46 residues and the recorded run of this cell
# ends in a ValueError. Confirm the length against the training sequences (and
# that the saved models load) before relying on this cell.
seq = "HWGYGKHNGPEHWHKDFPIAKGERQSPVDIDTHTAKYDPSLKPLSV"

make_prediction(seq)

ValueError: ignored