In [1]:
# modules to import 
import pandas as pd
import numpy as np

import sys
import seaborn as sns
# Global plot appearance: 'paper' context with enlarged fonts and 1.5pt lines,
# 'ticks' style with the grid turned off, sans-serif text, and black ('0')
# text / tick colours.
sns.set_context(
    'paper',
    font_scale = 1.5,
    rc = {"lines.linewidth": 1.5},
)
sns.set_style(
    'ticks',
    {
        'axes.grid': False,
        'grid.linestyle': '',
        'font.family': 'sans-serif',
        'font.sans-serif': 'Myriad Pro',
        'text.color': '0',
        'xtick.color': '0',
        'ytick.color': '0',
    },
)


sys.path.insert(1, '/work/06658/pramesh/maverick2/notebooks/')
from nlp_wvec_utils import *
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
# load in the toehold dataset and preprocess for ML 
df = pd.read_csv('/work/06658/pramesh/maverick2/excels/newQC_toehold_data.csv', comment = '#')

# rename the columns 
rename_dict = {
    "Unnamed: 0" : "toehold_id",
    "onoff_value" : "delta_onoff",
    "onoff_qc" : "delta_qc_onoff",
    "switch_sequence" : "min_toehold_sequence"
}

# clean up df to get rid of NaN and low qc reads 
df = df.rename(columns = rename_dict)
df = df.dropna() # throw out nan's 
ngs_qc_onind = df['on_qc'] >= 1.1 # keep all the acceptable reads for the ON
ngs_qc_offind = df['off_qc'] >= 1.1 # keep all the acceptable reads for the OFF
# .copy() makes the filtered frame independent of the original, so the column
# assignment below is not a chained assignment on a slice
df = df.loc[ngs_qc_onind & ngs_qc_offind, :].copy()

# bin the toeholds by their ON/OFF ratio into quartiles since the data were collected as 4 bins
df['toehold_quartile'] = pd.qcut(df['delta_onoff'], q = 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])

# top quartile (Q4) is labelled 'Good', everything else 'Bad'; each subset is an
# explicit copy so adding the label column never writes into a slice of df
is_top = df['toehold_quartile'] == 'Q4'
df_top = df.loc[is_top].copy()
df_top['quartile_label'] = 'Good'
df_rest = df.loc[~is_top].copy()
df_rest['quartile_label'] = 'Bad'
df_classify = pd.concat([df_top, df_rest], axis = 0)

# instantiate the label encoder and turn the string labels into integers
# (in the displayed output 'Bad' maps to 0 and 'Good' maps to 1)
label_encoder = LabelEncoder()
df_classify['quartile_rating'] = label_encoder.fit_transform(df_classify['quartile_label'])

# shuffle the rows; the fixed seed makes the shuffle (and anything derived from
# the row order) reproducible on Restart & Run All
df_classify = df_classify.sample(frac = 1, random_state = 42)

# we now have a df of sequences that we will train a classifier
df_classify.head()

Unnamed: 0,off_id,on_id,source_sequence,sequence_id,pre_seq,promoter,trigger,loop1,switch,loop2,...,on_value,off_value,delta_onoff,on_qc,off_qc,delta_qc_onoff,min_toehold_sequence,toehold_quartile,quartile_label,quartile_rating
85396,AACCAAACACACAAACGCACCCTGGGAAGGTGTCAACTGCTGATGT...,CTACACATCAGCAGTTGACACCTTCCCAGGAACCAAACACACAAAC...,human_ZNF281,human_ZNF281_tile_227,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CTACACATCAGCAGTTGACACCTTCCCAGG,AACCAAACACACAAACGCAC,CCTGGGAAGGTGTCAACTGCTGATGTGTAG,AACAGAGGAGA,...,0.154633,0.039786,0.114848,3.0,1.1,1.1,CCTGGGAAGGTGTCAACTGCTGATGTGTAGAACAGAGGAGACTACA...,Q2,Bad,0
154635,AACCAAACACACAAACGCACGGGACCAACATTACAATGCATTCTAA...,CCCTTTAGAATGCATTGTAATGTTGGTCCCAACCAAACACACAAAC...,random_sequences,random_sequence_86097,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,CCCTTTAGAATGCATTGTAATGTTGGTCCC,AACCAAACACACAAACGCAC,GGGACCAACATTACAATGCATTCTAAAGGG,AACAGAGGAGA,...,0.034915,0.142617,-0.107702,2.0,1.1,1.1,GGGACCAACATTACAATGCATTCTAAAGGGAACAGAGGAGACCCTT...,Q1,Bad,0
188168,AACCAAACACACAAACGCACTATTATTCAGTTTGTAAACGAGGATA...,TTTTTATCCTCGTTTACAAACTGAATAATAAACCAAACACACAAAC...,smallpox,smallpox_tile_21424,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,TTTTTATCCTCGTTTACAAACTGAATAATA,AACCAAACACACAAACGCAC,TATTATTCAGTTTGTAAACGAGGATAAAAA,AACAGAGGAGA,...,0.624165,0.305005,0.319161,4.0,3.0,3.0,TATTATTCAGTTTGTAAACGAGGATAAAAAAACAGAGGAGATTTTT...,Q3,Bad,0
156905,AACCAAACACACAAACGCACGGGCTTTCTCCACGAAAATGACCTTG...,ACTCCAAGGTCATTTTCGTGGAGAAAGCCCAACCAAACACACAAAC...,human_NFATC1,human_NFATC1_tile_188,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,ACTCCAAGGTCATTTTCGTGGAGAAAGCCC,AACCAAACACACAAACGCAC,GGGCTTTCTCCACGAAAATGACCTTGGAGT,AACAGAGGAGA,...,0.05795,0.063176,-0.005226,2.0,1.1,1.1,GGGCTTTCTCCACGAAAATGACCTTGGAGTAACAGAGGAGAACTCC...,Q1,Bad,0
128239,AACCAAACACACAAACGCACGATTATATAGGACAATTTGACATGAG...,GAATCTCATGTCAAATTGTCCTATATAATCAACCAAACACACAAAC...,smallpox,smallpox_tile_11433,CTCTGGGCTAACTGTCGCGC,TAATACGACTCACTATAGGG,GAATCTCATGTCAAATTGTCCTATATAATC,AACCAAACACACAAACGCAC,GATTATATAGGACAATTTGACATGAGATTC,AACAGAGGAGA,...,1.0,0.515572,0.484428,1.1,4.0,1.1,GATTATATAGGACAATTTGACATGAGATTCAACAGAGGAGAGAATC...,Q4,Good,1


In [3]:
# Skip-gram embedding hyper-parameters (training takes a long time):
# n-gram length, stride between consecutive n-grams, and embedding dimension.
ngram_size = 3
ngram_stride = 3
dim_embedding = 400

# Train the embedding. The helper returns the data frame (rebinding
# df_classify) together with the embedding index; per its log message it also
# saves the trained embeddings.
df_classify, embeddings_index = train_sg_embedding(
    df_classify = df_classify,
    ngram_size = ngram_size,
    ngram_stride = ngram_stride,
    dim_embedding = dim_embedding,
)

Embeddings trained and saved!


In [4]:
# Build the train/test splits plus the embedding lookup matrix and the padded
# sequence length that the model builders below take as inputs.
x_train, x_test, y_train, y_test, embedding_matrix, max_length = create_ds_splits(
    df_classify = df_classify,
    embeddings_index = embeddings_index,
    dim_embedding = dim_embedding,
)

In [8]:
# the optimal parameters were determined using grid-search so we define a series of functions for the 
# different architectures 

# create lstm grid-search model
def create_lstm_gs_model(embedding_matrix = embedding_matrix, max_length = max_length, neurons = 64,
                         dropout_rate1 = 0.0, dropout_rate2 = 0.0, weight_constraint = 0):
    """
    Build and compile a single-layer LSTM binary classifier on top of a frozen,
    pre-trained embedding.

    Layer stack: Embedding (not trainable) -> Dropout -> LSTM -> Dropout ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, the Adam optimizer
    and accuracy as the metric.

    Parameters
    ----------
    embedding_matrix : 2-D array used as the embedding weights; its first axis
        is taken as the vocabulary size and its second as the embedding size.
    max_length : input sequence length given to the Embedding layer.
    neurons : number of LSTM units.
    dropout_rate1 : dropout rate applied right after the embedding.
    dropout_rate2 : dropout rate applied right before the output layer.
    weight_constraint : value handed to max_norm() for the LSTM kernel.
        NOTE(review): the default of 0 caps the kernel norm at 0; the grid
        search always overrides it -- confirm the default is never relied on.

    The defaults for embedding_matrix and max_length are the notebook globals
    as they exist when this cell is executed.

    Returns
    -------
    The compiled Sequential model.
    """

    vocab_size = embedding_matrix.shape[0]
    embed_dim = embedding_matrix.shape[1]

    # layers are listed in the order they are stacked
    model = Sequential([
        Embedding(input_dim = vocab_size, output_dim = embed_dim,
                  weights = [embedding_matrix],
                  input_length = max_length,
                  trainable = False),
        Dropout(dropout_rate1),
        LSTM(units = neurons, kernel_constraint = max_norm(weight_constraint)),
        Dropout(dropout_rate2),
        Dense(1, activation = 'sigmoid', kernel_initializer = 'he_uniform'),  # binary classification
    ])
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

def create_bidlstm_gs_model(embedding_matrix = embedding_matrix, max_length = max_length, neurons = 64,
                            dropout_rate1 = 0.0, dropout_rate2 = 0.0,  weight_constraint = 0):
    """
    Build and compile a bidirectional-LSTM binary classifier on top of a
    frozen, pre-trained embedding.

    Layer stack: Embedding (not trainable) -> Dropout -> Bidirectional(LSTM)
    -> Dropout -> Dense(1, sigmoid). Compiled with binary cross-entropy, the
    Adam optimizer and accuracy as the metric.

    Parameters
    ----------
    embedding_matrix : 2-D array used as the embedding weights; its first axis
        is taken as the vocabulary size and its second as the embedding size.
    max_length : input sequence length given to the Embedding layer.
    neurons : number of units of the wrapped LSTM.
    dropout_rate1 : dropout rate applied right after the embedding.
    dropout_rate2 : dropout rate applied right before the output layer.
    weight_constraint : value handed to max_norm() for the LSTM kernel.

    The defaults for embedding_matrix and max_length are the notebook globals
    as they exist when this cell is executed.

    Returns
    -------
    The compiled Sequential model.
    """

    vocab_size = embedding_matrix.shape[0]
    embed_dim = embedding_matrix.shape[1]

    # layers are listed in the order they are stacked
    model = Sequential([
        Embedding(input_dim = vocab_size, output_dim = embed_dim,
                  weights = [embedding_matrix],
                  input_length = max_length,
                  trainable = False),
        Dropout(dropout_rate1),
        Bidirectional(LSTM(units = neurons, kernel_constraint = max_norm(weight_constraint))),
        Dropout(dropout_rate2),
        Dense(1, activation = 'sigmoid', kernel_initializer = 'he_uniform'),  # binary classification
    ])
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

def create_self_atten_gs_model(embedding_matrix = embedding_matrix, max_length = max_length, neurons = 64,
                               dropout_rate1 = 0.0, dropout_rate2 = 0.0,  weight_constraint = 0):
    """
    Build and compile a bidirectional-LSTM + self-attention binary classifier
    on top of a frozen, pre-trained embedding.

    Layer stack: Embedding (not trainable) -> Dropout -> Bidirectional(LSTM
    returning the full sequence) -> SeqSelfAttention (width 6, sigmoid) ->
    Flatten -> Dropout -> Dense(1, sigmoid). Compiled with binary
    cross-entropy, the Adam optimizer and accuracy as the metric.

    Parameters
    ----------
    embedding_matrix : 2-D array used as the embedding weights; its first axis
        is taken as the vocabulary size and its second as the embedding size.
    max_length : input sequence length given to the Embedding layer.
    neurons : number of units of the wrapped LSTM.
    dropout_rate1 : dropout rate applied right after the embedding.
    dropout_rate2 : dropout rate applied right before the output layer.
    weight_constraint : value handed to max_norm() for the LSTM kernel.

    The defaults for embedding_matrix and max_length are the notebook globals
    as they exist when this cell is executed.

    Returns
    -------
    The compiled Sequential model.
    """

    vocab_size = embedding_matrix.shape[0]
    embed_dim = embedding_matrix.shape[1]

    # NOTE(review): recurrent_activation = 'relu' replaces the LSTM's default
    # recurrent (gate) activation -- confirm this is intentional.
    recurrent_block = Bidirectional(LSTM(units = neurons, return_sequences = True,
                                         recurrent_activation = 'relu',
                                         kernel_constraint = max_norm(weight_constraint)))

    # layers are listed in the order they are stacked
    model = Sequential([
        Embedding(input_dim = vocab_size, output_dim = embed_dim,
                  weights = [embedding_matrix],
                  input_length = max_length,
                  trainable = False),
        Dropout(dropout_rate1),
        recurrent_block,
        SeqSelfAttention(attention_width = 6, attention_activation = 'sigmoid'),
        Flatten(),  # collapse the per-timestep outputs into one vector for the Dense layer
        Dropout(dropout_rate2),
        Dense(1, activation = 'sigmoid', kernel_initializer = 'he_uniform'),  # binary classification
    ])
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

def create_cnn_lstm_model(embedding_matrix = embedding_matrix, max_length = max_length, neurons = 64,
                          dropout_rate1 = 0.0, dropout_rate2 = 0.0, weight_constraint = 0, filters = 64):
    """
    Build and compile a CNN-LSTM binary classifier on top of a frozen,
    pre-trained embedding.

    Layer stack: Embedding (not trainable) -> Dropout -> Conv1D (kernel size 2,
    'same' padding, ReLU) -> MaxPooling1D (pool size 4) -> Dropout -> LSTM ->
    Dense(1, sigmoid). Compiled with binary cross-entropy, the Adam optimizer
    and accuracy as the metric.

    Parameters
    ----------
    embedding_matrix : 2-D array used as the embedding weights; its first axis
        is taken as the vocabulary size and its second as the embedding size.
    max_length : input sequence length given to the Embedding layer.
    neurons : number of LSTM units.
    dropout_rate1 : dropout rate applied right after the embedding.
    dropout_rate2 : dropout rate applied between the pooling layer and the LSTM.
    weight_constraint : value handed to max_norm() for the LSTM kernel.
    filters : number of Conv1D output filters. This used to be read from an
        undefined name inside the function; it is now an explicit keyword
        argument so the function no longer depends on a stray global.
        NOTE(review): the default of 64 is an assumption -- set it to the value
        used in the original experiments.

    The defaults for embedding_matrix and max_length are the notebook globals
    as they exist when this cell is executed.

    Returns
    -------
    The compiled Sequential model.
    """
    
    num_words = embedding_matrix.shape[0]
    dim_embedding = embedding_matrix.shape[1]
    
    # instantiate model
    model = Sequential()
    embedding_layer = Embedding(input_dim = num_words, output_dim = dim_embedding, 
                                weights = [embedding_matrix],
                                input_length = max_length,
                                trainable = False)
    model.add(embedding_layer)
    model.add(Dropout(dropout_rate1))
    # convolution + pooling shorten the sequence before it reaches the LSTM
    model.add(Conv1D(filters = filters, kernel_size = 2, padding = 'same', activation = 'relu'))
    model.add(MaxPooling1D(pool_size = 4))
    model.add(Dropout(dropout_rate2))
    model.add(LSTM(units = neurons, kernel_constraint = max_norm(weight_constraint)))
    model.add(Dense(1, activation = 'sigmoid', kernel_initializer = 'he_uniform')) # binary classification 
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
 
    return model

In [9]:
# Hyper-parameter values swept by the grid search (takes a very long time):
# 3 x 3 x 3 x 4 = 108 combinations, each evaluated once per CV fold.
neurons = [64, 128, 256]
dropout_rate1 = [0.2, 0.3, 0.4]
dropout_rate2 = [0.1, 0.2, 0.3]
weight_constraint = [1, 2, 3, 4]

# keys must match the keyword arguments of the model-building functions
param_grid = {
    'dropout_rate1': dropout_rate1,
    'dropout_rate2': dropout_rate2,
    'neurons': neurons,
    'weight_constraint': weight_constraint,
}

In [None]:
# Grid-search the hyper-parameters of one architecture. This cell searches the
# plain LSTM builder; swap build_fn to search one of the other architectures.
model = KerasClassifier(
    build_fn = create_lstm_gs_model,
    epochs = 100,
    batch_size = 128,
    verbose = 1,
)

# callback object passed to every fit
# NOTE(review): with n_jobs = -1 the fits may run in worker processes, so this
# particular object may not see the cross-validation fits -- verify before
# relying on its contents.
history = LossHistory()

# 5-fold cross-validation over param_grid, using all available cores
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = 5, n_jobs = -1)

# the extra keyword arguments are forwarded to the underlying Keras fit call
grid_result = grid.fit(x_train, y_train, callbacks = [history], validation_split = 0.1)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']

In [None]:
# Train the LSTM on the train/test splits with the pre-trained embedding.
# Note: this rebinds `history` and `model`, replacing the grid-search objects
# of the same names.
history, accs, aucs, model = train_lstm(
    x_train, x_test, y_train, y_test,
    embedding_matrix, max_length,
)