
# Goal: Quickly predict the ON and OFF values for sequences in bulk using trained models.

### Instructions: Please change `file_name` in the second code block to the file containing the sequences whose ON and OFF values you want to predict. The file should be a .csv with a single column, and that column should not have a header or column title.

In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

import keras as keras
from keras.models import load_model

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

Using TensorFlow backend.


# Part 1: Load in sequence data. 
## Change file_name here!

In [2]:
# Point these at a header-less, single-column .csv of switch sequences.
data_dir = 'data/toehold_sequences/'
file_name = 'example_switches.csv' # CHANGE FILENAME!

# Read the file without a header row, then give column 0 a descriptive name.
data_df = (
    pd.read_csv(data_dir + file_name, sep=',', header=None)
    .rename(columns={0: 'switch_sequence'})
)
data_df.head(3)

Unnamed: 0,switch_sequence
0,AAAAAAAAAAAAAAAAAATGGAAAACAGTTAACAGAGGAGAAACTG...
1,AAAAAAAAAAAAATGGAAAACAGTTACTAAAACAGAGGAGATTAGT...
2,AAAAAAAAATTACTACTATTGTTAATTTAGAACAGAGGAGACTAAA...


In [3]:
# Pull out the sequence column and report the basic dimensions of the input.
seqs = data_df['switch_sequence']
num_seqs = len(data_df)
# The reported length is taken from the first sequence only.
seq_len = len(seqs[0])
print('Toehold length: ', seq_len)
print('Number of sequences: ', num_seqs)

Toehold length:  59
Number of sequences:  99


# Part 2. Transform Data. One-hot encode sequences and extract target on and off values.

In [4]:
# Build the nucleotide alphabet in sorted order (DNA by default).
# For RNA toeholds, change the string on the next line to 'AUCG'.
alph_letters = sorted('ATCG')
alph = alph_letters.copy()

# pysster's encoder performs the one-hot encoding (very fast and simple)
one = One_Hot_Encoder(alph_letters)


def _get_one_hot_encoding(seq):
    """Return the one-hot encoding of `seq` from the shared pysster encoder `one`."""
    return one.encode(seq)

# Part 3. Load in final models.

In [5]:
def predict_with_given_model(final_model_path, final_weights_path, seqs):
    """Score every sequence with a saved Keras model and return a ranked table.

    Parameters
    ----------
    final_model_path
        Path handed to `load_model` to restore the model.
    final_weights_path
        Path of the weights file loaded into the restored model.
    seqs
        Sequences to score. Each one is one-hot encoded and the encodings
        are stacked into a single array, so they are assumed to share one
        length -- TODO confirm against the input data.

    Returns
    -------
    pandas.DataFrame
        Columns 'Toehold' and 'Predicted ON/OFF Ratio', sorted by the
        prediction in descending order.
    """
    net = load_model(final_model_path)
    net.load_weights(final_weights_path)

    # Encode each sequence, then stack into one float32 batch for the model.
    encoded = [_get_one_hot_encoding(s) for s in seqs]
    X = np.stack(encoded).astype(np.float32)
    onoff = net.predict(X)

    result = pd.DataFrame(columns=['Toehold', 'Predicted ON/OFF Ratio'])
    result['Toehold'] = seqs
    # Flatten the model output to one value per sequence.
    result['Predicted ON/OFF Ratio'] = np.reshape(onoff, [len(result),])
    return result.sort_values(by='Predicted ON/OFF Ratio', ascending=False)

In [6]:
# Predictions from the model trained on Angenent-Mari et al 2020 data
model_dir = 'clean_figures/fig4/models/'
final_model_path = model_dir + 'onoff_original_model.h5'
final_weights_path = model_dir + 'onoff_original_model_weights.h5'
# Score the sequences, then rename the columns so the tables can later be merged on 'seq'.
original_model_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "original"})





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.











In [7]:
# Predictions from the model trained only on Green et al 2014 data
final_model_path = model_dir + 'only_green_trained_model.h5'
final_weights_path = model_dir + 'only_green_trained_model_weights.h5'
# Score the sequences, then rename the columns so the tables can later be merged on 'seq'.
green_model_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "green"})

In [8]:
# Predictions from the model trained with transfer learning
final_model_path = model_dir + 'freeze_weights_tf_onoff_model.h5'
final_weights_path = model_dir + 'freeze_weights_tf_onoff_model_weights.h5'
# Score the sequences, then rename the columns so the tables can later be merged on 'seq'.
freeze_weights_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "freeze_weights"})

# Part 4. Put together a dataframe of ON/OFF values.

In [9]:
# Combine the three per-model prediction tables into one table keyed on the sequence.
# The right-hand tables are de-duplicated on 'seq' before merging: a sequence that
# appears k times in the input file would otherwise match k rows on each merge and
# end up as k*k*k rows. Copies of one sequence are expected to receive the same
# prediction from a given model, so keeping the first copy loses no information.
# The result keeps the row order of `original_model_data_df`.
preds = original_model_data_df
preds = preds.merge(right=green_model_data_df.drop_duplicates(subset='seq'), on='seq')
preds = preds.merge(right=freeze_weights_data_df.drop_duplicates(subset='seq'), on='seq')

# let's normalize each column by the max value
# NOTE(review): the ratio is only a meaningful "fraction of max" when the column
# maximum is positive -- confirm that holds for the models in use.
preds['original_pct_of_max'] = preds['original'] / np.max(preds['original'])
preds['green_pct_of_max'] = preds['green'] / np.max(preds['green'])
preds['freeze_weights_pct_of_max'] = preds['freeze_weights'] / np.max(preds['freeze_weights'])

# Part 5. Look at predictions.

In [10]:
# Fixed sequence motifs; presumably the ribosome binding site and the start
# codon of the switch -- neither is referenced in the cells visible here.
rbs = 'AACAGAGGAGA'
start_codon = 'ATG'

# Generate the reverse complement of a DNA strand
def make_rev_complement(string):
    """Return the reverse complement of the DNA sequence `string`.

    Only the upper-case bases A, T, C and G are complemented. Any other
    character triggers a printed warning and is left out of the result.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    complemented = []
    for base in string:
        if base in pairing:
            complemented.append(pairing[base])
        else:
            # Unknown symbol: warn and contribute nothing to the output.
            print('UH OH! Character not A, T, C, or G')
    return ''.join(reversed(complemented))

In [11]:
# Reverse-complement the first 30 nt of every switch sequence and keep it
# next to the predictions in its own column.
toeholds_30nt = [make_rev_complement(switch[:30]) for switch in preds['seq']]
preds['30nt_we_are_sensing'] = toeholds_30nt

In [12]:
# Rank all sequences by the transfer-learning model's prediction, best first.
preds = preds.sort_values(by='freeze_weights', ascending=False)
preds

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
1,AAAAAAATTATAATATTCGTATTAGCATCAAACAGAGGAGATGATG...,0.606177,0.122590,0.402334,0.977306,0.396618,1.000000,TGATGCTAATACGAATATTATAATTTTTTT
4,AAAAAAGACGATCAAGAATCCACAATATCAAACAGAGGAGATGATA...,0.564488,0.106070,0.388935,0.910093,0.343173,0.966695,TGATATTGTGGATTCTTGATCGTCTTTTTT
0,AAAAAATAAAACTAGAGATTGAAACGTGTTAACAGAGGAGAAACAC...,0.620253,0.037205,0.367080,1.000000,0.120370,0.912375,AACACGTTTCAATCTCTAGTTTTATTTTTT
2,AAAAAATAGCAAGGTATCTATTGAAGATGTAACAGAGGAGAACATC...,0.591277,0.109067,0.364902,0.953283,0.352866,0.906962,ACATCTTCAATAGATACCTTGCTATTTTTT
3,AAAAAACTCTTTGGTCATGTCTTTCTCTTTAACAGAGGAGAAAAGA...,0.589039,0.015357,0.363877,0.949675,0.049684,0.904416,AAAGAGAAAGACATGACCAAAGAGTTTTTT
...,...,...,...,...,...,...,...,...
96,AAAAAAATGACATTATTTAAGAGCATTCTAAACAGAGGAGATAGAA...,0.022622,0.042017,0.015981,0.036472,0.135937,0.039720,TAGAATGCTCTTAAATAATGTCATTTTTTT
84,AAAAAAAATAACGTAGGACTACTACTTGGAAACAGAGGAGATCCAA...,0.061793,0.244613,0.015027,0.099625,0.791403,0.037348,TCCAAGTAGTAGTCCTACGTTATTTTTTTT
85,AAAAAATGTCTAGTGTATCTAGATGAAAATAACAGAGGAGAATTTT...,0.058335,0.013765,0.013889,0.094051,0.044536,0.034521,ATTTTCATCTAGATACACTAGACATTTTTT
98,AAAAAACTTGTAAATATTAAAACATATGAAAACAGAGGAGATTCAT...,-0.016873,0.057240,0.002763,-0.027204,0.185191,0.006866,TTCATATGTTTTAATATTTACAAGTTTTTT


# Part 6: Get best and worst sequences.

In [13]:
# get good toeholds
num_best = 5
threshold = 1
increment = 0.0001

# A row can only ever pass the filters below if none of its three normalized
# scores is NaN or -inf. Cap the target at the number of such rows so the search
# always terminates; without the cap the loop never ends when fewer than
# `num_best` sequences are available (e.g. a short input file).
pct_cols = ['original_pct_of_max', 'green_pct_of_max', 'freeze_weights_pct_of_max']
num_reachable_best = int((preds[pct_cols].min(axis=1, skipna=False) > -np.inf).sum())
num_best_target = min(num_best, num_reachable_best)

# Lower the shared threshold in small steps until enough toeholds clear it
# for all three models at once.
while True:
    good_toeholds = preds[preds['original_pct_of_max'] > threshold]
    # We note that the Green model is likely enormously underfit but we had an abundance of toeholds pass both the
    # original model and transfer learning model threshold so decided to use another filtering step
    good_toeholds = good_toeholds[good_toeholds['green_pct_of_max'] > threshold]
    good_toeholds = good_toeholds[good_toeholds['freeze_weights_pct_of_max'] > threshold]
    if len(good_toeholds) >= num_best_target:
        break
    threshold = threshold - increment

good_toeholds

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
1,AAAAAAATTATAATATTCGTATTAGCATCAAACAGAGGAGATGATG...,0.606177,0.12259,0.402334,0.977306,0.396618,1.0,TGATGCTAATACGAATATTATAATTTTTTT
7,AAAAAATACATGCTCCTTTAATTTTGTTAAAACAGAGGAGATTAAC...,0.531353,0.214239,0.352012,0.856672,0.693135,0.874925,TTAACAAAATTAAAGGAGCATGTATTTTTT
25,AAAAAACACCTTCTCAACTTTAGATGCTTGAACAGAGGAGACAAGC...,0.390108,0.309088,0.286906,0.62895,1.0,0.713103,CAAGCATCTAAAGTTGAGAAGGTGTTTTTT
20,AAAAAAAAAAAAATGGAAAACAGTTACTAAAACAGAGGAGATTAGT...,0.419171,0.121461,0.26016,0.675807,0.392966,0.646627,TTAGTAACTGTTTTCCATTTTTTTTTTTTT
41,AAAAAAGGATTTCTCACCATAGCCAGCATCAACAGAGGAGAGATGC...,0.294863,0.135844,0.158091,0.475392,0.439501,0.392935,GATGCTGGCTATGGTGAGAAATCCTTTTTT


In [14]:
# get bad toeholds
num_worst = 5
threshold = -1
increment = 0.0001

# A row can only ever pass the filters below if none of its three raw
# predictions is NaN or +inf. Cap the target at the number of such rows so the
# search always terminates; without the cap the loop never ends when fewer than
# `num_worst` sequences are available (e.g. a short input file).
raw_cols = ['original', 'green', 'freeze_weights']
num_reachable_worst = int((preds[raw_cols].max(axis=1, skipna=False) < np.inf).sum())
num_worst_target = min(num_worst, num_reachable_worst)

# Raise the shared threshold in small steps until enough toeholds fall below it
# for all three models at once. Note this uses the raw predictions, not the
# *_pct_of_max columns.
while True:
    bad_toeholds = preds[preds['original'] < threshold]
    bad_toeholds = bad_toeholds[bad_toeholds['green'] < threshold]
    bad_toeholds = bad_toeholds[bad_toeholds['freeze_weights'] < threshold]
    if len(bad_toeholds) >= num_worst_target:
        break
    threshold = threshold + increment

bad_toeholds

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
94,AAAAAATACATATATTAGAATGAATACAACAACAGAGGAGAGTTGT...,0.039368,0.004367,0.035933,0.063471,0.014129,0.089311,GTTGTATTCATTCTAATATATGTATTTTTT
87,AAAAAAAGAAATTGGCCACAGGACCAAAGGAACAGAGGAGACCTTT...,0.048591,-0.00272,0.028047,0.07834,-0.0088,0.069711,CCTTTGGTCCTGTGGCCAATTTCTTTTTTT
91,AAAAAACTAGTCCATTTAAGAGGTATGTAGAACAGAGGAGACTACA...,0.043079,0.013636,0.016751,0.069454,0.044115,0.041634,CTACATACCTCTTAAATGGACTAGTTTTTT
96,AAAAAAATGACATTATTTAAGAGCATTCTAAACAGAGGAGATAGAA...,0.022622,0.042017,0.015981,0.036472,0.135937,0.03972,TAGAATGCTCTTAAATAATGTCATTTTTTT
97,AAAAAATCTTAACTATATAATATTCTAATAAACAGAGGAGATATTA...,0.002365,0.005108,-0.002922,0.003813,0.016525,-0.007261,TATTAGAATATTATATAGTTAAGATTTTTT
