Goal: use models to predict best and worst SARS-CoV-2 toeholds.

In [1]:
# import statements 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants
%matplotlib inline

import keras as keras
from keras.models import load_model

from pysster.One_Hot_Encoder import One_Hot_Encoder
from sklearn import preprocessing
from keras.utils import to_categorical

from keras import backend as K 
from scipy.stats import pearsonr, spearmanr 

Using TensorFlow backend.


In [2]:
# Create the DNA alphabet; sorted('ATCG') yields ['A', 'C', 'G', 'T'].
alph_letters = sorted('ATCG')
# Separate list copy of the same alphabet letters.
alph = list(alph_letters)
# Module-level pysster encoder built over the alphabet; _get_one_hot_encoding uses it.
one = One_Hot_Encoder(alph_letters)

# one-hot encode with pysster (very fast and simple encoding)  
def _get_one_hot_encoding(seq):
    """Encode ``seq`` with the module-level pysster encoder ``one``.

    Returns whatever ``one.encode`` produces for the given sequence.
    """
    return one.encode(seq)

In [3]:
# Fixed sequence that turn_switch_to_toehold inserts between the switch region
# and the first stem (presumably the ribosome binding site -- confirm).
rbs = 'AACAGAGGAGA'
# Fixed codon that turn_switch_to_toehold places between the two stems.
start_codon = 'ATG'

# Function to generate the reverse complement of a DNA strand
def make_rev_complement(string):
    """Return the reverse complement of an upper-case DNA string.

    Each of A/T/C/G is swapped for its pairing base and the result is
    reversed. Any other character triggers a printed warning and contributes
    nothing to the output, so the result is shorter than the input when
    unexpected characters are present.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    complemented = []
    for base in string:
        if base in pairing:
            complemented.append(pairing[base])
        else:
            # Unknown symbol: warn and skip it (nothing is appended).
            print('UH OH! Character not A, T, C, or G')
    return ''.join(reversed(complemented))

In [4]:
# Function to generate the complement of a DNA strand
def make_complement(string):
    """Return the base-wise complement of an upper-case DNA string.

    The order of bases is kept (no reversal). Characters other than A/T/C/G
    trigger a printed warning and are left out of the result.
    """
    partner_of = dict(zip('ATCG', 'TAGC'))
    pieces = []
    for base in string:
        partner = partner_of.get(base)
        if partner is None:
            # Unknown symbol: warn and skip it.
            print('UH OH! Character not A, T, C, or G')
            continue
        pieces.append(partner)
    return ''.join(pieces)

In [5]:
## revised so that only in-frame stop codons are flagged
def check_for_stop(toehold):
    """Report whether a stop codon starts at one of the checked positions.

    The three bases starting at indices 47, 50, 53 and 56 of ``toehold`` are
    compared against TAG, TAA and TGA. Returns True if any of those
    positions holds one of the stop codons, otherwise False. Stop codons at
    any other offset are ignored.
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    frame_starts = (47, 50, 53, 56)
    # A three-base slice can only equal a codon that begins exactly at `pos`.
    return any(toehold[pos:pos + 3] in stop_codons for pos in frame_starts)

In [6]:
# Function that turns a switch sequence into a full toehold sequence
def turn_switch_to_toehold(switch):
    """Assemble a toehold sequence from a switch sequence.

    Layout of the returned string, left to right:
    ``switch`` + ``rbs`` + reverse complement of ``switch[24:30]``
    + ``start_codon`` + reverse complement of ``switch[12:21]``.
    """
    first_stem = make_rev_complement(switch[24:30])
    second_stem = make_rev_complement(switch[12:21])
    return ''.join((switch, rbs, first_stem, start_codon, second_stem))

In [7]:
def turn_long_seq_to_switches(long_genome):
    """Build a toehold for every 30-character window of ``long_genome``.

    Windows are taken at every offset (step 1). Each window is expanded with
    ``turn_switch_to_toehold``; toeholds for which ``check_for_stop`` returns
    True are discarded. The surviving toeholds are returned as a list in
    window order.
    """
    window = 30
    kept = []
    for start in range(len(long_genome) - window + 1):
        candidate = turn_switch_to_toehold(long_genome[start:start + window])
        if not check_for_stop(candidate):
            kept.append(candidate)
    return kept

In [8]:
def predict_with_given_model(final_model_path, final_weights_path, seqs):
    """Score toehold sequences with a saved Keras model.

    The model is loaded from ``final_model_path`` and its weights are then
    loaded from ``final_weights_path``. Every sequence in ``seqs`` is one-hot
    encoded, the encodings are stacked into one float32 array, and the model's
    predictions are flattened to one value per sequence.

    Returns a DataFrame with columns 'Toehold' and 'Predicted ON/OFF Ratio',
    sorted by the predicted value in descending order.
    """
    model = load_model(final_model_path)
    model.load_weights(final_weights_path)

    encoded = np.stack([_get_one_hot_encoding(s) for s in seqs]).astype(np.float32)
    # Flatten to a 1-D vector with exactly one prediction per input sequence.
    scores = np.reshape(model.predict(encoded), [len(seqs),])

    table = pd.DataFrame({'Toehold': seqs, 'Predicted ON/OFF Ratio': scores})
    return table.sort_values(by='Predicted ON/OFF Ratio', ascending=False)

In [11]:
# load in wuhan genome fasta (https://www.ncbi.nlm.nih.gov/nuccore/MN908947.3)
# NOTE(review): read_csv treats the file's first line as the column header by
# default, so iloc[0, 0] below is the first tab-separated field of the file's
# second line. This presumably relies on the file holding the whole genome on a
# single line after a FASTA header -- confirm, since a line-wrapped FASTA would
# contribute only its first sequence line here.
wuhan = pd.read_csv('data/4h_wuhan_fasta', sep = '\t')
long_genome = str(wuhan.iloc[0,0])
# Replace the loaded sequence by its reverse complement before windows are cut.
long_genome = make_rev_complement(long_genome)
# Candidate toeholds for every 30-character window, minus those check_for_stop rejects.
seqs = turn_long_seq_to_switches(long_genome)

In [13]:
# Score every candidate toehold with the model trained on Angenent-Mari et al 2020
# data; the result columns are renamed to 'seq' and 'original'.
model_dir = 'models/'
final_model_path = model_dir + 'onoff_original_model.h5'
final_weights_path = model_dir + 'onoff_original_model_weights.h5'
original_model_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "original"})

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [14]:
# Score every candidate toehold with the model trained only on Green et al 2014
# data; the result columns are renamed to 'seq' and 'green'.
final_model_path = model_dir + 'only_green_trained_model.h5'
final_weights_path = model_dir + 'only_green_trained_model_weights.h5'
green_model_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "green"})

In [15]:
# Score every candidate toehold with the transfer-learning model; the result
# columns are renamed to 'seq' and 'freeze_weights'.
final_model_path = model_dir + 'freeze_weights_tf_onoff_model.h5'
final_weights_path = model_dir + 'freeze_weights_tf_onoff_model_weights.h5'
freeze_weights_data_df = predict_with_given_model(
    final_model_path, final_weights_path, seqs
).rename(columns={"Toehold": "seq", "Predicted ON/OFF Ratio": "freeze_weights"})

In [16]:
# Join the three per-model prediction tables on the toehold sequence so that each
# row carries the 'original', 'green' and 'freeze_weights' scores.
preds = (
    original_model_data_df
    .merge(right = green_model_data_df, on = 'seq')
    .merge(right = freeze_weights_data_df, on = 'seq')
)

In [17]:
# Express each model's score as a fraction of that model's largest score,
# stored in a new '<model>_pct_of_max' column.
for score_col in ('original', 'green', 'freeze_weights'):
    preds[score_col + '_pct_of_max'] = preds[score_col] / np.max(preds[score_col])

In [18]:
# The first 30 characters of each toehold are its switch window; store the
# reverse complement of that window alongside the predictions.
toeholds_30nt = list(map(lambda toehold: make_rev_complement(toehold[0:30]), preds['seq']))
preds['30nt_we_are_sensing'] = toeholds_30nt

In [19]:
# Rank candidates by the 'freeze_weights' score, highest first, then display the table.
preds = preds.sort_values('freeze_weights', ascending = False)
preds

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
96,TGGAAAGGCAGAAACTTTTTGTTAGACTCAAACAGAGGAGATGAGT...,0.696069,-0.000932,0.506414,0.875128,-0.002052,1.000000,TGAGTCTAACAAAAAGTTTCTGCCTTTCCA
4,AGTCTGAAGTGAAGTAACTGTGTAATACAAAACAGAGGAGATTGTA...,0.770522,-0.058264,0.498901,0.968735,-0.128247,0.985165,TTGTATTACACAGTTACTTCACTTCAGACT
14,CACTTTTTAAGCACTGTCTTTGCCTCCTCTAACAGAGGAGAAGAGG...,0.743829,0.014207,0.496922,0.935175,0.031272,0.981257,AGAGGAGGCAAAGACAGTGCTTAAAAAGTG
1195,CCTCATAATAATTAGTAATATCTCTGCTATAACAGAGGAGAATAGC...,0.590535,0.102507,0.494631,0.742447,0.225632,0.976733,ATAGCAGAGATATTACTAATTATTATGAGG
17,CTTGGACTGAGATCTTTCATTTTACCGTCAAACAGAGGAGATGACG...,0.741766,0.016909,0.489988,0.932580,0.037219,0.967565,TGACGGTAAAATGAAAGATCTCAGTCCAAG
7,ACACCACCAAAAGAACATGGTGTAATGTCAAACAGAGGAGATGACA...,0.763507,0.018065,0.487464,0.959914,0.039763,0.962580,TGACATTACACCATGTTCTTTTGGTGGTGT
650,ACATGTCTTGGACAGTAAACTACGTCATCAAACAGAGGAGATGATG...,0.619671,0.081537,0.478472,0.779078,0.179474,0.944824,TGATGACGTAGTTTACTGTCCAAGACATGT
284,ATTTCTGTTCACCAATATTCCAGGCACCTTAACAGAGGAGAAAGGT...,0.652282,0.212431,0.478373,0.820078,0.467590,0.944629,AAGGTGCCTGGAATATTGGTGAACAGAAAT
485,AGAGATTCATTTAAATTCTTGGCAACCTCAAACAGAGGAGATGAGG...,0.633824,0.061176,0.475353,0.796871,0.134656,0.938665,TGAGGTTGCCAAGAATTTAAATGAATCTCT
86,TCATTTTCTTTTGTAACATTTTTAGTCTTAAACAGAGGAGATAAGA...,0.700750,0.085674,0.474874,0.881013,0.188580,0.937719,TAAGACTAAAAATGTTACAAAAGAAAATGA


In [20]:
# Keep only toeholds whose normalized score is above 70% of the per-model maximum
# for all three models (original, green, and freeze_weights).
GOOD_PCT_OF_MAX_CUTOFF = 0.70
good_toeholds = preds[
    (preds['original_pct_of_max'] > GOOD_PCT_OF_MAX_CUTOFF)
    & (preds['green_pct_of_max'] > GOOD_PCT_OF_MAX_CUTOFF)
    & (preds['freeze_weights_pct_of_max'] > GOOD_PCT_OF_MAX_CUTOFF)
]

good_toeholds
#good_toeholds.to_csv('good_toeholds.csv')

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
1229,CTTAAGCTTAAGTACACAATTTTGCATAGAAACAGAGGAGATCTAT...,0.589332,0.318168,0.389551,0.740934,0.700331,0.769234,TCTATGCAAAATTGTGTACTTAAGCTTAAG
392,TGCCACCAACACCCAACAATTTAATGTTGAAACAGAGGAGATCAAC...,0.640887,0.333391,0.383458,0.805751,0.733839,0.757203,TCAACATTAAATTGTTGGGTGTTGGTGGCA
397,AATTTTGAAGGTCACACTTTTCTAATAGCAAACAGAGGAGATGCTA...,0.640544,0.337879,0.381722,0.805319,0.743717,0.753774,TGCTATTAGAAAAGTGTGACCTTCAAAATT
1380,ACTTCTACTAAGCCACAAGTGCCATCTTTAAACAGAGGAGATAAAG...,0.582001,0.333875,0.380942,0.731717,0.734905,0.752235,TAAAGATGGCACTTGTGGCTTAGTAGAAGT
1235,CACTAGTGTAGGTGCACTTAATGGCATTACAACAGAGGAGAGTAAT...,0.589139,0.336096,0.362494,0.740691,0.739793,0.715806,GTAATGCCATTAAGTGCACCTACACTAGTG


In [26]:
# Keep only toeholds whose raw predicted value is below 0.01 for all three models
# (original, green, and freeze_weights). The cutoff is applied to the raw
# prediction columns, not to the *_pct_of_max columns.
BAD_RAW_SCORE_CUTOFF = 0.01
bad_toeholds = preds[
    (preds['original'] < BAD_RAW_SCORE_CUTOFF)
    & (preds['green'] < BAD_RAW_SCORE_CUTOFF)
    & (preds['freeze_weights'] < BAD_RAW_SCORE_CUTOFF)
]

bad_toeholds
#bad_toeholds.to_csv('bad_toeholds.csv')

Unnamed: 0,seq,original,green,freeze_weights,original_pct_of_max,green_pct_of_max,freeze_weights_pct_of_max,30nt_we_are_sensing
25012,AAATGGTAATTGTTTTAAATTAACAAAAGCAACAGAGGAGAGCTTT...,-0.000205,-0.014152,0.009814,-0.000258,-0.03115,0.019379,GCTTTTGTTAATTTAAAACAATTACCATTT
25016,AAATGTTTCACCTAAATTCAAGGCTTTAAGAACAGAGGAGACTTAA...,-0.00224,-0.010012,0.007229,-0.002816,-0.022038,0.014274,CTTAAAGCCTTGAATTTAGGTGAAACATTT
25017,TGTATAAACCCACAAATGTAAGTGAAAAAAAACAGAGGAGATTTTT...,-0.004137,-0.013009,0.006246,-0.005201,-0.028635,0.012335,TTTTTTCACTTACATTTGTGGGTTTATACA
25024,AATGTCCACACCCAAATTATTGAGTATTTTAACAGAGGAGAAAAAT...,-0.010577,-0.002248,0.006127,-0.013298,-0.004948,0.012099,AAAATACTCAATAATTTGGGTGTGGACATT
25027,GTTGTTTAATCCTTTAATAAAGTATAAATAAACAGAGGAGATATTT...,-0.031826,-0.016043,0.00494,-0.040013,-0.035312,0.009754,TATTTATACTTTATTAAAGGATTAAACAAC
25008,ATCAAAGTGTCCCTTATTTACAACATTAAAAACAGAGGAGATTTAA...,0.001745,0.001286,0.003786,0.002194,0.002831,0.007477,TTTAATGTTGTAAATAAGGGACACTTTGAT
25009,AACACCCAACAATTTAATGTTGAGTTTGAAAACAGAGGAGATTCAA...,0.001364,0.000682,0.00357,0.001715,0.0015,0.007049,TTCAAACTCAACATTAAATTGTTGGGTGTT
25023,TTGTTTTGATAATAAAGAACTGACTTAAAGAACAGAGGAGACTTTA...,-0.01002,-0.00215,-0.009237,-0.012597,-0.004733,-0.01824,CTTTAAGTCAGTTCTTTATTATCAAAACAA
