In [1]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import backend
import tensorflow as tf
import joblib
# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Looping covers CPU-only machines (empty list -> nothing to do) and keeps the
# setting identical on every GPU, which TensorFlow requires.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)


In [2]:

class RawData:
    """Table of UTR sequences read from a CSV file, plus its one-hot encoding.

    The CSV is read once at construction time; the sequences are taken from
    the ``utr`` column and encoded with ``onehot_encode``.
    """

    def __init__(self, fname):
        """Read ``fname`` with pandas and pre-compute the one-hot matrices."""
        self.df = pd.read_csv(fname)
        self.onehotmtx = onehot_encode(self.df, self.get_seqs())

    def get_data_size(self):
        """Return the number of rows in the table."""
        n_rows = self.df.shape[0]
        return n_rows

    def get_df(self):
        """Return the underlying DataFrame (the stored object itself, not a copy)."""
        return self.df

    def get_seqs(self):
        """Return the ``utr`` column of the table."""
        return self.df['utr']

    def get_onehotmtxs(self):
        """Return the one-hot array computed in ``__init__``."""
        return self.onehotmtx


def onehot_encode(data, seqs, max_len=50):
    """One-hot encode nucleotide sequences into a zero-padded array.

    Args:
        data: Sized object whose length gives the number of sequences to
            encode (the notebook passes the DataFrame the sequences come from).
        seqs: pandas Series of sequence strings, read positionally via ``.iloc``.
        max_len: Number of positions kept per sequence. Longer sequences are
            truncated, shorter ones stay zero-padded at the end. Defaults to 50.

    Returns:
        ``numpy.ndarray`` of shape ``(len(data), max_len, 4)``; the last axis
        is ordered a, c, g, t.

    Raises:
        KeyError: If a sequence contains a character other than
            a/c/g/t/n (case-insensitive) within the first ``max_len`` positions.
    """
    nuc_d = {'a': [1, 0, 0, 0], 'c': [0, 1, 0, 0], 'g': [0, 0, 1, 0], 't': [0, 0, 0, 1],
             'n': [0, 0, 0, 0]}  # ambiguous base: all-zero row, same as padding

    onehotmtx = np.zeros([len(data), max_len, 4])  ## padding with 0

    for i in range(len(data)):
        seq = seqs.iloc[i].lower()[:max_len]  ## trim the input data
        for n, x in enumerate(seq):
            if x not in nuc_d:
                # Same exception type as the bare dict lookup, but says where the problem is.
                raise KeyError(
                    "unsupported nucleotide %r at position %d of sequence %d" % (x, n, i)
                )
            onehotmtx[i, n] = nuc_d[x]
    return onehotmtx


## define customized loss

def weighted_squared_error(y_true, y_pred):
    """Mean squared error that up-weights samples with a positive target.

    Where ``y_true > 0`` the squared error is multiplied by ``(1 + y_true)``;
    elsewhere the plain squared error is used. The mean is taken over all
    elements.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions, broadcast-compatible with ``y_true``.

    Returns:
        Scalar tensor holding the mean (weighted) squared error.
    """
    # Uses tf ops directly: the file never binds the name ``K``, so the former
    # ``K.square`` / ``K.mean`` calls would raise NameError.
    squared_error = tf.square(y_pred - y_true)
    weighted_error = (1 + y_true) * squared_error

    is_positive = y_true > 0
    return tf.reduce_mean(tf.where(is_positive, weighted_error, squared_error))


## one loss per model output; the same keys are used for the metrics at compile time
losses = {}
losses["rl_output"] = weighted_squared_error
losses["decoded_output"] = 'categorical_crossentropy'



In [3]:

## locations of the trained model and of the scaler used to rescale its predictions
modelpath = "../models/Smart5UTR/Smart5UTR_egfp_m1pseudo2_Model.h5"
scalerpath = "../models/egfp_m1pseudo2.scaler"

## print numpy arrays in full and in fixed-point notation
np.set_printoptions(threshold=np.inf, suppress=True)

## load the network without its training configuration, then the scaler
## (joblib.load unpickles the file, so only load scalers from a trusted source)
model = keras.models.load_model(modelpath, compile=False)
scaler = joblib.load(scalerpath)

## attach the custom losses and per-output metrics
model.compile(
    loss=losses,
    metrics={'rl_output': 'mse', 'decoded_output': 'accuracy'},
)




In [4]:
## read the example UTRs and show the parsed table
example_csv = '../data/example_testUTRs.csv'
rawdata = RawData(example_csv)

rawdata.get_df()


Unnamed: 0,name,utr
0,1-INS-50AX2,GACTGAACAATTCAAACATTACAAACATTACTAACAAACCACTAAT...
1,2-INS-50AX2,GACACAAACTGAGAGACAAGAATTCAAGAGACGAACAAATAAAGAA...
2,3-INS-50AX2,GGATAATAACGGAAATAATAGAAGTGATAACTATTAAACTTAATAA...
3,4-INS-50AX2,GGCAAATAATAGAAATAATAATTATTAACACAATTAAACACAACGT...
4,5-INS-A50X2,AGGATTGCGGATATCATTATTATTGGAGAACCTTATTCGGGGGGGGCC
5,6-INS-A50X2,CCGGGAATTTGTTGTAGTGAGTTTTGTTTAAGAGTTTGATAGTAAG...
6,7-INS-A50X2,GCTGATTGTATTGCTCGTCTGTGATAACTCTAATAAACCTAGAAGG...
7,8-INSI-A50X2,TGTCGTGGATAAAGCTGACACCCAATTACTACGATTGTACAACGTA...


In [5]:
## predict MRL
test_data = rawdata.get_onehotmtxs()
# NOTE(review): output index 1 is assumed to be the MRL head ("rl_output") -- confirm the model's output order.
pred_labels = model.predict(test_data)[1].reshape(-1, 1)
# Hand the scaler a single-feature column (n_samples, 1), then flatten back
# to one value per sequence.
pred_labels = scaler.inverse_transform(pred_labels).reshape(-1)


# Work on a copy so adding the prediction column does not modify the frame
# held inside rawdata (get_df() returns the stored object itself).
results = rawdata.get_df().copy()
results[ "predictive MRL"] = pred_labels


In [6]:
# show the input table together with the added "predictive MRL" column
results


Unnamed: 0,name,utr,predictive MRL
0,1-INS-50AX2,GACTGAACAATTCAAACATTACAAACATTACTAACAAACCACTAAT...,7.692509
1,2-INS-50AX2,GACACAAACTGAGAGACAAGAATTCAAGAGACGAACAAATAAAGAA...,7.574214
2,3-INS-50AX2,GGATAATAACGGAAATAATAGAAGTGATAACTATTAAACTTAATAA...,7.536551
3,4-INS-50AX2,GGCAAATAATAGAAATAATAATTATTAACACAATTAAACACAACGT...,7.737272
4,5-INS-A50X2,AGGATTGCGGATATCATTATTATTGGAGAACCTTATTCGGGGGGGGCC,6.582326
5,6-INS-A50X2,CCGGGAATTTGTTGTAGTGAGTTTTGTTTAAGAGTTTGATAGTAAG...,6.918125
6,7-INS-A50X2,GCTGATTGTATTGCTCGTCTGTGATAACTCTAATAAACCTAGAAGG...,6.818521
7,8-INSI-A50X2,TGTCGTGGATAAAGCTGACACCCAATTACTACGATTGTACAACGTA...,5.411743


In [7]:
# Save the predictions next to the input data.
# NOTE(review): to_csv also writes the row index as an unnamed first column by default;
# pass index=False if that column is not wanted -- confirm what downstream readers expect.
results.to_csv("../data/example_testUTRs_prediction.csv")