Code for model training

In [None]:
import pandas as pd
import numpy as np
import os
import random
import json
from time import time

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

def set_seed(seed):
    """Seed the random number generators used during training.

    Seeds Python's ``random`` module, NumPy and TensorFlow with the same
    value and exports it as ``PYTHONHASHSEED``. It then sets the
    ``TF_DETERMINISTIC_OPS`` and ``TF_CUDNN_DETERMINISTIC`` environment
    variables to ``'1'`` and limits TensorFlow to a single inter-op and a
    single intra-op thread.

    seed: seed value shared by all generators
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)

    # If fast training matters more than reproducibility, comment out the
    # code below.
    for flag in ('TF_DETERMINISTIC_OPS', 'TF_CUDNN_DETERMINISTIC'):
        os.environ[flag] = '1'
    threading_config = tf.config.threading
    threading_config.set_inter_op_parallelism_threads(1)
    threading_config.set_intra_op_parallelism_threads(1)

# One-hot encoding for the TESR sequence data set
def onehot(seq, seq_len=9):
    """
    One-hot encode nucleotide sequences.

    Each base is mapped to a 4-element vector in the channel order
    (A, T, G, C); the lookup is case-insensitive. Any other character
    (e.g. 'N') is encoded as an all-zero vector. Sequences shorter than
    `seq_len` are zero-padded at the end.

    seq: sequence of nucleotide strings (list, numpy array, ...),
    seq_len: number of positions in the encoded output (default=9)

    Returns a float32 numpy array of shape (len(seq), seq_len, 1, 4).
    Raises ValueError if a sequence is longer than `seq_len`.
    """
    # Channel index of every recognised base
    base_to_channel = {
        'a': 0, 'A': 0,
        't': 1, 'T': 1,
        'g': 2, 'G': 2,
        'c': 3, 'C': 3,
    }

    result = np.zeros((len(seq), seq_len, 1, 4), dtype=np.float32)
    for i, sequence in enumerate(seq):
        if len(sequence) > seq_len:
            raise ValueError(
                f"sequence {i} has length {len(sequence)}, "
                f"expected at most {seq_len}"
            )
        for j, base in enumerate(sequence):
            channel = base_to_channel.get(base)
            # Unknown characters keep the all-zero vector
            if channel is not None:
                result[i, j, 0, channel] = 1.0

    return result

In [None]:
# Global variable: path prefix of the model output directory.
# train_test_DeepTEST appends str(path_index) to it to build the directory name.
MODEL_FOLDER_DIR = './model'

In [None]:
# Load the training data: sequences and their averaged scores from the Excel sheet
print("...Loading data started...")
seq_data = pd.read_excel(
    "./data/Ecoli codon variant transformer.xlsx",
    sheet_name="score 1",
    engine='openpyxl',
)
seq = seq_data['sequence'].values

# One-hot encode the sequences, then drop the singleton axis: (N, 9, 1, 4) -> (N, 9, 4)
total_seq = onehot(seq).reshape(-1, 9, 4)
total_label = seq_data['score avg']
print("...Loading data finished...")

In [None]:
# Initialize the CNN+ model
def init_model_CNN_plus(seed=0):
    """
    Build and compile the CNN+ regression model.

    The network stacks three Conv1D layers (256 filters each, kernel sizes
    5, 4 and 3, 'same' padding, ReLU) with a MaxPool1D(3) after the first and
    the second convolution, flattens the result and feeds it through Dense
    layers of 1024 and 256 ReLU units to a single linear output. The model is
    compiled with the Adam optimizer and mean absolute error as both loss and
    metric.

    seed: seed value passed to set_seed() before the layers are created

    Returns the compiled keras.Model.
    """
    # Setting seed
    set_seed(seed)

    # Input shape is (Batch_size, 9, 4)
    input_seq = keras.Input(shape=(9,4,))
    y = keras.layers.Conv1D(256, 5, activation='relu', padding='same')(input_seq)
    y = keras.layers.MaxPool1D(3)(y)
    y = keras.layers.Conv1D(256, 4, activation='relu', padding='same')(y)
    y = keras.layers.MaxPool1D(3)(y)
    y = keras.layers.Conv1D(256, 3, activation='relu', padding='same')(y)
    output_seq = keras.layers.Flatten()(y)

    z = keras.layers.Dense(1024, activation = 'relu')(output_seq)
    z = keras.layers.Dense(256, activation = 'relu')(z)
    z = keras.layers.Dense(1, activation = 'linear')(z)

    model = keras.Model(inputs=input_seq, outputs=z)
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['mean_absolute_error'])

    # Print model structure. summary() prints the table itself and returns
    # None, so wrapping it in print() would emit an extra "None" line.
    model.summary()

    return model

In [None]:
# 5-Fold cross validation
def train_test_DeepTEST(total_seq, total_label, seed=0, cv_seed=0, path_index=0):
    """
    Train and evaluate the CNN+ model with 5-fold cross validation.

    For each fold a freshly initialized model is trained on 90% of the
    training split; the remaining 10% is the validation set used for
    checkpointing and early stopping. The best checkpoint is then reloaded
    and the training history plus the predictions for the train, validation
    and test sets are saved to the model directory. Returns None.

    total_seq: mRNA sequence data, shape=(Batch_size, 9, 4),
    total_label: TESR score (1-D array-like, e.g. a pandas Series),
    seed: seed value for initializing model weights,
    cv_seed: seed value for splitting the data (default=0),
    path_index: index number for model directory
    """
    # Model save directory; exist_ok avoids the check-then-create race
    MODEL_DIR = MODEL_FOLDER_DIR+str(path_index)+'/'
    os.makedirs(MODEL_DIR, exist_ok=True)

    # KFold yields positional indices, but indexing a pandas Series with an
    # integer array is label-based. Work on a plain array so the selection is
    # positional whatever index the labels carry.
    total_label = np.asarray(total_label)

    # for 5-fold cross validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=cv_seed)

    print(f"Seed {seed} Training model...")
    start = time()
    for fold, (train, test) in enumerate(kfold.split(total_label)):
        print(f"Seed {seed} Fold {fold}")
        # Common path prefix of every file written for this seed/fold
        run_prefix = os.path.join(MODEL_DIR, f"Seed{seed}-Fold{fold}")
        modelpath = run_prefix+".hdf5"
        model = init_model_CNN_plus(seed=seed) # Initialize model

        # Setting checkpoint and earlystopping callbacks
        checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=0, save_best_only=True)
        early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10)

        # Split Train and validation dataset
        train_seq, valid_seq, train_label, valid_label = train_test_split(total_seq[train], total_label[train], test_size=0.1, random_state=0)
        test_seq, test_label = total_seq[test], total_label[test]

        # ... Training ...
        history = model.fit(train_seq, train_label, validation_data=(valid_seq, valid_label),
                            epochs = 100, batch_size = 64, verbose = 1, callbacks = [early_stopping_callback, checkpointer])
        # Load the best model (the checkpoint with the lowest validation loss)
        model = keras.models.load_model(modelpath)

        # Predict the TESR score on the train, valid, and test set
        train_predict = model.predict(train_seq)
        val_predict = model.predict(valid_seq)
        test_predict = model.predict(test_seq)

        # Save train, validation, test results
        with open(run_prefix+".json", "w") as f:
            json.dump(history.history, f, ensure_ascii=False, indent=2) # Training results (train loss, val loss, ...)
        np.save(run_prefix+"-train_predict", train_predict) # TESR score for training set
        np.save(run_prefix+"-valid_predict", val_predict) # TESR score for validation set
        np.save(run_prefix+"-test_predict", test_predict) # TESR score for test set

        # Print MAE and r2 score
        print(f"Train MAE: {mean_absolute_error(train_label, train_predict):.4f}\tTrain r2 score: {r2_score(train_label, train_predict):.4f}")
        print(f"Valid MAE: {mean_absolute_error(valid_label, val_predict):.4f}\tValid r2 score: {r2_score(valid_label, val_predict):.4f}")
        print(f"Test MAE: {mean_absolute_error(test_label, test_predict):.4f}\tTest r2 score: {r2_score(test_label, test_predict):.4f}")

    end = time()
    print(f"Seed {seed} Model train complete... (time:{(end-start)/60:.2f}m)")

In [None]:
# Directory suffix: models are written to MODEL_FOLDER_DIR + path_index
path_index = '_CNN+'

# Train 25 models in total (5 seeds x 5-fold cross validation)
for seed in range(5):
    train_test_DeepTEST(
        total_seq,
        total_label,
        seed=seed,
        cv_seed=0,
        path_index=path_index,
    )

Code for figures & xlsx files

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import json

from tensorflow import keras

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error

# The model directory: trained models, training histories and saved
# predictions are read from here; the generated xlsx files and figures are
# written here as well.
MODEL_FOLDER_DIR = './model_CNN+'

# One-hot encoding for the TESR sequence data set
def onehot(seq, seq_len=9):
    """
    One-hot encode nucleotide sequences.

    Each base is mapped to a 4-element vector in the channel order
    (A, T, G, C); the lookup is case-insensitive. Any other character
    (e.g. 'N') is encoded as an all-zero vector. Sequences shorter than
    `seq_len` are zero-padded at the end.

    seq: sequence of nucleotide strings (list, numpy array, ...),
    seq_len: number of positions in the encoded output (default=9)

    Returns a float32 numpy array of shape (len(seq), seq_len, 1, 4).
    Raises ValueError if a sequence is longer than `seq_len`.
    """
    # Channel index of every recognised base
    base_to_channel = {
        'a': 0, 'A': 0,
        't': 1, 'T': 1,
        'g': 2, 'G': 2,
        'c': 3, 'C': 3,
    }

    result = np.zeros((len(seq), seq_len, 1, 4), dtype=np.float32)
    for i, sequence in enumerate(seq):
        if len(sequence) > seq_len:
            raise ValueError(
                f"sequence {i} has length {len(sequence)}, "
                f"expected at most {seq_len}"
            )
        for j, base in enumerate(sequence):
            channel = base_to_channel.get(base)
            # Unknown characters keep the all-zero vector
            if channel is not None:
                result[i, j, 0, channel] = 1.0

    return result

In [None]:
""" For TESR score xlsx file """

# Codon -> one-letter amino acid lookup. Keys are upper-case codons written
# with the letters A/C/G/T; '*' is the value for the codons TAA, TAG and TGA.
codon_table = {
    'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
    'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
    'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
    'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
    'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
    'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
    'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
} # 64 entries

# File names of the trained models; assumes that there are 25 models in the
# model directory (5 seed & 5 fold cross validation)
model_list = [f'Seed{i}-Fold{j}.hdf5' for i in range(5) for j in range(5)]

# Read the mRNA sequence data
seq_data = pd.read_excel("./data/All_seq_aaseq.xlsx", sheet_name="No_stop", engine='openpyxl')
seq = seq_data['sequence'].values

# Translate every sequence into its amino-acid string: the first three bases,
# the middle part and the last three bases are each looked up as one codon.
# The lookup is upper-cased because codon_table only has upper-case keys,
# while onehot() also accepts lower-case bases.
total_codon = []
for s in seq:
    s = s.upper()
    first, second, third = s[:3], s[3:-3], s[-3:]
    total_codon.append(codon_table[first]+codon_table[second]+codon_table[third])
total_codon = np.array(total_codon)

# Convert mRNA string sequence to one hot sequence
total_seq = onehot(seq)
total_seq = total_seq.reshape(-1,9,4)

for model_name in model_list:
    full_model_name = os.path.join(MODEL_FOLDER_DIR, model_name)
    # Load target model
    model = keras.models.load_model(full_model_name)

    # Predict the TESR score
    all_predict = model.predict(total_seq)
    # Save the mRNA string sequence, the predicted TESR score and the
    # amino-acid string; the file name is the model name without '.hdf5'
    # followed by 'request.xlsx'
    all_df = pd.DataFrame({"sequence": seq, "score": all_predict.reshape(-1), "AA sequence": total_codon})
    all_df.to_excel(os.path.join(MODEL_FOLDER_DIR, model_name[:-5]+'request.xlsx'), index=False)

In [None]:
""" For Figure 1 """

# Models to plot. Only two models are listed here; to plot all 25 models
# (5 seed & 5 fold cross validation) use
# [f'Seed{i}-Fold{j}' for i in range(5) for j in range(5)] instead.
model_list = ['Seed0-Fold0', 'Seed0-Fold1']

# Read the mRNA sequence data
seq_data = pd.read_excel("./data/Ecoli codon variant transformer.xlsx", sheet_name="score 1", engine='openpyxl')
total_label = seq_data['score avg']

# For getting the test set: rebuild the fold split with the same KFold
# settings as in training (n_splits=5, shuffle=True, random_state=0) so the
# test indices line up with the saved test predictions
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
test_indexes = [test for _, test in kfold.split(total_label)]

for model_name in model_list:
    # Read the training results (train loss, val_loss, ...)
    with open(os.path.join(MODEL_FOLDER_DIR, model_name+'.json'), 'r') as f:
        train_history = json.load(f)
    train_length = np.arange(len(train_history['loss']))

    # 1. Visualizing training results (training loss, validation loss)
    plt.figure(figsize=(7,7), dpi=300)
    plt.plot(train_length, train_history['val_loss'], "o", linestyle = "solid", c="red", markersize=0, label = "validation loss")
    plt.plot(train_length, train_history['loss'], "o", linestyle = "solid", c="blue", markersize=0, label = "training loss")
    plt.legend(loc = (0.55, 0.83), fontsize = 15)
    plt.title("DeepTESR training result", fontsize = 15)
    plt.xlabel("Epochs", fontsize = 15)
    plt.ylabel("Mean absolute error", fontsize = 15)
    # Figure saving directory
    plt.savefig(os.path.join(MODEL_FOLDER_DIR, model_name+'-train_figure.jpg'))

    # 2. Visualizing test results (Compare score between predicted score and true score)
    # The fold number is the last character of the model name
    fold_number = int(model_name[-1])
    test_predict = np.load(os.path.join(MODEL_FOLDER_DIR, model_name+'-test_predict.npy'))

    plt.figure(figsize = (5,5), dpi = 300)
    plt.scatter(total_label[test_indexes[fold_number]], test_predict, alpha = 0.03)
    plt.title("GFP score(Test set) - TESR score(Prediction)", fontsize = 10)
    plt.xlabel("GFP score(Test set)", fontsize = 10)
    plt.ylabel("TESR score(Prediction)", fontsize = 10)
    plt.xlim(1.0, 5.0)
    plt.xticks([1, 2, 3, 4, 5])
    plt.ylim(1.0, 5.0)
    plt.yticks([1, 2, 3, 4, 5])
    # Figure saving directory
    plt.savefig(os.path.join(MODEL_FOLDER_DIR, model_name+'-test_figure.jpg'))