In [7]:
import numpy as np
import pandas as pd
from time import time
import pickle

test_dataset_samples_file = 'datasets/splits/test/samples.tsv'

output_samples_eval_file_atg = 'outputs/TIS_Predictor_samples_eval_ATG.tsv'
output_samples_eval_file_nc = 'outputs/TIS_Predictor_samples_eval_NC.tsv'

In [8]:
upstream_size = 10
downstream_size = 13

df_samples = pd.read_csv(test_dataset_samples_file, sep='\t', header=0)


def decrease_length_samples(samples, upstream_size, downstream_size):

    tis_start_idx = 100

    up_first_idx = tis_start_idx - upstream_size
    down_last_idx = tis_start_idx + downstream_size

    trans_samples = []
    for sample in samples:
        new_sample = sample[up_first_idx:tis_start_idx] + sample[tis_start_idx:down_last_idx]
        trans_samples.append(new_sample)

    return trans_samples


def TIS_Predictor_encoding(samples):

    coded_samples = []
    
    for sample in samples:

        coded_sample = []
        for symbol in sample:
            if   symbol == 'A': cod_symbol = [1,0,0,0,0]
            elif symbol == 'C': cod_symbol = [0,1,0,0,0]
            elif symbol == 'G': cod_symbol = [0,0,1,0,0]
            elif symbol == 'T': cod_symbol = [0,0,0,0,1]
            else:               cod_symbol = [0,0,0,1,0]

            coded_sample += cod_symbol
        
        coded_samples.append(np.array(coded_sample))

    return coded_samples

### ATG prediction

In [9]:
df_samples_atg = df_samples[df_samples['TIS_type'] == 'ATG'].copy()
df_samples_atg.shape

(19303, 7)

In [10]:
X_test = df_samples_atg['sample_na'].tolist()
X_test = decrease_length_samples(X_test, upstream_size, downstream_size)
X_test = TIS_Predictor_encoding(X_test)

y_test = np.array(df_samples_atg['sample_label'])

In [5]:
model_file = 'TIS_Predictor/atg_rfc.sav'  # model for ATG start codons

start_time = time()

rfc = pickle.load(open(model_file, 'rb'))

y_pred_labels = []
y_pred_scores = []

for i in range(len(X_test)):
    y_pred_labels = np.append(y_pred_labels, rfc.predict(X_test[i].reshape(1,-1)))
    y_pred_scores = np.append(y_pred_scores, rfc.predict_proba(X_test[i].reshape(1,-1))[:,1])

y_pred_labels = np.where(y_pred_labels == 0, -1, y_pred_labels)  # change labels 0 to -1

end_time = time()
pred_time = end_time - start_time
pred_time

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


2078.3097007274628

In [11]:
df_samples_atg['TIS_predictor_real_label'] = y_test
df_samples_atg['TIS_predictor_pred_label'] = y_pred_labels
df_samples_atg['TIS_predictor_score'] = y_pred_scores

df_samples_atg.to_csv(output_samples_eval_file_atg, sep='\t', index=False)

### Near-cognate prediction

In [12]:
df_samples_nc = df_samples[df_samples['TIS_type'] != 'ATG'].copy()
df_samples_nc.shape

(52880, 7)

In [13]:
X_test = df_samples_nc['sample_na'].tolist()
X_test = decrease_length_samples(X_test, upstream_size, downstream_size)
X_test = TIS_Predictor_encoding(X_test)

y_test = np.array(df_samples_nc['sample_label'])

In [14]:
model_file = 'TIS_Predictor/near-cognate_rfc.sav'  # model for near cognates start codons

start_time = time()

rfc = pickle.load(open(model_file, 'rb'))

y_pred_labels = []
y_pred_scores = []

for i in range(len(X_test)):
    y_pred_labels = np.append(y_pred_labels, rfc.predict(X_test[i].reshape(1,-1)))
    y_pred_scores = np.append(y_pred_scores, rfc.predict_proba(X_test[i].reshape(1,-1))[:,1])

y_pred_labels = np.where(y_pred_labels == 0, -1, y_pred_labels)  # change labels 0 to -1

end_time = time()
pred_time = end_time - start_time
pred_time

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


5771.190366268158

In [15]:
df_samples_nc['TIS_predictor_real_label'] = y_test
df_samples_nc['TIS_predictor_pred_label'] = y_pred_labels
df_samples_nc['TIS_predictor_score'] = y_pred_scores

df_samples_nc.to_csv(output_samples_eval_file_nc, sep='\t', index=False)