# This .ipynb code is used for data set generation and processing

## Data set generation

It is recommended to run the code below in JupyterLab.

#Due to the file upload limit on GitHub, we split the training data into parts of about 24 MB each (below the 25 MB limit).  
#The first 429 rows of the combined data frame are tRNA expression data, and the subsequent rows are mRNA expression data.  
#The splitting code is shown below:

In [None]:
import os
import pandas as pd

def split_csv_file(file_path, chunk_size=24*1024*1024, output_dir='Rawdata_tRNA_mRNA'):
    """Split a CSV file into numbered parts that each repeat the header row.

    Parts are written to ``output_dir`` as ``Rawdata_tRNA_mRNA_<n>.csv`` with
    ``n`` starting at 1. Every part ends on a row boundary, so each part is a
    valid CSV file on its own and the parts can be re-joined by dropping the
    header of every part after the first.

    Args:
        file_path: Path of the CSV file to split. Its first line is treated
            as the header.
        chunk_size: Approximate number of characters of row data per part
            (the file is read in text mode, so this counts characters, not
            bytes). A part can exceed this by at most the remainder of one
            row plus the header.
        output_dir: Directory that receives the parts; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(file_path, 'r') as file:
        header = file.readline()
        part_num = 1
        while True:
            current_chunk = file.read(chunk_size)
            if not current_chunk:
                break
            # read() stops after a fixed number of characters, which is
            # usually in the middle of a row; pull in the rest of that row so
            # no row is ever divided between two parts.
            if not current_chunk.endswith('\n'):
                current_chunk += file.readline()
            output_file_path = os.path.join(output_dir, f'Rawdata_tRNA_mRNA_{part_num}.csv')
            with open(output_file_path, 'w') as output_file:
                output_file.write(header)
                output_file.write(current_chunk)
            part_num += 1
# Split the full raw data file into parts under the default output directory 'Rawdata_tRNA_mRNA'.
split_csv_file("Rawdata_tRNA_mRNA.csv")

##########      
By running the code below, you can merge the data to obtain the complete training data.               
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓                
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓                 

In [4]:
import os
import pandas as pd

def merge_csv_files(input_dir, output_file_path):
    """Concatenate the CSV parts in ``input_dir`` back into a single CSV file.

    Only regular files ending in ``.csv`` are merged. Parts are ordered by
    the integer after the last underscore in the file name (``..._2.csv``
    before ``..._10.csv``); files without such a number come last, in
    alphabetical order. The header line is kept from the first part only.
    Part contents are joined as raw text, so parts whose data ends in the
    middle of a row are still re-assembled correctly.

    Args:
        input_dir: Directory containing the parts.
        output_file_path: Path of the merged file to write (overwritten if
            it already exists).
    """
    def part_order(file_name):
        # A plain string sort would put "..._10.csv" before "..._2.csv" and
        # scramble the rows once there are ten or more parts.
        suffix = os.path.splitext(file_name)[0].rpartition('_')[2]
        if suffix.isdecimal():
            return (0, int(suffix), file_name)
        return (1, 0, file_name)

    # Skip directories and unrelated files (e.g. ".ipynb_checkpoints").
    files = sorted(
        (
            name for name in os.listdir(input_dir)
            if name.lower().endswith('.csv')
            and os.path.isfile(os.path.join(input_dir, name))
        ),
        key=part_order,
    )
    with open(output_file_path, 'w') as output_file:
        for i, file_name in enumerate(files):
            file_path = os.path.join(input_dir, file_name)
            with open(file_path, 'r') as file:
                if i > 0:
                    # Every part repeats the header; keep only the first one.
                    file.readline()
                output_file.write(file.read())
# Write the merged data to "Rawdata_tRNA_mRNA.csv", the file name that the
# processing cells below read (an existing file of that name is overwritten).
merge_csv_files("Rawdata_tRNA_mRNA", "Rawdata_tRNA_mRNA.csv")

The merged file is "Rawdata_tRNA_mRNA.csv".  
↑↑↑                  
In "Rawdata_tRNA_mRNA.csv", the first 429 rows are tRNA expression data from tRend; the remaining rows are mRNA expression data from TCGA.

## Data set processing

The data obtained above is raw data. You can standardize it with whichever method you prefer. ↑  
Here are two standardization methods that do not affect the prediction results. ↓  
In practice, we strongly recommend the first standardization method. ↓

####### tRNA,log2.           
####### mRNA,rank.

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Standardization method 1: log2(x + 1) for the tRNA rows, rank for the mRNA rows.
# The result is written to "tRNA_log2_mRNA_rank.csv" with gene_id as the index.

# Number of leading rows that hold tRNA data; all later rows are mRNA data.
N_TRNA_ROWS = 429

df = pd.read_csv('Rawdata_tRNA_mRNA.csv', index_col='gene_id')
tRNA = df.iloc[:N_TRNA_ROWS, :]
# rank() uses its defaults: values are ranked within each column, and ties
# receive the average of their ranks.
mRNA_rank = df.iloc[N_TRNA_ROWS:, :].rank()

# Vectorized equivalent of applying log2(x + 1) to every cell; avoids the
# deprecated DataFrame.applymap and keeps the gene_id index and the columns.
tRNA_log2 = np.log2(tRNA + 1)

# Both pieces still carry their gene_id index and the original columns, so
# concatenating them restores the row order of df without re-attaching gene_id.
normalized_data = pd.concat([tRNA_log2, mRNA_rank])
normalized_data.to_csv("tRNA_log2_mRNA_rank.csv", index=True)

####### tRNA,log2,norm.    
####### mRNA,log2,norm.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Standardization method 2: log2 transform followed by min-max scaling to
# [0, 10000], done separately for the tRNA rows and the mRNA rows.
# The result is written to "tRNA_log2_norm_mRNA_log2_norm.csv" with gene_id as the index.

# Number of leading rows that hold tRNA data; all later rows are mRNA data.
N_TRNA_ROWS = 429

df = pd.read_csv('Rawdata_tRNA_mRNA.csv', index_col='gene_id')
tRNA = df.iloc[:N_TRNA_ROWS, :]
mRNA = df.iloc[N_TRNA_ROWS:, :]

# MinMaxScaler rescales each column independently; fit_transform re-fits the
# scaler on every call, so the tRNA and mRNA blocks are scaled separately.
scaler = MinMaxScaler(feature_range=(0, 10000))

# Vectorized log2(x + 1); replaces the deprecated DataFrame.applymap.
tRNA_log2 = np.log2(tRNA + 1)
# fit_transform returns a bare array, so re-attach the gene_id index and the
# column labels when wrapping it back into a DataFrame.
tRNA_log2_norm = pd.DataFrame(
    scaler.fit_transform(tRNA_log2), index=tRNA.index, columns=tRNA.columns
)

# NOTE(review): unlike the tRNA branch, this is log2(x) without the +1, so any
# zero in the mRNA data becomes -inf, which MinMaxScaler is expected to reject.
# Confirm the mRNA data has no zeros, or whether log2(x + 1) was intended.
mRNA_log2 = np.log2(mRNA)
mRNA_log2_norm = pd.DataFrame(
    scaler.fit_transform(mRNA_log2), index=mRNA.index, columns=mRNA.columns
)

# Both pieces carry their gene_id index, so the concatenation is in the same
# row order as df.
normalized_data = pd.concat([tRNA_log2_norm, mRNA_log2_norm])
normalized_data.to_csv("tRNA_log2_norm_mRNA_log2_norm.csv", index=True)