# This .ipynb code is used for data set generation and processing

## Data set generation

It is recommended to run the code below in JupyterLab.

Due to the file upload limit on GitHub, we split the training data by columns into chunks of 100 columns each (about 25 MB per file).  
The first 429 rows of the combined data frame are tRNA expression data, and the remaining rows are mRNA expression data.  
The code used for splitting is shown below:

In [None]:
# import os
# import pandas as pd

# def split_csv(input_csv_path, output_dir):
#     df = pd.read_csv(input_csv_path)
#     num_columns = df.shape[1]
#     num_splits = (num_columns + 99) // 100
#     os.makedirs(output_dir, exist_ok=True)
#     for i in range(num_splits):
#         start_col = i * 100
#         end_col = min((i + 1) * 100, num_columns)
#         chunk_df = df.iloc[:, start_col:end_col]
#         chunk_csv_path = os.path.join(output_dir, f"Rawdata_tRNA_mRNA_{i + 1}.csv")
#         chunk_df.to_csv(chunk_csv_path, index=False)
#         print(f"saved {i + 1} in {chunk_csv_path}")
        
# input_csv_path = "Rawdata_tRNA_mRNA.csv"
# output_dir = "1.Prepare data/Rawdata_tRNA_mRNA"
# split_csv(input_csv_path, output_dir)

##########      
By running the code below, you can merge the data to obtain the complete training data.               
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓                
↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓↓                 

In [None]:
import os
import re

import pandas as pd


def concatenate_csv(input_dir, output_csv_path):
    """Merge the column-wise CSV chunks in ``input_dir`` into one CSV file.

    Every ``*.csv`` file found in ``input_dir`` is read, the resulting frames
    are joined side by side (``axis=1``), and the result is written to
    ``output_csv_path`` without the pandas row index.

    Chunks are joined in ascending order of the number that ends their file
    name (``..._1.csv``, ``..._2.csv``, ..., ``..._10.csv``). ``os.listdir``
    returns names in arbitrary order, and a plain alphabetical sort would put
    ``_10`` before ``_2``, so the numeric suffix is used to keep the column
    order of the merged file reproducible.

    Args:
        input_dir: Directory that holds the chunk files.
        output_csv_path: Path of the merged CSV file to write.

    Raises:
        ValueError: If ``input_dir`` contains no ``.csv`` file.
    """

    def chunk_order(file_name):
        # Numbered chunks come first, ordered by their number; any other CSV
        # follows, ordered by name, so the overall order is always the same.
        match = re.search(r"(\d+)\.csv$", file_name)
        if match is None:
            return (1, 0, file_name)
        return (0, int(match.group(1)), file_name)

    csv_files = sorted(
        (f for f in os.listdir(input_dir) if f.endswith(".csv")),
        key=chunk_order,
    )
    if not csv_files:
        raise ValueError(f"no .csv files found in {input_dir!r}")

    dfs = [pd.read_csv(os.path.join(input_dir, csv_file)) for csv_file in csv_files]
    concatenated_df = pd.concat(dfs, axis=1)
    concatenated_df.to_csv(output_csv_path, index=False)
    print(f"save merged CSV in {output_csv_path}")

# Rebuild the full raw-data table from the directory of column chunks.
input_dir, output_csv_path = (
    "1.Prepare data/Rawdata_tRNA_mRNA",
    "Rawdata_tRNA_mRNA_1.csv",
)
concatenate_csv(input_dir=input_dir, output_csv_path=output_csv_path)

This produces "Rawdata_tRNA_mRNA_1.csv".            
↑↑↑                  
In "Rawdata_tRNA_mRNA.csv", the first 429 rows are tRNA expression data, which come from tRend. The remaining rows are mRNA expression data, which come from TCGA.

## Data set processing

The data obtained above is raw data. You can choose the method of interest to standardize the data. ↑          
Here are two standardization methods that do not affect the prediction results. ↓             
In application, we strongly recommend the first standardization method.↓

####### tRNA,log2.           
####### mRNA,rank.

In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Load the merged raw table; the 'gene_id' column becomes the row index.
df = pd.read_csv('Rawdata_tRNA_mRNA_1.csv', index_col='gene_id')

# Rows 0-428 are treated as tRNA expression, every later row as mRNA expression.
tRNA = df.iloc[0:429, :]
# DataFrame.rank() with its defaults ranks the values within each column.
mRNA_rank = df.iloc[429:df.shape[0], :].rank()

# log2(x + 1) on the whole frame at once; this replaces the deprecated
# element-wise DataFrame.applymap call and yields the same values.
tRNA_log2 = np.log2(tRNA + 1)

# Both parts keep df's columns and their own slice of the gene_id index, so
# stacking them restores the original row order and labels with no extra work.
normalized_data = pd.concat([tRNA_log2, mRNA_rank])
normalized_data.to_csv("tRNA_log2_mRNA_rank.csv", index=True)

####### tRNA,log2,norm.    
####### mRNA,log2,norm.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Load the merged raw table; the 'gene_id' column becomes the row index.
df = pd.read_csv('Rawdata_tRNA_mRNA_1.csv', index_col='gene_id')

# Rows 0-428 are treated as tRNA expression, every later row as mRNA expression.
tRNA = df.iloc[0:429, :]
mRNA = df.iloc[429:df.shape[0], :]

# One scaler object is refitted for each part; it maps every column onto
# the range [0, 10000].
scaler = MinMaxScaler(feature_range=(0, 10000))

# log2(x + 1) on the whole frame at once (replaces the deprecated
# element-wise DataFrame.applymap call; same values).
tRNA_log2 = np.log2(tRNA + 1)
# fit_transform returns a bare array, so the gene_id index and the column
# labels are attached again here instead of being patched in after the concat.
tRNA_log2_norm = pd.DataFrame(
    scaler.fit_transform(tRNA_log2), index=tRNA.index, columns=tRNA.columns
)

# NOTE(review): unlike the tRNA part there is no +1 pseudocount here, so a
# zero would become -inf -- confirm the mRNA values are strictly positive.
mRNA_log2 = np.log2(mRNA)
mRNA_log2_norm = pd.DataFrame(
    scaler.fit_transform(mRNA_log2), index=mRNA.index, columns=mRNA.columns
)

# Stack the two parts back in their original row order and save.
normalized_data = pd.concat([tRNA_log2_norm, mRNA_log2_norm])
normalized_data.to_csv("tRNA_log2_norm_mRNA_log2_norm.csv", index=True)