# 按信号肽的存在切割数据集

In [4]:
# Directory prefix used for every FASTA file this notebook writes
# (note the trailing slash).
dataset_out_path = "data/T2SE/"

# Input FASTA files: the T2SE records, and the non-T2SE pool that negative
# samples are drawn from.
non_t2se_data_path = "data/T2SE/noT2SE_GDB.fasta"
t2se_data_path = "data/T2SE/T2SE.fasta"

In [45]:
import json
import pandas as pd
import itertools

import numpy as np
# Seed NumPy's global RNG so that the np.random-based sampling done later in
# this notebook is repeatable when the cells are run in the same order.
random_seed = 42
np.random.seed(random_seed)

from Bio import SeqIO, SeqRecord, Seq

In [31]:
# Load every record from the T2SE FASTA file into memory.
t2se_data = [*SeqIO.parse(t2se_data_path, "fasta")]

# Index the records by their FASTA id for direct lookup. If two records share
# an id, the later one silently replaces the earlier one.
t2se_data_dict = {record.id: record for record in t2se_data}

In [21]:
# Tab-separated prediction table. Lines commented out with '#' are ignored,
# and the first remaining line supplies the column names.
pred_result = pd.read_csv(
    "data/T2SE/prediction_results.txt",
    sep="\t",
    comment="#",
    header=0,
)

In [30]:
# Split the sequence IDs on the "Prediction" column: rows labelled "OTHER" go
# to with_nosp, rows with any other label go to with_sp.
# NOTE(review): presumably "OTHER" marks sequences without a predicted signal
# peptide - confirm against the predictor's output format.
is_other = pred_result["Prediction"] == "OTHER"
with_nosp = pred_result.loc[is_other, "ID"].to_list()
with_sp = pred_result.loc[~is_other, "ID"].to_list()

保存序列

In [37]:
# Write the T2SE records whose IDs are listed in with_sp to spT2SE.fasta.
# An ID that is missing from t2se_data_dict raises KeyError.
with open(f"{dataset_out_path}/spT2SE.fasta", "w+", encoding="UTF-8") as f:
    sp_records = [t2se_data_dict[record_id] for record_id in with_sp]
    SeqIO.write(sp_records, f, "fasta")

In [38]:
# Write the T2SE records whose IDs are listed in with_nosp to nospT2SE.fasta.
# An ID that is missing from t2se_data_dict raises KeyError.
with open(f"{dataset_out_path}/nospT2SE.fasta", "w+", encoding="UTF-8") as f:
    nosp_records = [t2se_data_dict[record_id] for record_id in with_nosp]
    SeqIO.write(nosp_records, f, "fasta")

获取相关特征

In [42]:
# Translation table mapping each symbol that dual_with_X replaces
# ("X", "Z", "*", "-", "B", "U") to "A".
_MASKED_RESIDUE_TABLE = str.maketrans("XZ*-BU", "AAAAAA")


def dual_with_X(seq: SeqRecord.SeqRecord) -> SeqRecord.SeqRecord:
    """Replace the symbols X, Z, *, -, B and U in a record's sequence with "A".

    The record is modified in place: its ``seq`` attribute is rebound to a new
    ``Seq`` built from the substituted string. Matching is case-sensitive, so
    lowercase letters are left untouched.

    Args:
        seq: The record whose sequence should be cleaned.

    Returns:
        The same record object that was passed in, after modification.
    """
    # One translate pass replaces the six chained str.replace calls; the
    # result is the same because every symbol maps to "A" and "A" itself is
    # never a source symbol.
    seq.seq = Seq.Seq(str(seq.seq).translate(_MASKED_RESIDUE_TABLE))
    return seq

In [43]:
def make_neg_dataset(
    need_size:int,
    non_t2se_data_path:str
):
    """Draw a random sample of records from a FASTA file.

    The whole file is parsed into memory, ``need_size`` distinct positions are
    drawn with NumPy's global RNG (no replacement), and each selected record
    is passed through ``dual_with_X`` before being returned.

    Args:
        need_size: Number of records to draw.
        non_t2se_data_path: Path of the FASTA file to sample from.

    Returns:
        A list with the selected records, in the order they were drawn.
    """
    candidates = list(SeqIO.parse(non_t2se_data_path, "fasta"))

    # Sample positions rather than records so each record is picked at most
    # once per call.
    positions = np.arange(len(candidates))
    drawn = np.random.choice(positions, size=need_size, replace=False)

    return [dual_with_X(candidates[position]) for position in drawn]

In [46]:
# Sample as many non-T2SE records as there are IDs in with_sp and save them.
# The draw consumes NumPy's global RNG, so the result depends on the order in
# which the sampling cells are executed.
with open(f"{dataset_out_path}/sp_paired_non_t2se.fasta", "w+", encoding="UTF-8") as f:
    sp_negatives = make_neg_dataset(
        need_size=len(with_sp),
        non_t2se_data_path=non_t2se_data_path,
    )
    SeqIO.write(sp_negatives, f, "fasta")

In [47]:
# Sample as many non-T2SE records as there are IDs in with_nosp and save them.
# The draw consumes NumPy's global RNG, so the result depends on the order in
# which the sampling cells are executed.
with open(f"{dataset_out_path}/nosp_paired_non_t2se.fasta", "w+", encoding="UTF-8") as f:
    nosp_negatives = make_neg_dataset(
        need_size=len(with_nosp),
        non_t2se_data_path=non_t2se_data_path,
    )
    SeqIO.write(nosp_negatives, f, "fasta")