In [3]:
import numpy as np
import os

def process_data(filepath, save_path):
    """
    Convert a fasta-style text file (.txt) into a numpy file (.npy).

    The input is read line by line. A line starting with '>' is a header whose
    remaining characters are parsed one digit at a time into the label vector;
    every following non-header line is a sequence that receives the most
    recently read label. The result is saved as a dictionary:
        {seq1: label1, seq2: label2, ...}   seq (str), label (numpy.array)

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so a trailing empty line no longer creates an empty-string key and
    a header without a trailing newline keeps its last digit.

    param: filepath:  path of the fasta file holding the train or test data
           save_path: path of the numpy file to write
    return: None
    raises: ValueError if a sequence line appears before any header line, or
            if a header contains a character that is not a digit
    """
    seq_label_dict = dict()
    label = None  # label of the most recently read header line
    with open(filepath) as f:
        for line_no, raw in enumerate(f, start=1):
            line = raw.strip()
            if not line:
                # Blank lines carry neither a label nor a sequence.
                continue
            if line[0] == '>':
                label = np.array([int(digit) for digit in line[1:]])
            else:
                if label is None:
                    raise ValueError(
                        "sequence on line %d of %s has no preceding '>' header"
                        % (line_no, filepath)
                    )
                # A repeated sequence overwrites the label stored earlier.
                seq_label_dict[line] = label
    # np.save stores the dict as a pickled 0-d object array; load it back with
    # np.load(path, allow_pickle=True).item().
    np.save(save_path, seq_label_dict)


In [4]:
# "__file__" is a plain string here (notebooks define no __file__ variable), so
# abspath resolves it against the current working directory; taking dirname
# twice therefore yields the parent of the working directory.
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
data_dir = os.path.dirname(notebook_dir)
mftp_dir = os.path.join(data_dir, 'MFTP')   # MFTP dataset folder

# Input fasta-style text files.
train_data_path = os.path.join(mftp_dir, 'train.txt')
test_data_path = os.path.join(mftp_dir, 'test.txt')

# Output numpy files written by process_data.
process_train_data_path = os.path.join(mftp_dir, 'train_data.npy')
process_test_data_path = os.path.join(mftp_dir, 'test_data.npy')

# Convert the train split first, then the test split.
process_data(train_data_path, process_train_data_path)
process_data(test_data_path, process_test_data_path)

In [5]:
import numpy as np 
# Peptide type names; count_peptide_num looks a name up by the position of a
# set bit in the label vector, so the order of this list matters.
# NOTE(review): presumably this order matches the digit order of the '>' header
# lines in the MFTP files - confirm against the dataset description.
peptide_type = [
    'AAP', 'ABP', 'ACP', 'ACVP', 'ADP', 'AEP', 'AFP',
    'AHIVP', 'AHP', 'AIP', 'AMRSAP', 'APP', 'ATP', 'AVP',
    'BBP', 'BIP', 'CPP', 'DPPIP', 'QSP', 'SBP', 'THP',
]

# The .npy files hold a pickled dict wrapped in a 0-d object array, hence
# allow_pickle=True and .item(). Only load files produced by this notebook:
# unpickling untrusted files can execute arbitrary code.
train_data_npy = np.load(process_train_data_path, allow_pickle=True).item()
test_data_npy = np.load(process_test_data_path, allow_pickle=True).item()

def count_peptide_num(data_npy, peptide_type):
    """
    Count how many sequences carry each peptide type.

    param: data_npy:     dictionary mapping a sequence to its label vector
           peptide_type: list of type names, indexed by label position
    return: dictionary {type name: number of sequences whose label has a 1 at
            that type's position}; types that never occur are absent, and keys
            appear in the order in which they were first encountered
    """
    counts = {}
    for flags in data_npy.values():
        for position, flag in enumerate(flags):
            if flag == 1:
                type_name = peptide_type[position]
                counts[type_name] = counts.get(type_name, 0) + 1
    return counts

train_peptide_num = count_peptide_num(train_data_npy, peptide_type)
test_peptide_num = count_peptide_num(test_data_npy, peptide_type)
print(train_peptide_num)
print(test_peptide_num)
for key in train_peptide_num.keys():
    print(key, train_peptide_num[key] + test_peptide_num[key], train_peptide_num[key],"/", test_peptide_num[key])

{'CPP': 366, 'AIP': 1624, 'ABP': 1735, 'AHP': 758, 'ACP': 850, 'AFP': 1079, 'BIP': 274, 'AVP': 568, 'THP': 531, 'QSP': 171, 'ACVP': 98, 'AAP': 115, 'ADP': 400, 'SBP': 89, 'AMRSAP': 147, 'AEP': 48, 'APP': 218, 'DPPIP': 250, 'AHIVP': 82, 'ATP': 182, 'BBP': 92}
{'AFP': 273, 'ABP': 419, 'BIP': 61, 'AHP': 190, 'AMRSAP': 21, 'AAP': 18, 'ACP': 193, 'AVP': 143, 'AIP': 425, 'THP': 120, 'QSP': 49, 'AHIVP': 19, 'CPP': 93, 'ACVP': 28, 'ADP': 109, 'APP': 61, 'DPPIP': 63, 'BBP': 25, 'ATP': 60, 'SBP': 15, 'AEP': 10}
CPP 459 366 / 93
AIP 2049 1624 / 425
ABP 2154 1735 / 419
AHP 948 758 / 190
ACP 1043 850 / 193
AFP 1352 1079 / 273
BIP 335 274 / 61
AVP 711 568 / 143
THP 651 531 / 120
QSP 220 171 / 49
ACVP 126 98 / 28
AAP 133 115 / 18
ADP 509 400 / 109
SBP 104 89 / 15
AMRSAP 168 147 / 21
AEP 58 48 / 10
APP 279 218 / 61
DPPIP 313 250 / 63
AHIVP 101 82 / 19
ATP 242 182 / 60
BBP 117 92 / 25
