In [1]:
import os
# "__file__" is a plain string here (not the module attribute), so abspath() resolves it
# against the current working directory; the two dirname() calls then yield the parent
# of the working directory, which is used as the root for every data path.
data_dir = os.path.dirname(os.path.dirname(os.path.abspath("__file__")))

# File path for each functional peptide: <data_dir>/MFBP/raw_dataset/<class folder>/<file>
ACP_file_path, ADP_file_path, AHP_file_path, AIP_file_path, AMP_file_path = (
    os.path.join(data_dir, 'MFBP', 'raw_dataset', class_folder, file_name)
    for class_folder, file_name in (
        ('ACP', 'ACPCD_.txt'),
        ('ADP', 'ADPCD_.txt'),
        ('AHP', 'AHPCD_.txt'),
        ('AIP', 'AIPCD_.txt'),
        ('AMP', 'AMPCD_.txt'),
    )
)

In [2]:
def get_seq_from_fasta(file):
    """
    Get sequences from files in fasta format.

    Header lines (starting with '>') are skipped. Every other non-blank line is
    taken as one complete sequence; lines are not joined, so a sequence wrapped
    over several lines would be returned as several entries. Only the trailing
    newline is removed from a line; any other whitespace is kept as-is.
    The number of sequences read is printed before returning.

    param: file: Path of the fasta file to be processed
    return:  list of sequences    list(seq1, seq2, ...)
    """
    seq_list = []
    with open(file) as f:
        # Iterate the file object directly instead of materialising readlines().
        for raw in f:
            if raw.startswith('>'):
                continue
            seq = raw[:-1] if raw.endswith('\n') else raw
            if not seq:
                # A blank line would otherwise be stored as an empty "sequence",
                # inflating the count and creating a bogus '' entry downstream.
                continue
            seq_list.append(seq)
    print(len(seq_list))
    return seq_list
# Read each class file once, in ACP/ADP/AHP/AIP/AMP order (every call prints its own
# sequence count), and keep the lists keyed by class name.
all_data_dict = {
    "ACP": get_seq_from_fasta(ACP_file_path),
    "ADP": get_seq_from_fasta(ADP_file_path),
    "AHP": get_seq_from_fasta(AHP_file_path),
    "AIP": get_seq_from_fasta(AIP_file_path),
    "AMP": get_seq_from_fasta(AMP_file_path),
}

# Per-class aliases that refer to the same list objects stored in all_data_dict.
ACP, ADP, AHP, AIP, AMP = (
    all_data_dict[class_name] for class_name in ("ACP", "ADP", "AHP", "AIP", "AMP")
)
print(f"ACP:{len(ACP)},  ADP:{len(ADP)}, AHP:{len(AHP)}, AIP:{len(AIP)}, AMP:{len(AMP)}")

646
514
868
1678
2409
ACP:646,  ADP:514, AHP:868, AIP:1678, AMP:2409


In [3]:
import numpy as np
def get_label_dict(all_data_dict):
    """
    Build one multi-hot label vector per distinct peptide sequence.

    Position i of a label is set to 1 when the sequence appears in the class
    whose id is i (ACP=0, ADP=1, AHP=2, AIP=3, AMP=4); a sequence listed under
    several classes therefore gets several 1s in the same vector.

    param: all_data_dict: Dictionary of peptides saved by category  {"ACP":ACP_seq_list, "ADP":ADP_seq_list, ...}
    return: dictionary of all peptide sequences   {seq1:label1, seq2:label2, ...}
    """
    label2id = {"ACP":0, "ADP":1, "AHP":2, "AIP":3, "AMP":4}
    n_classes = len(label2id)
    seq_label = {}
    for peptide_class, class_sequences in all_data_dict.items():
        for seq in class_sequences:
            if seq not in seq_label:
                # First time this sequence is seen: start from an all-zero label.
                seq_label[seq] = np.zeros(n_classes, dtype=int)
            # Mark membership in the current class (the array is updated in place).
            seq_label[seq][label2id[peptide_class]] = 1
    return seq_label

# Label every distinct sequence and store the whole {sequence: label} dictionary
# as <data_dir>/MFBP/all_data.npy.
all_data_file_path = os.path.join(data_dir, 'MFBP', 'all_data.npy')
seq_label = get_label_dict(all_data_dict)
np.save(all_data_file_path, seq_label)

### Get training data and test data
Process the training-set and test-set sequence files produced by get_train_test.py,  <br>
look up the label of every training and test sequence, and save the results as NumPy files. <br>
Each NumPy file stores a dictionary in which every key is a sequence and the corresponding value is that sequence's label: <br>
- train_data.npy  {seq1:label1, seq2:label2, ....}   <br>
- test_data.npy  {seq1:label1, seq2:label2, ....}  <br>

In [11]:
import numpy as np
# Training-split sequences.
train_seq_path = os.path.join(data_dir, 'MFBP', 'seq_data', 'tr_seq.npy')
train_seq = np.load(train_seq_path)

# Test-split sequences.
test_seq_path = os.path.join(data_dir, 'MFBP', 'seq_data', 'te_seq.npy')
test_seq = np.load(test_seq_path)

# Full {sequence: label} dictionary. It was written with np.save as a pickled object,
# hence allow_pickle=True and .item() to get the dict back.
# NOTE: unpickling can execute arbitrary code; only load an all_data.npy you created yourself.
all_data_file_path = os.path.join(data_dir, 'MFBP', 'all_data.npy')
all_seq_label_data = np.load(all_data_file_path, allow_pickle=True).item()

def get_train_test_seq_label(seq, all_seq_label_data, save_path):
    """
    Look up the label of every sequence in a split and save the result with np.save.

    Duplicate sequences are stored once. Entries are written in first-occurrence
    order of `seq`, so the saved dictionary is identical from run to run (a set
    would order the keys by string hash, which changes between interpreter sessions).

    param: seq: iterable of sequences belonging to the split
    param: all_seq_label_data: dictionary of all sequences   {seq1:label1, seq2:label2, ...}
    param: save_path: path of the .npy file to write
    return: None; the dictionary {seq1:label1, seq2:label2, ...} is written to save_path
            and can be read back with np.load(save_path, allow_pickle=True).item()
    raises: KeyError if a sequence of the split is missing from all_seq_label_data
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_seqs = dict.fromkeys(seq)
    seq_label_dict = {one_seq: all_seq_label_data[one_seq] for one_seq in unique_seqs}
    np.save(save_path, seq_label_dict)

# Write the {sequence: label} dictionary of the training split, then of the test split.
train_save_path = os.path.join(data_dir, 'MFBP', 'train_data.npy')
get_train_test_seq_label(train_seq, all_seq_label_data, train_save_path)

test_save_path = os.path.join(data_dir, 'MFBP', 'test_data.npy')
get_train_test_seq_label(test_seq, all_seq_label_data, test_save_path)

## Data analysis