In [2]:
import pandas as pd
import os

# All paths are resolved against the directory the notebook is running in.
data_dir_path = os.getcwd()
data_path = data_dir_path + '/DAASP_RNN_dataset.csv'

df = pd.read_csv(data_path)

# Keep the rows whose 'activity' equals 1, restricted to the exported columns.
is_active = df['activity'] == 1
kept_columns = ['ID', 'Name', 'Sequence', 'activity']
amp_df = df.loc[is_active, kept_columns]

# Length of the longest sequence among the kept rows.
max_len = max(len(sequence) for sequence in amp_df['Sequence'])

# Written with the default settings, so the DataFrame index is stored as well.
amp_df.to_csv(data_dir_path + '/amp.csv')

## Construct the fine-tuning dataset

In [6]:
import pandas as pd
import os
data_dir_path = os.getcwd()

data_path = data_dir_path + '/DAASP_RNN_dataset.csv'
df = pd.read_csv(data_path)
amp_df = df[df['activity'] == 1]
max_len = max(len(sequence) for sequence in amp_df['Sequence'])

special_token = '<|endoftext|>'

# De-duplicate the upper-cased sequences while keeping first-occurrence order.
# Iterating a plain set of strings yields a different order in every kernel
# session (string hashing is randomized), which would make the written file,
# and any seeded split computed from it, non-reproducible.
unique_seqs = list(dict.fromkeys(seq.upper() for seq in amp_df['Sequence']))
seq_set = set(unique_seqs)

print(len(amp_df))
print(len(seq_set))

# Open the output only after the data is ready, so a failure above cannot
# leave a truncated file behind.
# NOTE(review): the train/test split cell reads 'amp_finetune.txt', while this
# cell writes 'amp_finetunexxx.txt' — confirm whether the suffix is intentional.
with open(data_dir_path +'/amp_finetunexxx.txt', 'w') as f:
    for seq in unique_seqs:
        f.write(seq + '\n')

4774
4505


## Divide the sequence dataset into test set and training set

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
data_dir_path = os.getcwd()

# Read one sequence per line. Only the line terminator is stripped: slicing
# with row[:-1] would cut the last residue off the final line whenever the
# file does not end with a newline. Blank lines are skipped so they do not
# become empty sequences.
with open(data_dir_path + '/amp_finetune.txt') as f:
    amp_list = [row.rstrip('\n') for row in f if row.strip()]

df = pd.DataFrame(amp_list, columns=['Sequence'])
x = df.iloc[:, :]
# train_test_split is given a label vector; the labels are constant placeholders.
y = [0] * len(x)

# 80/20 split with a fixed seed so the partition is repeatable for a given input file.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(len(X_train),len(X_test), len(X_train)+len(X_test))

# Both files keep the DataFrame index column (default to_csv behaviour).
X_train.to_csv(data_dir_path + '/train_amp.csv', encoding='utf-8')
X_test.to_csv(data_dir_path + '/test_amp.csv', encoding='utf-8')

3604 901 4505


## Define datasets for low-data scenarios

In [1]:
import pandas as pd
import os

data_dir_path = os.getcwd()

# Size of the subset to draw; values used so far: 50 100 200 500 1000 2000 3000
sample_num = 2000

train_csv_path = data_dir_path + '/train_amp.csv'
subset_csv_path = data_dir_path + f'/train_amp{sample_num}.csv'

train_amp_df = pd.read_csv(train_csv_path)

# Draw sample_num rows without replacement, using a fixed seed.
sampled_data = train_amp_df.sample(n=sample_num, random_state=42)

# The subset is stored without its index column.
sampled_data.to_csv(subset_csv_path, index=False)

## Generate random sequences whose amino acid composition follows the distribution observed in the training sequences

In [7]:
# Statistically analyze the amino acid distribution of the sequences loaded below.
# NOTE(review): the section heading and the original comment refer to the AMP
# training dataset, but the file read here is test_amp.csv — confirm which
# split is intended.
import pandas as pd
import os
data_dir_path = os.getcwd()

alpha_data_path = data_dir_path + '/test_amp.csv'
alpha_df = pd.read_csv(alpha_data_path)
seq_list = alpha_df['Sequence'].tolist()
len_list = [len(seq) for seq in seq_list]

amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
# Both dictionaries share the key order of amino_acids.
amino_acids_count = dict.fromkeys(amino_acids, 0)
amino_acids_rate = dict.fromkeys(amino_acids, 0)

# Count every occurrence of the 20 listed residues; any other character is ignored.
for seq in seq_list:
    for aa in seq:
        if aa in amino_acids_count:
            amino_acids_count[aa] += 1

# Compute the total once instead of on every loop iteration, and fail with a
# clear message instead of a bare ZeroDivisionError when nothing was counted.
total_count = sum(amino_acids_count.values())
if total_count == 0:
    raise ValueError('No standard amino acids found in ' + alpha_data_path)

# Relative frequency of each residue among all counted residues.
for aa, count in amino_acids_count.items():
    amino_acids_rate[aa] = count / total_count

In [2]:
import random
import numpy as np
import os
data_dir_path = os.getcwd()

amino_acids = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
# amino_acids_rate comes from the amino-acid statistics cell, which must be run first.
# Look the weights up per residue so they stay aligned with amino_acids even if
# the dictionary's key order ever differs from this list.
amino_acid_distribution = [amino_acids_rate[aa] for aa in amino_acids]

seq_num = 1000
# Dedicated, seeded generator: the sequences are identical on every run
# (same seed value as the random_state used elsewhere in this notebook).
rng = random.Random(42)

random_seq_list = list()
for _ in range(seq_num):
    # Length is drawn uniformly from 10..60, both ends inclusive.
    sequence_length = rng.randint(10, 60)
    # Residues are drawn independently, weighted by the observed frequencies.
    random_sequence = ''.join(rng.choices(amino_acids, weights=amino_acid_distribution, k=sequence_length))
    random_seq_list.append(random_sequence)

# np.save converts the list of strings to an array before writing.
np.save(data_dir_path + '/random_seq_list.npy', random_seq_list)