### Prepare test sets

In [6]:
import os
import random

import pandas as pd
from Bio import SeqIO

# Fold identifier, kept as a string because it is spliced directly into the
# train/val FASTA file names and into the output directory name further down.
FOLD_NO = "6"


def read_fasta(file_path, label):
    """Parse the FASTA file at *file_path* into labelled sequences.

    Each record's sequence is converted to a plain string and paired with the
    same *label*. The (sequence, label) tuples are returned as a list, in the
    order the parser yields the records.
    """
    with open(file_path, "r") as handle:
        return [
            (str(entry.seq), label)
            for entry in SeqIO.parse(handle, "fasta")
        ]


def combine_and_shuffle(sequences1, sequences2, *, seed=None):
    """Concatenate two lists of (sequence, label) tuples, shuffle, and tabulate.

    Args:
        sequences1: First list of (sequence, label) tuples.
        sequences2: Second list of (sequence, label) tuples.
        seed: Optional seed for a reproducible shuffle. When ``None`` (the
            default) the module-level ``random`` generator is used, as
            before. Otherwise a private ``random.Random(seed)`` does the
            shuffling, so the global generator state is left untouched.

    Returns:
        A pandas DataFrame with the columns ``seq`` and ``label`` containing
        the rows of both inputs in shuffled order. The input lists themselves
        are not modified.
    """
    # ``+`` builds a new list, so the in-place shuffle never touches the inputs.
    combined = sequences1 + sequences2
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(combined)
    return pd.DataFrame(combined, columns=["seq", "label"])


# Paths to the FASTA files for the fold selected by FOLD_NO. They stay plain
# strings and keep their original names so other cells can still use them.
path_negative_train = f"data(train+val)/negative/PB40/PB40_1z20_clu50_trn{FOLD_NO}.fa"
path_positive_train = f"data(train+val)/positive/bass_motif/pad/bass_ctm_motif_trn{FOLD_NO}.fa"
path_negative_val = f"data(train+val)/negative/PB40/PB40_1z20_clu50_val{FOLD_NO}.fa"
path_positive_val = f"data(train+val)/positive/bass_motif/pad/bass_ctm_motif_val{FOLD_NO}.fa"
# NOTE(review): the negative *test* path is identical to path_negative_val (it
# names a "_val" file). This looks like a copy-paste slip -- confirm the
# intended test file. Neither test path is read by the code visible here.
path_negative_test = f"data(train+val)/negative/PB40/PB40_1z20_clu50_val{FOLD_NO}.fa"
path_positive_test = "data(train+val)/positive/bass_motif/bass_ntm_motif_test.fa"

# Read sequences and assign labels (0 for the negative set, 1 for the positive set)
negative_sequences_train = read_fasta(path_negative_train, 0)
positive_sequences_train = read_fasta(path_positive_train, 1)
negative_sequences_val = read_fasta(path_negative_val, 0)
positive_sequences_val = read_fasta(path_positive_val, 1)

# Combine and shuffle datasets
shuffled_data_train = combine_and_shuffle(negative_sequences_train, positive_sequences_train)
shuffled_data_val = combine_and_shuffle(negative_sequences_val, positive_sequences_val)

# Save to CSV. DataFrame.to_csv does not create missing directories, so make
# sure the per-fold output directory exists before writing into it.
prepared_dir = f"data(train+val)/prepared/{FOLD_NO}"
os.makedirs(prepared_dir, exist_ok=True)
shuffled_data_train.to_csv(f"{prepared_dir}/bass_pb40.train.csv", index=False)
shuffled_data_val.to_csv(f"{prepared_dir}/bass_pb40.val.csv", index=False)