## Creating the dataset

In [3]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

In [4]:
negatives_path = r"data\negatives\non_afp_raw.faa"
positives_path = r"data\positives\afp_all_raw.faa"

In [5]:
def get_unique_characters(filepath: list[str]):
    """
    Grabs the list of unique characters 
    """
    unique_chars = set()
    print("\n--- Checking for Unique Characters in FASTA Files ---")
    
    
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            # Only process lines that do NOT start with the FASTA header indicator '>'
            if not line.startswith('>'):
                # Convert to uppercase and add all characters to the set
                unique_chars.update(line.upper())

    return list(unique_chars)

In [6]:
"""
Assigns each amino acid to a number and returns dictionaries
like {'A': 1, 'R': 2} and
{1: 'A', 2: 'R'}
"""
amino_acids = 'ARNDCQEGHILKMFPSTWYV' # Standard 20 amino acids
aa_to_int = {aa: i for i, aa in enumerate(amino_acids)}
int_to_aa = {i: aa for aa, i in aa_to_int.items()}


In [7]:
unique_pos_characters = get_unique_characters(positives_path)

print(unique_pos_characters)
print("Length:", len(unique_pos_characters))

for i in unique_pos_characters:
    if i not in aa_to_int.keys():
        print(i)
        


--- Checking for Unique Characters in FASTA Files ---
['Q', 'K', 'P', 'C', 'S', 'V', 'N', 'E', 'M', 'Y', 'A', 'W', 'T', 'X', 'H', 'D', 'R', 'L', 'I', 'F', 'B', 'G']
Length: 22
X
B


In [8]:
unique_neg_characters = get_unique_characters(negatives_path)

print(unique_neg_characters)
print("Length:", len(unique_neg_characters))

for i in unique_neg_characters:
    if i not in aa_to_int.keys():
        print(i)
        


--- Checking for Unique Characters in FASTA Files ---
['Q', 'K', 'U', 'P', 'C', 'S', 'V', 'N', 'E', 'M', 'Y', 'A', 'W', 'T', 'X', 'H', 'D', 'R', 'L', 'I', 'Z', 'F', 'B', 'G']
Length: 24
U
X
Z
B


In [9]:
"""
Mapping of non-standard codes:
    - B (Asx: Asparagine or Aspartic Acid) is mapped to N (Asparagine), Index 2
    - X (Any/Unknown) is mapped to A (Alanine, a neutral placeholder), Index 0
    - U (Selenocysteine) is mapped to C (Cysteine, as they are chemically related), Index 4
    - Z (Glx: Glutamine or Glutamic Acid) is mapped to E (Glutamic Acid), Index 6
    Mappings based on this article: https://www.matrixscience.com/blog/non-standard-amino-acid-residues.html
"""

if 'N' in aa_to_int: aa_to_int['B'] = aa_to_int['N'] # B -> N
if 'A' in aa_to_int: aa_to_int['X'] = aa_to_int['A'] # X -> A
if 'C' in aa_to_int: aa_to_int['U'] = aa_to_int['C'] # U -> C
if 'E' in aa_to_int: aa_to_int['Z'] = aa_to_int['E'] # Z -> E

In [10]:
def parse_fasta_file(filepath, aa_to_int_map):
    """
    Reads a FASTA-formatted file, extracts protein sequences, encodes them
    to integers, and prepares the data for HMM training.

    Args:
        filepath (str): Path to the FASTA file.
        aa_to_int_map (dict): Mapping from amino acid character to integer.

    Returns:
        tuple: (X, lengths)
            X (np.ndarray): Concatenated, integer-encoded sequences (shape: (Total Length, 1)).
            lengths (np.ndarray): Array of individual sequence lengths (shape: (Number of Sequences,)).
    """
    all_sequences_int = []
    sequence_lengths = []
    current_sequence = ""
    # Initialize last_header to handle the first sequence and error reporting properly
    last_header = "N/A (first sequence)" 

    print(f"--- Processing file: {filepath} ---")
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()

            if line.startswith('>'):
                # Start of a new sequence header
                if current_sequence:
                    # Process the previous sequence
                    encoded_seq = []
                    
                    for aa in current_sequence:
                        if aa in aa_to_int_map:
                            encoded_seq.append(aa_to_int_map[aa])
                        else:
                            # Explicitly report any skipped, non-standard characters
                            print(f"Warning: Non-standard character '{aa}' found in sequence '{last_header}'. Skipping.")
                    
                    if encoded_seq:
                        all_sequences_int.extend(encoded_seq)
                        sequence_lengths.append(len(encoded_seq))

                last_header = line.split('|')[1] if '|' in line and len(line.split('|')) > 1 else line[1:30].strip() + "..."
                current_sequence = ""
            elif line:
                # Append sequence data (ignoring whitespaces/newlines)
                current_sequence += line.upper()

        # Process the very last sequence in the file
        if current_sequence:
            encoded_seq = []
            for aa in current_sequence:
                if aa in aa_to_int_map:
                    encoded_seq.append(aa_to_int_map[aa])
                else:
                    print(f"Non-standard character '{aa}' found in final sequence '{last_header}'. Skipping.")
                    
            if encoded_seq:
                all_sequences_int.extend(encoded_seq)
                sequence_lengths.append(len(encoded_seq))

    # Convert lists to NumPy arrays in the required format
    X = np.array(all_sequences_int).reshape(-1, 1)
    lengths = np.array(sequence_lengths, dtype=np.int32)

    return X, lengths

In [11]:
def shuffle_sequences(X, lengths, random_state=42):
    """
    Shuffles the individual sequences contained within the concatenated array X 
    while maintaining the integrity of each sequence.

    Args:
        X (np.ndarray): Concatenated, integer-encoded sequences (shape: (Total Length, 1)).
        lengths (np.ndarray): Array of individual sequence lengths.
        random_state (int, optional): Seed for reproducibility.

    Returns:
        tuple: (X_shuffled, lengths_shuffled)
            X_shuffled (np.ndarray): Concatenated sequences in shuffled order.
            lengths_shuffled (np.ndarray): Lengths corresponding to the shuffled sequences.
    """
    if random_state is not None:
        np.random.seed(random_state)
        np.random.seed(random_state) # Using standard random for list shuffle

    print("\n--- Shuffling Sequences ---")
    # np.split uses cumulative sums of lengths to find the indices for splitting
    sequences = np.split(X, np.cumsum(lengths)[:-1])
    # Each item in combined_data is (sequence_array, length_integer)
    combined_data = list(zip(sequences, lengths))
    np.random.shuffle(combined_data)
    
    shuffled_sequences, shuffled_lengths = zip(*combined_data)
    
    X_shuffled = np.concatenate(shuffled_sequences)
    
    # Convert shuffled lengths back to a NumPy array
    lengths_shuffled = np.array(shuffled_lengths, dtype=np.int32)
    
    print(f"Sequence order shuffled successfully. Total sequences: {len(lengths_shuffled)}")

    return X_shuffled, lengths_shuffled

In [12]:
positive_X, positive_lengths = parse_fasta_file(positives_path, aa_to_int)
negative_X, negative_lengths = parse_fasta_file(negatives_path, aa_to_int)

positive_X_shuffled, positive_lengths_shuffled = shuffle_sequences(positive_X, positive_lengths)
negative_X_shuffled, negative_lengths_shuffled = shuffle_sequences(negative_X, negative_lengths)

--- Processing file: data\positives\afp_all_raw.faa ---
--- Processing file: data\negatives\non_afp_raw.faa ---

--- Shuffling Sequences ---
Sequence order shuffled successfully. Total sequences: 48

--- Shuffling Sequences ---
Sequence order shuffled successfully. Total sequences: 5633


In [13]:
negative_lengths

array([ 294,  494, 1136, ...,  499,  206,  427],
      shape=(5633,), dtype=int32)

In [14]:
positive_lengths

array([ 82, 790, 218,  82, 168, 163,  87,  66, 128,  31,  37,  85,  33,
        45,  91,  97,  19,  16,  63,  63,  64,  88,  88,  87,  87,  91,
        87,  64,  63,  64,  64,  62,  33,  40,  91,  66,  64, 134, 175,
       125, 124, 261, 276, 253, 243, 112,  20, 892], dtype=int32)

In [15]:
lengths = [82, 790, 218,  82, 168, 163,  87,  66, 128,  31,  37,  85,  33,
        45,  91,  97,  19,  16,  63,  63,  64,  88,  88,  87,  87,  91,
        87,  64,  63,  64,  64,  62,  33,  40,  91,  66,  64, 134, 175,
       125, 124, 261, 276, 253, 243, 112,  20, 892]

sum_of_lengths = 0

for length in lengths:
    sum_of_lengths += length

print(sum_of_lengths)

6182


In [16]:
positive_X

array([[12],
       [ 0],
       [10],
       ...,
       [11],
       [ 0],
       [ 2]], shape=(6182, 1))

## Training the model

In [26]:
from hmmlearn.hmm import CategoricalHMM

In [27]:
positive_model = CategoricalHMM(
            n_components=1, # just 1 state for POC,
            n_features=20, # amount of amino acids, 
            n_iter=10,
            random_state=42
        ) 

positive_model.fit(positive_X_shuffled, positive_lengths_shuffled)

0,1,2
,n_components,1
,startprob_prior,1.0
,transmat_prior,1.0
,emissionprob_prior,1.0
,n_features,20
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1D31B240940
,n_iter,10
,tol,0.01
,verbose,False


In [28]:
# Train negative model
negative_model = CategoricalHMM(
    n_components=1, # just 1 state for POC,
    n_features=20, # amount of amino acids, 
    n_iter=10,
    random_state=42
)
negative_model.fit(negative_X_shuffled, negative_lengths_shuffled)

0,1,2
,n_components,1
,startprob_prior,1.0
,transmat_prior,1.0
,emissionprob_prior,1.0
,n_features,20
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1D33DE96040
,n_iter,10
,tol,0.01
,verbose,False


In [29]:
# Just basic testing to see if the model itself works
# not worried about data leakage for now
test_seq = positive_X[:82]
log_prob_pos = positive_model.score(test_seq)
log_prob_neg = negative_model.score(test_seq)

In [30]:
log_prob_neg

-228.51173766228416

In [31]:
log_prob_pos

-188.06176699019352

In [32]:
prediction = "positive" if log_prob_pos > log_prob_neg else "negative"
prediction

'positive'

### Hyperparameter tuning + K fold cross validation

In [33]:
def split_to_kfolds(sequence, sequence_length, kfolds):
    """
    takes in the sequence and the sequence length for the HMM
    and splits the sequences into kfolds
    so sequence_length looks like:
    [6, 7, 41, 22, 213, 6], if kfolds = 2, then we turn sequence_length into [[6, 7, 41], [22, 213, 6]]
    and we do the same for the sequence as well, except the sequence is structured a little different 
    array([[12],
       [ 0],
       [10],
       ...,
       [11],
       [ 0],
       [ 2]], shape=(6182, 1))
        
    Returns:
    --------
    sequence_folds : list of arrays
        List of k arrays, each containing concatenated sequences for that fold
    length_folds : list of arrays
        List of k arrays, each containing sequence lengths for that fold
    """
    
    n_sequences = len(sequence_length)
    
    # Split sequence_length into k folds
    length_folds = np.array_split(sequence_length, kfolds)
    
    # Calculate cumulative positions to know where each sequence starts
    cumsum_lengths = np.concatenate([[0], np.cumsum(sequence_length)])
    
    # Split sequences into k folds
    sequence_folds = []
    
    # Calculate how many complete sequences go into each fold
    sequences_per_fold = np.array_split(np.arange(n_sequences), kfolds)
    
    for fold_indices in sequences_per_fold:
        # Extract sequences for this fold
        fold_sequences = []
        for idx in fold_indices:
            start = cumsum_lengths[idx]
            end = cumsum_lengths[idx + 1]
            fold_sequences.append(sequence[start:end])
        
        # Concatenate all sequences in this fold
        if len(fold_sequences) > 0:
            sequence_folds.append(np.vstack(fold_sequences))
        else:
            sequence_folds.append(np.array([]).reshape(0, 1))
    
    return sequence_folds, length_folds
    

In [34]:
# making sure the negative data set is the same length as the positive one to save on training times
negative_lengths_shuffled = negative_lengths_shuffled[:48]

sum_of_lengths = 0

for length in negative_lengths_shuffled:
    sum_of_lengths += length

print(sum_of_lengths)
negative_X_shuffled = negative_X_shuffled[:15794]


15794


In [35]:
positive_sequence_folds, positive_length_folds = split_to_kfolds(positive_X_shuffled, positive_lengths_shuffled, 10)
positive_length_folds

[array([ 64, 124,  87, 253,  87], dtype=int32),
 array([134,  33,  63, 168,  91], dtype=int32),
 array([128,  82,  87, 125,  40], dtype=int32),
 array([ 45,  16, 112,  97,  31], dtype=int32),
 array([19, 64, 33, 20, 82], dtype=int32),
 array([ 62,  64, 163,  85,  91], dtype=int32),
 array([790, 243,  88, 218,  64], dtype=int32),
 array([ 66,  87, 261,  37,  88], dtype=int32),
 array([ 63, 892,  64,  66], dtype=int32),
 array([276,  91,  63, 175], dtype=int32)]

In [36]:
negative_sequence_folds, negative_length_folds = split_to_kfolds(negative_X_shuffled, negative_lengths_shuffled, 10)
negative_length_folds

[array([336, 400, 433, 464, 240], dtype=int32),
 array([855, 155, 196, 247, 412], dtype=int32),
 array([ 50, 299, 540, 143, 516], dtype=int32),
 array([261,  74, 102, 407, 175], dtype=int32),
 array([ 331,  109,  500,  477, 1026], dtype=int32),
 array([407, 149, 227, 662, 129], dtype=int32),
 array([521, 114, 366, 391,  90], dtype=int32),
 array([126, 220, 214, 539, 482], dtype=int32),
 array([410, 516,  34, 135], dtype=int32),
 array([ 71, 241, 453, 549], dtype=int32)]

In [37]:
def concatenate_array(data):
    """
    Removes the outermost list and returns the underlying values.
    If data is a list of arrays, concatenates them.
    If data is already an array, returns it as is.
    """

    if isinstance(data, list):
        if len(data) > 0 and isinstance(data[0], np.ndarray):
            # For sequence data (2D arrays), use vstack
            if len(data[0].shape) == 2:
                return np.vstack(data)
            # For length data (1D arrays), use concatenate
            else:
                return np.concatenate(data)
        else:
            # If it's a list of numbers or other types
            return np.array(data)
    else:
        return data


def training_model(positive_folds, positive_length_folds, negative_folds, negative_length_folds, components):
    """
    Train positive and negative HMM models using k-fold cross-validation and compare their performance.
    
    Parameters:
    -----------
    positive_folds : list of arrays
        List of k positive sequence arrays for each fold
    positive_length_folds : list of arrays
        List of k positive length arrays for each fold
    negative_folds : list of arrays
        List of k negative sequence arrays for each fold
    negative_length_folds : list of arrays
        List of k negative length arrays for each fold
    components : int
        Number of hidden states for the HMM
        
    Returns:
    --------
    results : dict
        Dictionary containing:
        - 'average_accuracy': Overall accuracy across all folds
        - 'fold_accuracies': List of accuracies for each fold
        - 'fold_details': Detailed results for each fold
    """
    
    fold_accuracies = []
    fold_details = []
    
    for i in range(len(positive_folds)):  # goes through all the folds
        
        print(f"\n{'='*60}")
        print(f"FOLD {i+1}/{len(positive_folds)}")
        print(f"{'='*60}")
        
        # Create copies for training/testing split
        pos_seq_copy = positive_folds.copy()
        pos_len_copy = positive_length_folds.copy()
        neg_seq_copy = negative_folds.copy()
        neg_len_copy = negative_length_folds.copy()
        
        # Get testing folds
        testing_pos_sequences = pos_seq_copy[i]
        testing_pos_lengths = pos_len_copy[i]
        testing_neg_sequences = neg_seq_copy[i]
        testing_neg_lengths = neg_len_copy[i]
        
        # Remove testing folds
        pos_seq_copy.pop(i)
        pos_len_copy.pop(i)
        neg_seq_copy.pop(i)
        neg_len_copy.pop(i)
        
        # Combine training folds
        training_pos_sequence = concatenate_array(pos_seq_copy)
        training_pos_lengths = concatenate_array(pos_len_copy)
        training_neg_sequence = concatenate_array(neg_seq_copy)
        training_neg_lengths = concatenate_array(neg_len_copy)
        
        # Train positive model
        positive_model = CategoricalHMM(
            n_components=components,
            n_features=20,
            n_iter=100,
            random_state=42
        )
        positive_model.fit(training_pos_sequence, training_pos_lengths)
        
        # Train negative model
        negative_model = CategoricalHMM(
            n_components=components,
            n_features=20,
            n_iter=100,
            random_state=42
        )
        negative_model.fit(training_neg_sequence, training_neg_lengths)
        
        print(f"Models trained successfully")
        
        # Determine how many sequences to test (minimum of both)
        n_pos_sequences = len(testing_pos_lengths)
        n_neg_sequences = len(testing_neg_lengths)
        n_test_sequences = min(n_pos_sequences, n_neg_sequences)
        
        print(f"Testing with {n_test_sequences} positive and {n_test_sequences} negative sequences")
        
        # Test positive sequences
        pos_correct = 0
        current_index = 0
        for j in range(n_test_sequences):
            sequence = testing_pos_sequences[current_index:current_index + testing_pos_lengths[j]]
            
            log_prob_pos = positive_model.score(sequence)
            log_prob_neg = negative_model.score(sequence)
            
            if log_prob_pos > log_prob_neg:
                pos_correct += 1
            
            current_index += testing_pos_lengths[j]
        
        # Test negative sequences (use first n_test_sequences)
        neg_correct = 0
        current_index = 0
        for j in range(n_test_sequences):
            sequence = testing_neg_sequences[current_index:current_index + testing_neg_lengths[j]]
            
            log_prob_pos = positive_model.score(sequence)
            log_prob_neg = negative_model.score(sequence)
            
            if log_prob_neg > log_prob_pos:
                neg_correct += 1
            
            current_index += testing_neg_lengths[j]
        
        # Calculate accuracy for this fold
        total_correct = pos_correct + neg_correct
        total_sequences = n_test_sequences * 2
        fold_accuracy = total_correct / total_sequences
        
        fold_accuracies.append(fold_accuracy)
        
        fold_detail = {
            'fold': i + 1,
            'positive_correct': pos_correct,
            'positive_total': n_test_sequences,
            'negative_correct': neg_correct,
            'negative_total': n_test_sequences,
            'accuracy': fold_accuracy
        }
        fold_details.append(fold_detail)
        
        print(f"\nResults:")
        print(f"  Positive sequences: {pos_correct}/{n_test_sequences} correct ({pos_correct/n_test_sequences*100:.1f}%)")
        print(f"  Negative sequences: {neg_correct}/{n_test_sequences} correct ({neg_correct/n_test_sequences*100:.1f}%)")
        print(f"  Fold Accuracy: {fold_accuracy*100:.2f}%")
    
    # Calculate overall statistics
    overall_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    
    print(f"\n{'='*60}")
    print(f"OVERALL RESULTS")
    print(f"{'='*60}")
    print(f"Average Accuracy: {overall_accuracy*100:.2f}% (+/- {std_accuracy*100:.2f}%)")
    print(f"Fold Accuracies: {[f'{acc*100:.2f}%' for acc in fold_accuracies]}")
    
    results = {
        'average_accuracy': overall_accuracy,
        'std_accuracy': std_accuracy,
        'fold_accuracies': fold_accuracies,
        'fold_details': fold_details
    }
    
    return results

In [38]:
results_1_component = training_model(positive_sequence_folds, positive_length_folds, negative_sequence_folds, negative_length_folds, 1)


FOLD 1/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 3/5 correct (60.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 80.00%

FOLD 2/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 1/5 correct (20.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 60.00%

FOLD 3/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 3/5 correct (60.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 80.00%

FOLD 4/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 4/5 correct (80.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 90.00%

FOLD 5/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 5/5 correct (100.0%)
  Negative sequences: 5/5 corre

In [39]:
results_1_component

{'average_accuracy': np.float64(0.8875),
 'std_accuracy': np.float64(0.1220911544707478),
 'fold_accuracies': [0.8, 0.6, 0.8, 0.9, 1.0, 0.9, 1.0, 1.0, 1.0, 0.875],
 'fold_details': [{'fold': 1,
   'positive_correct': 3,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 0.8},
  {'fold': 2,
   'positive_correct': 1,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 0.6},
  {'fold': 3,
   'positive_correct': 3,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 0.8},
  {'fold': 4,
   'positive_correct': 4,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 0.9},
  {'fold': 5,
   'positive_correct': 5,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 1.0},
  {'fold': 6,
   'positive_correct': 4,
   'positive_total': 5,
   'negative_correct': 5,
   'negative_total': 5,
   'accuracy': 0.9},
  {'fold':

In [40]:
import json
# Dictionary to store all results
all_results = {}

# Test components from 1 to 10
for n_components in range(1, 11):
    print(f"\n\n{'#'*70}")
    print(f"TESTING n_components = {n_components}")
    print(f"{'#'*70}")
    
    results = training_model(
        positive_sequence_folds, 
        positive_length_folds, 
        negative_sequence_folds, 
        negative_length_folds, 
        components=n_components
    )
    
    # Store results for this component
    all_results[f'n_components_{n_components}'] = {
        'n_components': n_components,
        'average_accuracy': float(results['average_accuracy']),
        'std_accuracy': float(results['std_accuracy']),
        'fold_accuracies': [float(acc) for acc in results['fold_accuracies']],
        'fold_details': results['fold_details']
    }

# Find best performing model
best_component = max(all_results.items(), 
                     key=lambda x: x[1]['average_accuracy'])

print(f"\n\n{'='*70}")
print(f"BEST MODEL SUMMARY")
print(f"{'='*70}")
print(f"Best n_components: {best_component[1]['n_components']}")
print(f"Best Average Accuracy: {best_component[1]['average_accuracy']*100:.2f}%")
print(f"Standard Deviation: {best_component[1]['std_accuracy']*100:.2f}%")

# Add summary to results
all_results['summary'] = {
    'best_n_components': best_component[1]['n_components'],
    'best_average_accuracy': best_component[1]['average_accuracy'],
    'best_std_accuracy': best_component[1]['std_accuracy']
}

# Save to JSON file
output_filename = 'model_results\hmm_cross_validation_results.json'
with open(output_filename, 'w') as f:
    json.dump(all_results, f, indent=4)

print(f"\nResults saved to '{output_filename}'")

print(f"\n{'='*70}")
print(f"SUMMARY TABLE")
print(f"{'='*70}")
print(f"{'Components':<12} {'Avg Accuracy':<15} {'Std Dev':<15}")
print(f"{'-'*42}")
for i in range(1, 11):
    key = f'n_components_{i}'
    acc = all_results[key]['average_accuracy']
    std = all_results[key]['std_accuracy']
    print(f"{i:<12} {acc*100:>6.2f}%{'':<8} {std*100:>6.2f}%")



######################################################################
TESTING n_components = 1
######################################################################

FOLD 1/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 3/5 correct (60.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 80.00%

FOLD 2/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 1/5 correct (20.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 60.00%

FOLD 3/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 3/5 correct (60.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 80.00%

FOLD 4/10
Models trained successfully
Testing with 5 positive and 5 negative sequences

Results:
  Positive sequences: 4/5 correct (80.0%)
  Negative sequences: 5/5 correct (100.0%)
  Fold Accuracy: 90.00%

FO