## Creating the dataset

In [41]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt

In [42]:
negatives_path = r"data\negatives\non_afp_raw.faa"
positives_path = r"data\positives\afp_all_raw.faa"

In [43]:
def get_unique_characters(filepath: list[str]):
    """
    Grabs the list of unique characters 
    """
    unique_chars = set()
    print("\n--- Checking for Unique Characters in FASTA Files ---")
    
    
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            # Only process lines that do NOT start with the FASTA header indicator '>'
            if not line.startswith('>'):
                # Convert to uppercase and add all characters to the set
                unique_chars.update(line.upper())

    return list(unique_chars)

In [38]:
"""
Assigns each amino acid to a number and returns dictionaries
like {'A': 1, 'R': 2} and
{1: 'A', 2: 'R'}
"""
amino_acids = 'ARNDCQEGHILKMFPSTWYV' # Standard 20 amino acids
aa_to_int = {aa: i for i, aa in enumerate(amino_acids)}
int_to_aa = {i: aa for aa, i in aa_to_int.items()}


In [44]:
unique_pos_characters = get_unique_characters(positives_path)

print(unique_pos_characters)
print("Length:", len(unique_pos_characters))

for i in unique_pos_characters:
    if i not in aa_to_int.keys():
        print(i)
        


--- Checking for Unique Characters in FASTA Files ---
['F', 'C', 'N', 'R', 'W', 'L', 'D', 'S', 'V', 'Q', 'Y', 'X', 'T', 'B', 'I', 'G', 'H', 'M', 'P', 'E', 'K', 'A']
Length: 22
X
B


In [45]:
unique_neg_characters = get_unique_characters(negatives_path)

print(unique_neg_characters)
print("Length:", len(unique_neg_characters))

for i in unique_neg_characters:
    if i not in aa_to_int.keys():
        print(i)
        


--- Checking for Unique Characters in FASTA Files ---
['F', 'C', 'N', 'R', 'W', 'L', 'D', 'U', 'S', 'V', 'Z', 'Q', 'Y', 'X', 'T', 'B', 'I', 'G', 'H', 'M', 'P', 'E', 'K', 'A']
Length: 24
U
Z
X
B


In [46]:
"""
Mapping of non-standard codes:
    - B (Asx: Asparagine or Aspartic Acid) is mapped to N (Asparagine), Index 2
    - X (Any/Unknown) is mapped to A (Alanine, a neutral placeholder), Index 0
    - U (Selenocysteine) is mapped to C (Cysteine, as they are chemically related), Index 4
    - Z (Glx: Glutamine or Glutamic Acid) is mapped to E (Glutamic Acid), Index 6
    Mappings based on this article: https://www.matrixscience.com/blog/non-standard-amino-acid-residues.html
"""

if 'N' in aa_to_int: aa_to_int['B'] = aa_to_int['N'] # B -> N
if 'A' in aa_to_int: aa_to_int['X'] = aa_to_int['A'] # X -> A
if 'C' in aa_to_int: aa_to_int['U'] = aa_to_int['C'] # U -> C
if 'E' in aa_to_int: aa_to_int['Z'] = aa_to_int['E'] # Z -> E

In [None]:
def parse_fasta_file(filepath, aa_to_int_map):
    """
    Reads a FASTA-formatted file, extracts protein sequences, encodes them
    to integers, and prepares the data for HMM training.

    Args:
        filepath (str): Path to the FASTA file.
        aa_to_int_map (dict): Mapping from amino acid character to integer.

    Returns:
        tuple: (X, lengths)
            X (np.ndarray): Concatenated, integer-encoded sequences (shape: (Total Length, 1)).
            lengths (np.ndarray): Array of individual sequence lengths (shape: (Number of Sequences,)).
    """
    all_sequences_int = []
    sequence_lengths = []
    current_sequence = ""
    # Initialize last_header to handle the first sequence and error reporting properly
    last_header = "N/A (first sequence)" 

    print(f"--- Processing file: {filepath} ---")
    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()

            if line.startswith('>'):
                # Start of a new sequence header
                if current_sequence:
                    # Process the previous sequence
                    encoded_seq = []
                    
                    for aa in current_sequence:
                        if aa in aa_to_int_map:
                            encoded_seq.append(aa_to_int_map[aa])
                        else:
                            # Explicitly report any skipped, non-standard characters
                            print(f"Warning: Non-standard character '{aa}' found in sequence '{last_header}'. Skipping.")
                    
                    if encoded_seq:
                        all_sequences_int.extend(encoded_seq)
                        sequence_lengths.append(len(encoded_seq))

                # Update header for the new sequence. Using split('|')[1] is a common way to get UniProt ID.
                last_header = line.split('|')[1] if '|' in line and len(line.split('|')) > 1 else line[1:30].strip() + "..."
                current_sequence = ""
            elif line:
                # Append sequence data (ignoring whitespaces/newlines)
                current_sequence += line.upper()

        # Process the very last sequence in the file
        if current_sequence:
            encoded_seq = []
            for aa in current_sequence:
                if aa in aa_to_int_map:
                    encoded_seq.append(aa_to_int_map[aa])
                else:
                    print(f"Non-standard character '{aa}' found in final sequence '{last_header}'. Skipping.")
                    
            if encoded_seq:
                all_sequences_int.extend(encoded_seq)
                sequence_lengths.append(len(encoded_seq))

    # Convert lists to NumPy arrays in the required format
    X = np.array(all_sequences_int).reshape(-1, 1)
    lengths = np.array(sequence_lengths, dtype=np.int32)

    return X, lengths

In [48]:
positive_X, positive_lengths = parse_fasta_file(positives_path, aa_to_int)
negative_X, negative_lengths = parse_fasta_file(negatives_path, aa_to_int)

--- Processing file: data\positives\afp_all_raw.faa ---
--- Processing file: data\negatives\non_afp_raw.faa ---


In [49]:
negative_lengths

array([ 294,  494, 1136, ...,  499,  206,  427],
      shape=(5633,), dtype=int32)

In [50]:
positive_lengths

array([ 82, 790, 218,  82, 168, 163,  87,  66, 128,  31,  37,  85,  33,
        45,  91,  97,  19,  16,  63,  63,  64,  88,  88,  87,  87,  91,
        87,  64,  63,  64,  64,  62,  33,  40,  91,  66,  64, 134, 175,
       125, 124, 261, 276, 253, 243, 112,  20, 892], dtype=int32)

In [None]:
lengths = [82, 790, 218,  82, 168, 163,  87,  66, 128,  31,  37,  85,  33,
        45,  91,  97,  19,  16,  63,  63,  64,  88,  88,  87,  87,  91,
        87,  64,  63,  64,  64,  62,  33,  40,  91,  66,  64, 134, 175,
       125, 124, 261, 276, 253, 243, 112,  20, 892]

sum_of_lengths = 0

for length in lengths:
    sum_of_lengths += length

print(sum_of_lengths)

6182


In [51]:
positive_X

array([[12],
       [ 0],
       [10],
       ...,
       [11],
       [ 0],
       [ 2]], shape=(6182, 1))

## Training the model

In [58]:
from hmmlearn.hmm import CategoricalHMM

In [59]:
positive_model = CategoricalHMM(
            n_components=1, # just 1 state for POC,
            n_features=20, # amount of amino acids, 
            n_iter=10,
            random_state=42
        ) 

positive_model.fit(positive_X, positive_lengths)

0,1,2
,n_components,1
,startprob_prior,1.0
,transmat_prior,1.0
,emissionprob_prior,1.0
,n_features,20
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1F7D386C240
,n_iter,10
,tol,0.01
,verbose,False


In [60]:
# Train negative model
negative_model = CategoricalHMM(
    n_components=1, # just 1 state for POC,
    n_features=20, # amount of amino acids, 
    n_iter=10,
    random_state=42
)
negative_model.fit(negative_X, lengths=negative_lengths)

0,1,2
,n_components,1
,startprob_prior,1.0
,transmat_prior,1.0
,emissionprob_prior,1.0
,n_features,20
,algorithm,'viterbi'
,random_state,RandomState(M... 0x1F7D386D340
,n_iter,10
,tol,0.01
,verbose,False


In [63]:
# Just basic testing to see if the model itself works
# not worried about data leakage for now
test_seq = positive_X[:82]
log_prob_pos = positive_model.score(test_seq)
log_prob_neg = negative_model.score(test_seq)

In [64]:
log_prob_neg

-229.9824194597464

In [65]:
log_prob_pos

-188.06176699019352

In [67]:
prediction = "positive" if log_prob_pos > log_prob_neg else "negative"
prediction

'positive'