In [1]:
import numpy as np

def _parse_probability_table(section):
    """Parse one whitespace-delimited probability table into nested dicts.

    The first non-blank line is the header of column labels; every following
    non-blank line is a row label followed by one float per column.

    Returns:
        dict mapping row label -> {column label -> float probability}.
    """
    # Blank or whitespace-only lines carry no data; skipping them avoids an
    # IndexError when the input has stray empty lines inside a table.
    lines = [line for line in section.split('\n') if line.strip()]
    header = lines[0].split() if lines else []

    table = {}
    for line in lines[1:]:
        row_label, *values = line.split()
        table[row_label] = {
            column: float(value) for column, value in zip(header, values)
        }
    return table


def parse_hmm_data(text):
    """Parse an HMM problem description split into sections by '--------'.

    Expected section order:
        0. the observation string
        1. the alphabet (space-separated symbols)
        2. the states (space-separated names)
        3. the transition table (header row of target states, then one row
           per source state)
        4. the emission table (header row of symbols, then one row per state)

    Args:
        text: the full input text.

    Returns:
        A tuple ``(observations, states, transition_matrix, emission_matrix)``
        where ``observations`` is a string, ``states`` is a list of state
        names, and both matrices are dicts of dicts of floats keyed as
        ``matrix[row_label][column_label]``.

    Raises:
        IndexError: if fewer than five sections are present.
        ValueError: if a table cell is not a valid float.
    """
    parts = [part.strip() for part in text.strip().split('--------')]

    observations = parts[0]

    # parts[1] (the alphabet) is intentionally not parsed: emission columns
    # are keyed by the emission table's own header row.
    states = parts[2].split()

    transition_matrix = _parse_probability_table(parts[3])
    emission_matrix = _parse_probability_table(parts[4])

    return observations, states, transition_matrix, emission_matrix

def viterbi(observations, states, start_probs, transitions, emissions):
    """Return the most likely hidden-state path for a sequence of observations.

    The dynamic programme runs in log space: multiplying raw probabilities
    underflows to 0.0 after a few thousand observations, after which every
    candidate path ties and the result is meaningless.

    Args:
        observations: sequence of emitted symbols (e.g. a string).
        states: ordered sequence of state names.
        start_probs: dict mapping state -> initial probability.
        transitions: dict of dicts, ``transitions[prev][cur]`` is the
            probability of moving from ``prev`` to ``cur``.
        emissions: dict of dicts, ``emissions[state][symbol]`` is the
            probability of ``state`` emitting ``symbol``.

    Returns:
        The state names of the best path concatenated into one string; an
        empty string when ``observations`` is empty. Ties are resolved in
        favour of the state that appears first in ``states``.

    Raises:
        KeyError: if a state or observed symbol is missing from the tables.
    """
    num_states = len(states)
    len_obs = len(observations)

    # Nothing observed means there is no path to report.
    if len_obs == 0:
        return ""

    # Zero probabilities legitimately map to -inf; silence the warning.
    with np.errstate(divide="ignore"):
        log_start = np.log(
            np.array([start_probs[state] for state in states], dtype=float)
        )
        # log_trans[i, j] = log P(states[i] -> states[j])
        log_trans = np.log(
            np.array(
                [[transitions[prev][cur] for cur in states] for prev in states],
                dtype=float,
            )
        )
        # One emission vector (indexed by state) per distinct observed symbol.
        log_emit = {
            symbol: np.log(
                np.array([emissions[state][symbol] for state in states], dtype=float)
            )
            for symbol in set(observations)
        }

    # scores[j] = log-probability of the best path ending in states[j]
    scores = log_start + log_emit[observations[0]]
    # backpointers[t, j] = index of the best predecessor of state j at time t
    backpointers = np.zeros((len_obs, num_states), dtype=int)

    for t in range(1, len_obs):
        # candidates[i, j] = best score ending in i at t-1, then moving to j
        candidates = scores[:, None] + log_trans
        # argmax returns the first maximal index, which keeps ties on the
        # earliest state (also when every candidate is -inf).
        backpointers[t] = np.argmax(candidates, axis=0)
        scores = candidates.max(axis=0) + log_emit[observations[t]]

    # Termination: pick the best final state, then walk the backpointers.
    last_state_idx = int(np.argmax(scores))
    best_path = [states[last_state_idx]]
    for t in range(len_obs - 1, 0, -1):
        last_state_idx = int(backpointers[t, last_state_idx])
        best_path.append(states[last_state_idx])

    return "".join(reversed(best_path))

# --- Main execution ---

# Load the problem instance from disk; if the file is missing, report it and
# continue with a bundled sample so the script still produces a result.
try:
    with open('sample_inputs/p2/rosalind_ba10c.txt', 'r') as input_file:
        input_data = input_file.read()
except FileNotFoundError:
    print("Error: Input file not found.")
    # Bundled sample instance (two states, three symbols)
    input_data = """zxxxxyzzxyxyxyzxzzxzzzyzzxxxzxxyyyzxyxzyxyxyzyyyyzzyyyyzzxzxzyzzzzyxzxxxyxxxxyyzyyzyyyxzzzzyzxyzzyyy
--------
x y z
--------
A B
--------
	A	B
A	0.634	0.366
B	0.387	0.613
--------
	x	y	z
A	0.532	0.226	0.241
B	0.457	0.192	0.351"""


# Break the raw text into the observation string and the model tables
observations, states, transitions, emissions = parse_hmm_data(input_data)

# The input gives no initial distribution, so every state starts equally likely
start_probs = {name: 1.0 / len(states) for name in states}

# Decode the most probable hidden-state sequence
most_likely_path = viterbi(
    observations,
    states,
    start_probs,
    transitions,
    emissions,
)

# Emit the decoded path
print(most_likely_path)


DABBCACDABCDABBCABDABCABCDABBCABBCDABCABBBCABCABCABBBDABCABBCABCDCABBCABCABCABCDCABBBBBBBCDABBBBBCAB
