In [6]:
from collections import defaultdict
from common.hmm import print_output, viterbi, backtrack_path

import numpy as np

In [7]:
# Sample input for parameter estimation from a known hidden path.
# Layout (sections separated by dashed delimiter lines):
#   index 0: emitted string
#   index 2: emission alphabet, space-separated
#   index 4: hidden path, one state character per emitted symbol
#   index 6: state names, space-separated
lines = [
"yzzzyxzxxx",
"--------",
"x y z",
"--------",
"BBABABABAB",
"--------",
"A B C"
]

In [8]:
def parse_input_1(lines):
    """Parse the "estimate parameters from a known path" input format.

    The sections sit at fixed indices, separated by delimiter lines that
    are ignored:

    * ``lines[0]`` -- emitted string
    * ``lines[2]`` -- emission alphabet
    * ``lines[4]`` -- hidden path
    * ``lines[6]`` -- state names

    Alphabet and state names are split on any run of whitespace, so tab- or
    multi-space-separated files do not produce empty or merged tokens.

    Args:
        lines: Sequence of text lines (trailing newlines are tolerated).

    Returns:
        Tuple ``(sequence, alphabet, hidden_path, states)`` where ``sequence``
        and ``hidden_path`` are strings and the other two are lists of tokens.

    Raises:
        IndexError: If fewer than seven lines are supplied.
    """
    sequence = lines[0].strip()
    # str.split() with no argument collapses repeated whitespace and tabs,
    # unlike split(" "), which yields '' tokens for doubled spaces.
    alphabet = lines[2].split()
    hidden_path = lines[4].strip()
    states = lines[6].split()
    return sequence, alphabet, hidden_path, states

In [9]:
# Unpack the sample input into the emitted string, alphabet, hidden path and state list.
sequence, alphabet, hidden_path, states = parse_input_1(lines)

In [10]:
def compute_transition_probs(hidden_path, states):
    """Estimate state-to-state transition probabilities from a hidden path.

    Every consecutive pair in ``hidden_path`` is counted as one transition,
    and each row is then divided by the number of transitions leaving that
    state. A state with no outgoing transitions gets a uniform row of
    ``1 / len(states)``.

    Args:
        hidden_path: Sequence of state labels (e.g. a string of state chars).
        states: Iterable of all state labels; fixes the rows and columns.

    Returns:
        ``defaultdict(dict)`` mapping ``from_state -> {to_state: probability}``.

    Raises:
        KeyError: If ``hidden_path`` contains a label missing from ``states``.
    """
    table = defaultdict(dict)
    for src in states:
        table[src] = {dst: 0 for dst in states}

    # zip stops at the shorter argument, so this pairs each state with its successor.
    outgoing = defaultdict(int)
    for src, dst in zip(hidden_path, hidden_path[1:]):
        table[src][dst] += 1
        outgoing[src] += 1

    for src in states:
        row = table[src]
        leaving = outgoing[src]
        for dst in states:
            if leaving:
                row[dst] = row[dst] / leaving
            else:
                # State was never left: fall back to a uniform distribution.
                row[dst] = 1.0 / len(states)

    return table

In [11]:
# Estimate the transition table from the sample hidden path.
transitions = compute_transition_probs(hidden_path, states)

In [12]:
def compute_emission_probs(sequence, alphabet, hidden_path, states):
    """Estimate per-state emission probabilities from an aligned path.

    ``sequence`` and ``hidden_path`` are walked in lockstep; each position
    counts as one emission of that symbol by that state. Counts are divided
    by the total number of emissions of the state. A state that emitted
    nothing gets a uniform row of ``1 / len(alphabet)``.

    Args:
        sequence: Emitted symbols.
        alphabet: Iterable of all symbols; fixes the columns.
        hidden_path: State labels aligned position-by-position with ``sequence``.
        states: Iterable of all state labels; fixes the rows.

    Returns:
        ``defaultdict(dict)`` mapping ``state -> {symbol: probability}``.

    Raises:
        KeyError: If a state or symbol seen in the inputs is not listed in
            ``states`` / ``alphabet``.
    """
    table = defaultdict(dict)
    for st in states:
        table[st] = {sym: 0 for sym in alphabet}

    emitted = defaultdict(int)
    for sym, st in zip(sequence, hidden_path):
        table[st][sym] += 1
        emitted[st] += 1

    for st in states:
        row = table[st]
        count = emitted[st]
        for sym in alphabet:
            if count:
                row[sym] = row[sym] / count
            else:
                # State never appeared in the path: use a uniform distribution.
                row[sym] = 1.0 / len(alphabet)

    return table

In [63]:
# Estimate the emission table from the sample sequence and its hidden path.
emissions = compute_emission_probs(sequence, alphabet, hidden_path, states)

In [64]:
# Display the estimated emission table (state C never occurs in the path, so its row is uniform).
emissions

defaultdict(dict,
            {'A': {'x': 0.25, 'y': 0.25, 'z': 0.5},
             'B': {'x': 0.5,
              'y': 0.16666666666666666,
              'z': 0.3333333333333333},
             'C': {'x': 0.3333333333333333,
              'y': 0.3333333333333333,
              'z': 0.3333333333333333}})

In [65]:
# Print both tables using print_output, a helper imported from common.hmm (not defined in this notebook).
print_output(transitions, emissions, alphabet)

 A B C
A 0.0 1.0 0.0
B 0.8 0.2 0.0
C 0.3333333333333333 0.3333333333333333 0.3333333333333333
--------
 x y z
A 0.25 0.25 0.5
B 0.5 0.16666666666666666 0.3333333333333333
C 0.3333333333333333 0.3333333333333333 0.3333333333333333


In [66]:
# Run the same estimation on the downloaded dataset file.
with open("../data/dataset_26260_4.txt","r") as fin:
    lines = list(map(str.strip, fin))

# The file content is fully in memory at this point; parse and estimate.
sequence, alphabet, hidden_path, states = parse_input_1(lines)
transitions = compute_transition_probs(hidden_path, states)
emissions = compute_emission_probs(sequence, alphabet, hidden_path, states)
print_output(transitions, emissions, alphabet)

 A B C D
A 0.3333333333333333 0.3333333333333333 0.2 0.13333333333333333
B 0.2413793103448276 0.27586206896551724 0.2413793103448276 0.2413793103448276
C 0.4 0.25 0.1 0.25
D 0.3 0.3 0.2 0.2
--------
 x y z
A 0.2903225806451613 0.2903225806451613 0.41935483870967744
B 0.3793103448275862 0.3793103448275862 0.2413793103448276
C 0.3 0.4 0.3
D 0.45 0.45 0.1


In [19]:
# Sample input for the iterative learning cells below.
# Layout (sections separated by dashed delimiter lines):
#   index 0: number of iterations
#   index 2: emitted string
#   index 4: emission alphabet, space-separated
#   index 6: state names, space-separated
#   index 8 onward: tab-separated transition matrix (header row, then one row per state)
#   after the next delimiter: tab-separated emission matrix (header row, then one row per state)
lines = [
"100",
"--------",
"zyzxzxxxzz",
"--------",
"x y z",
"--------",
"A B",
"--------",
"	A	B",
"A	0.599	0.401	",
"B	0.294	0.706	",
"--------",
"	x	y	z",
"A	0.424	0.367	0.209	",
"B	0.262	0.449	0.289",
]

In [20]:
def _parse_matrix_rows(rows, column_labels):
    """Turn labelled matrix rows into ``{row_label: {column_label: value}}``.

    Each row is a row label followed by numeric values separated by any
    whitespace (tabs or spaces). Rows without at least one value are skipped.
    """
    matrix = {}
    for row in rows:
        fields = row.split()
        if len(fields) < 2:
            continue
        values = [float(field) for field in fields[1:]]
        matrix[fields[0]] = dict(zip(column_labels, values))
    return matrix


def parse_input_2(lines):
    """Parse the iterative-learning input format (initial HMM parameters).

    The sections sit at fixed positions, separated by delimiter lines that
    are ignored:

    * ``lines[0]`` -- number of iterations
    * ``lines[2]`` -- emitted string
    * ``lines[4]`` -- emission alphabet
    * ``lines[6]`` -- state names
    * ``lines[8]`` -- transition matrix column header (ignored)
    * next ``len(states)`` lines -- transition rows
    * one delimiter line and one emission column header (both ignored)
    * next ``len(states)`` lines -- emission rows

    Matrix columns are assigned positionally, in the order of ``states`` /
    ``symbols``; the header rows are not consulted. Exactly ``len(states)``
    emission rows are read, so trailing lines after the matrix are ignored.
    Fields may be separated by tabs or spaces.

    Args:
        lines: Sequence of text lines.

    Returns:
        Tuple ``(sequence, symbols, states, transitions, emissions,
        niterations)`` where both matrices are plain nested dicts.

    Raises:
        IndexError: If the header sections are missing.
        ValueError: If the iteration count or a matrix entry is not numeric.
    """
    niterations = int(lines[0].strip())
    sequence = lines[2].strip()
    symbols = lines[4].split()
    states = lines[6].split()

    n_states = len(states)
    transition_start = 9
    # Skip the delimiter line and the emission header that follow the transition rows.
    emission_start = transition_start + n_states + 2

    state_transitions = _parse_matrix_rows(
        lines[transition_start:transition_start + n_states], states
    )
    emission_matrix = _parse_matrix_rows(
        lines[emission_start:emission_start + n_states], symbols
    )

    return sequence, symbols, states, state_transitions, emission_matrix, niterations

In [22]:
# Parse the sample problem and echo the starting parameters.
sequence, alphabet, states, transitions, emissions, niterations = parse_input_2(lines)
print(sequence)
print(transitions)
print(emissions)


# Iterative learning: decode a hidden path with the current parameters, then
# re-estimate both tables from that path, repeated niterations times.
for i in range(niterations):
    print_output(transitions, emissions, alphabet)
    print("########################################")

    # Log-score of starting in each state: the emission probability of the
    # first symbol divided by the number of states.
    init_prob = dict()
    # A re-estimated emission probability can be exactly 0; np.log then returns
    # -inf, which is the intended score, so silence the divide-by-zero warning.
    with np.errstate(divide="ignore"):
        for state in emissions.keys():
            init_prob[state] = np.log(emissions[state][sequence[0]]/len(transitions))

    # viterbi and backtrack_path come from common.hmm (not shown here).
    # NOTE(review): presumably viterbi fills `backtrack` in place and returns
    # the best final state -- confirm against common.hmm.
    backtrack = []
    final_state = viterbi(sequence[1:], transitions, emissions, init_prob, backtrack)
    hidden_path = backtrack_path(backtrack, final_state)

    transitions = compute_transition_probs(hidden_path, states)
    emissions = compute_emission_probs(sequence, alphabet, hidden_path, states)

# The loop prints the parameters before each update, so the estimate produced
# by the final iteration has not been shown yet.
print_output(transitions, emissions, alphabet)


zyzxzxxxzz
{'A': {'A': 0.599, 'B': 0.401}, 'B': {'A': 0.294, 'B': 0.706}}
{'A': {'x': 0.424, 'y': 0.367, 'z': 0.209}, 'B': {'x': 0.262, 'y': 0.449, 'z': 0.289}}
 A B
A 0.599 0.401
B 0.294 0.706
--------
 x y z
A 0.424 0.367 0.209
B 0.262 0.449 0.289
########################################
 A B
A 0.5 0.5
B 0.0 1.0
--------
 x y z
A 0.3333333333333333 0.3333333333333333 0.3333333333333333
B 0.4 0.1 0.5
########################################
 A B
A 0.5 0.5
B 0.0 1.0
--------
 x y z
A 0.3333333333333333 0.3333333333333333 0.3333333333333333
B 0.4 0.1 0.5
########################################
 A B
A 0.5 0.5
B 0.0 1.0
--------
 x y z
A 0.3333333333333333 0.3333333333333333 0.3333333333333333
B 0.4 0.1 0.5
########################################
 A B
A 0.5 0.5
B 0.0 1.0
--------
 x y z
A 0.3333333333333333 0.3333333333333333 0.3333333333333333
B 0.4 0.1 0.5
########################################
 A B
A 0.5 0.5
B 0.0 1.0
--------
 x y z
A 0.3333333333333333 0.3333333333333333 0.333333

In [25]:
# Read the dataset up front so the file is closed before the learning loop runs.
with open("../data/dataset_26260_8.txt","r") as fin:
    lines = [line.strip() for line in fin]

sequence, alphabet, states, transitions, emissions, niterations = parse_input_2(lines)

# Iterative learning: decode a hidden path with the current parameters, then
# re-estimate both tables from that path, repeated niterations times.
for i in range(niterations):
    print_output(transitions, emissions, alphabet)
    print("########################################")

    # Log-score of starting in each state: the emission probability of the
    # first symbol divided by the number of states.
    init_prob = dict()
    # A re-estimated emission probability can be exactly 0; np.log then returns
    # -inf, which is the intended score, so silence the divide-by-zero warning.
    with np.errstate(divide="ignore"):
        for state in emissions.keys():
            init_prob[state] = np.log(emissions[state][sequence[0]]/len(transitions))

    # viterbi and backtrack_path come from common.hmm (not shown here).
    # NOTE(review): presumably viterbi fills `backtrack` in place and returns
    # the best final state -- confirm against common.hmm.
    backtrack = []
    final_state = viterbi(sequence[1:], transitions, emissions, init_prob, backtrack)
    hidden_path = backtrack_path(backtrack, final_state)

    transitions = compute_transition_probs(hidden_path, states)
    emissions = compute_emission_probs(sequence, alphabet, hidden_path, states)

# The loop prints the parameters before each update, so the estimate produced
# by the final iteration has not been shown yet.
print_output(transitions, emissions, alphabet)


 A B C D
A 0.114 0.424 0.401 0.061
B 0.157 0.386 0.072 0.385
C 0.392 0.118 0.414 0.076
D 0.248 0.3 0.377 0.075
--------
 x y z
A 0.212 0.669 0.119
B 0.143 0.611 0.246
C 0.117 0.564 0.319
D 0.178 0.588 0.234
########################################
 A B C D
A 0.0 0.11764705882352941 0.8823529411764706 0.0
B 0.0 0.0 0.0 1.0
C 0.5789473684210527 0.0 0.42105263157894735 0.0
D 0.5 0.0 0.5 0.0
--------
 x y z
A 0.6 0.4 0.0
B 0.5 0.5 0.0
C 0.017543859649122806 0.3508771929824561 0.631578947368421
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1111111111111111 0.8888888888888888 0.0
B 0.0 0.0 0.0 1.0
C 0.6 0.0 0.4 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5675675675675675 0.43243243243243246 0.0
B 0.5 0.5 0.0
C 0.01818181818181818 0.32727272727272727 0.6545454545454545
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
------

  # This is added back by InteractiveShellApp.init_path()



A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B C D
A 0

 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B C D
A 0.0 0.1388888888888889 0.8611111111111112 0.0
B 0.0 0.0 0.0 1.0
C 0.6037735849056604 0.0 0.39622641509433965 0.0
D 1.0 0.0 0.0 0.0
--------
 x y z
A 0.5405405405405406 0.4594594594594595 0.0
B 0.4 0.6 0.0
C 0.018867924528301886 0.3018867924528302 0.6792452830188679
D 1.0 0.0 0.0
########################################
 A B