# In [233]:  (Jupyter notebook cell marker left over from export)
"""
Author: Daniel Wu
Purpose: PS5 - classify test data
outfile: hmmoutput.txt
"""

import os
import sys
import math
import re
import string
import glob
import random
import json 

def pause():
    """Wait for the user to press ENTER, then print a short notice."""
    # The text typed before ENTER is irrelevant, so the return value of
    # input() is intentionally discarded.
    input("Press the <ENTER> key to continue...")
    print("Paused Program")
    
# Locations of the trained model and of the raw text to be tagged.
modelfile = "./hmmmodel.txt"

# testfile = sys.argv[1]
testfile = "/Users/user/Desktop/Fall_2020/CSCI_544/Coding_Assignments/PA5/hmm-training-data/it_isdt_dev_raw.txt"

# Score used for a word that is in the model vocabulary but has no emission
# entry for the tag being considered (same penalty the script always used).
UNSEEN_EMISSION_SCORE = -1e20


def emission_score(tag_emissions, word, word_bag):
    """Return the emission score of ``word`` for one tag.

    Args:
        tag_emissions: mapping word -> score for a single tag
            (one entry of ``B_Mtx``).
        word: the observed token.
        word_bag: set of every word that has an emission entry for any tag.

    Returns:
        The stored score when the tag has an entry for the word,
        ``UNSEEN_EMISSION_SCORE`` when the word is known to the model but not
        for this tag, and ``0`` when the word is unknown to the model, so that
        only the prior/transition scores decide.
    """
    if word in tag_emissions:
        return tag_emissions[word]
    if word in word_bag:
        return UNSEEN_EMISSION_SCORE
    return 0


def viterbi_decode(tokens, priors, A_Mtx, B_Mtx, word_bag):
    """Return the highest-scoring tag sequence for ``tokens``.

    Scores are combined by addition (presumably log-probabilities -- confirm
    against the training script that writes the model file).

    Args:
        tokens: list of words of one sentence.
        priors: mapping tag -> score of starting a sentence with that tag.
        A_Mtx: nested mapping ``A_Mtx[previous_tag][tag]`` -> transition score.
        B_Mtx: nested mapping ``B_Mtx[tag][word]`` -> emission score.
        word_bag: set of every word present in ``B_Mtx``.

    Returns:
        A list with one tag per token (empty for an empty token list).

    Raises:
        KeyError: if a tag needed for a lookup is missing from ``A_Mtx`` or
            ``B_Mtx``.
        ValueError: if ``priors`` is empty, since there is no tag to choose.
    """
    if not tokens:
        return []

    # Initialisation: prior score plus emission score of the first word.
    delta_prev = {
        tag: priors[tag] + emission_score(B_Mtx[tag], tokens[0], word_bag)
        for tag in priors
    }

    # backpointers[k][tag] is the best predecessor of `tag` at position k + 1.
    backpointers = []

    for word in tokens[1:]:
        delta_cur = {}
        pointers = {}

        for s_1 in A_Mtx:
            emit = emission_score(B_Mtx[s_1], word, word_bag)

            best_prev = None
            best_score = None
            for s_0, prev_score in delta_prev.items():
                score = emit + A_Mtx[s_0][s_1] + prev_score
                # Accept the first candidate unconditionally so a predecessor
                # is always recorded, even when every score is at or below
                # the -1e20 penalty; afterwards only a strictly better score
                # replaces it (first maximum wins on ties).
                if best_prev is None or score > best_score:
                    best_score = score
                    best_prev = s_0

            delta_cur[s_1] = best_score
            pointers[s_1] = best_prev

        backpointers.append(pointers)
        delta_prev = delta_cur

    # Termination: best final state. For a one-word sentence this is simply
    # the best initial state, because the loop above never ran.
    state = max(delta_prev, key=delta_prev.get)
    tags = [state]

    # Follow the back-pointers from the last position to the first.
    for pointers in reversed(backpointers):
        state = pointers[state]
        tags.append(state)

    tags.reverse()
    return tags


def tag_line(line, priors, A_Mtx, B_Mtx, word_bag):
    """Tag one raw input line and return it as ``word/TAG`` tokens.

    Tokens are separated by single spaces; the returned string always ends
    with a newline. A blank line is returned as a bare newline instead of
    being decoded.
    """
    sentence = line.split(" ")
    sentence[-1] = sentence[-1].rstrip("\n")

    if sentence == [""]:
        return "\n"

    tags = viterbi_decode(sentence, priors, A_Mtx, B_Mtx, word_bag)
    tagged = [tok + '/' + tag for tok, tag in zip(sentence, tags)]
    return ' '.join(tagged) + '\n'


# The file I/O only runs when this is executed as a script or notebook cell,
# so the helpers above can be imported without touching the filesystem.
if __name__ == "__main__":
    # bring in model parameters
    with open(modelfile, 'r', encoding='utf-8') as json_file:
        dic_import = json.load(json_file)

    priors = dic_import['priors']
    A_Mtx = dic_import['A_Mtx']
    B_Mtx = dic_import['B_Mtx']

    # Every word that has an emission entry for at least one tag.
    word_bag = {tok for emissions in B_Mtx.values() for tok in emissions}

    # Read with the same encoding as the model so accented tokens match the
    # vocabulary regardless of the platform's default encoding.
    with open(testfile, encoding='utf-8') as fp:
        testdata = fp.readlines()

    test_data_pred = [
        tag_line(line, priors, A_Mtx, B_Mtx, word_bag) for line in testdata
    ]

    # Drop the final newline so the output file does not end with one.
    outfile = ''.join(test_data_pred)[:-1]

    # Save everything to file
    with open("./hmmoutput.txt", "w", encoding='utf-8') as fp:
        fp.write(outfile)
    
    
