In [32]:
import time, os
from collections import Counter
from itertools import count

class HMM:

    tag_count = {}
    word_tag = []
    word_tag_em = {}
    tags = ["START","B-negative","B-neutral","B-positive","I-negative","I-neutral","I-positive","O","STOP"]
    new_word_tag = ""
    start_y = []
    y_stop = []
    y_sequence = []
    sequence_trans = {}
    
    def __init__(self,train_x,train_y):
        self.train_x = train_x # train_x is list of words in training data
        self.train_y = train_y # train_y is list of tags in training data related to x
        self.tag_count = Counter(train_y) # total count for each tag found in training data
        self.word_tag = Counter(list(zip(train_x,train_y))) # total count for each word-tag pair found in training data
        self.new_word_tag = self.new_word_emission() #find set probability given to new word

        # find count for each (prev_tag,current_tag) pair found in training data
        self.start_y = train_y[0:len(train_y)-1] 
        self.y_stop = train_y[1:]        
        self.y_sequence = Counter(list(zip(self.start_y,self.y_stop))) 
        #print(self.y_sequence)
        
#=========================== PART 2 =============================
    #counting number of occurence of y_i
    def count_y(self,y_i):                
        return self.tag_count[y_i]
    
    
    #2a count(y pair x)/ count(y)
    def est_emission(self,x_i,y_i):
        return self.word_tag[(x_i,y_i)]
    
    
    #2b emission count(y pair x)/ (count(y)+1)
    def impr_est_emission(self,x_i,y_i):
        #if tag is START or STOP, emission score for empty word = 1
        if y_i == "START" or y_i == "STOP":
            return 1
        else:
            return self.word_tag[(x_i,y_i)]/(self.count_y(y_i)+1)
    
    #2b finding tag for new word using estimation 1/ (count(y)+1)   
    def new_word_emission(self):
        em=[]
        for tag in self.tags:  
            #ignore "START" and "STOP"
            if tag == "START" or tag == "STOP":                
                em.append(0)
            else:
                em.append(1/(self.count_y(tag)+1))
        #return label using the index of maximum emission found
        return self.tags[em.index(max(em))]
        
    def train_p2(self):
        #for each (word,tag) pair, calculate emission score and put in dictionary
        for i in self.word_tag:
            self.word_tag_em[i]=self.impr_est_emission(i[0],i[1])
        return self.word_tag_em
            
    #2c
    def sentiment_analysis(self, test_x):
        print("predicting...")
        y_star=[]
        
        #for each word in new training data eg. hello
        for word in test_x:
            #if word is empty, tag = empty (for score calculation)
            if word == "":
                y_star.append("")
                
            #if word is new, add predicted tag for new words
            elif word not in self.train_x:
                y_star.append(self.new_word_tag)
            
            # otherwise, run through all labels excluding START and STOP to find max emission
            else:
                em_list = [0] #account for "START"
                
                #for each tag in tags eg. B+, B-, Bn 
                for tag in self.tags[1:8]:
                    # if (word,tag) pair not available in training data; count(word,tag) = 0 hence emission = 0
                    if (word,tag) not in self.word_tag_em:
                        em_list.append(0)
                    # else look up in emission dictionary calculated in train to find score
                    else:
                        em_list.append(self.word_tag_em[(word,tag)])
                # get max emission and corresponding label        
                index_of_max = em_list.index(max(em_list))
                predicted_tag = self.tags[index_of_max]
                
                # check if label starts with "I". If so, the previous label should start with either "I" or "B"
                # i.e. there should be a previous label and it should not be "O" or ""
                # if not, label should start with "B" instead
                if predicted_tag[0] == "I":
                    if y_star[-1]=="O" or y_star[-1]=="":                           
                        predicted_tag = "B"+predicted_tag[1:]
                    
                        
                    
                y_star.append(predicted_tag)
                                    
        print("done predicting!")
        return y_star

    
#=========================== PART 3 =============================

    def count_transition(self,y_prev,y_current):
        return self.y_sequence[(y_prev,y_current)]
    
    def est_transition(self,y_prev,y_current):
        if y_prev == "STOP" and y_current == "START":
            return 1
        elif (y_prev,y_current) not in self.y_sequence:
            return 0
        else:
            return self.count_transition(y_prev,y_current)/self.count_y(y_prev)
        
    def train_p3(self):
        for pair in self.y_sequence:
            self.sequence_trans[pair]=self.est_transition(pair[0],pair[1])
            return self.sequence_trans
        
    def viterbi(self,test_data):
        path_probability = {}
        n = len(test_data)
        prev_tag = ""
        current_tag = "START"
        score = 0
        probability = []
        tag_path = []
        for k in range(1,n+2):
            row_prob = []
            row_tag = []
            
            
            if k==1:                
                temp = []
                for current_tag in self.tags:                    
                    temp.append(score*self.est_transition("START",current_tag)*self.impr_est_emission(test_data[k-1],current_tag))
                    row_prob.append(max(temp))
                    row_tag.append("")
                
            elif k<= n:
                for current_tag in self.tags:
                    temp = []
                    for prev_tag in self.tags:                        
                        temp.append(probability[k-1][self.tags.index(prev_tag)]*self.est_transition(prev_tag,current_tag)*self.impr_est_emission(test_data[k-1],current_tag))
                    
                    row_prob.append(max(temp))
                    likely_tag = self.tags[temp.index(max(temp))]
                    row_tag.append(likely_tag)

            
            else:
                for prev_tag in self.tags:
                    temp=[]
                    temp.append(probability[k-1][self.tags.index(prev_tag)]*self.est_transition(prev_tag,"STOP"))
                    row_prob.append(max(temp))
                    likely_tag = self.tags[temp.index(max(temp))]
                    row_tag.append("")
                    
            probability.append(row_prob)
            tag_path.append(row_tag)
        print(probability)
        print(tag_path)
        
                
                
    """def recursive(self,k,current_tag):
        #when step = 0 state = START i.e. (0,START)
        if k==0:
            
            return 1
        
        elif k >= 1 and k<=len(test_data):
            l = [self.recursive(k-1,prev_tag)*self.est_transition(prev_tag,current_tag)
                 *self.impr_est_emission(test_data[k-1],current_tag) for prev_tag in self.tags]
            self.optimal_path.append(self.tags[l.index(max(l))])
            return(max(l))
        else:
            l = [self.recursive(k-1,prev_tag)*self.est_transition(prev_tag,"STOP")
                 for prev_tag in self.tags]
            self.optimal_path.append(self.tags[l.index(max(l))])
            return(max(l))
        
        
        print(test_data[k-1])
        print(len(test))
        return test"""




# format: [[words], [tags]]
def read_train(file_name):
    in_file = open(file_name,'r',encoding='utf8')
    l = []
    words = [""]
    tags = ["START"]
    for line in in_file:
        x = line.strip().split()
        if x != []:
            words.append(x[0])
            tags.append(x[1].rstrip('\n'))
        else:
            words.append("")
            tags.append("STOP")
            words.append("")
            tags.append("START")
    tags=tags[0:len(tags)-1]
    words = words[0:len(words)-1]
    l.append(words)
    l.append(tags)
    in_file.close()
    return l
    
#reading and writing to files 
# format:[words]
def read_dev_in(file_name):
    in_file = open(file_name,'r',encoding='utf8')
    l = []
    for line in in_file:
        l.append(line.strip())
    in_file.close()
    return l

def write_devp2(language,word_list,tag_list):
    file_name = language+"/"+"dev.p2.out"
    if os.path.isfile(file_name):
        print('file exist')
        try:
            os.remove(file_name)
            print("deleted file")
        except OSError:
            pass
    out_file = open(file_name,'a',encoding='utf8')

    for i in range(len(word_list)):        
        out_file.write(word_list[i]+" "+tag_list[i]+"\n")
    
    out_file.close()
        
    
train_data = read_train("EN/train")
test_data = read_dev_in("EN/dev.in")
"""
print(train_data[1])
print(len(train_data[1]))
print(train_data[0])
print(len(train_data[0]))
"""
part2_HMM = HMM(train_data[0],train_data[1])
#print(part2_HMM.train())
#print(part2_HMM.sentiment_analysis(test_data))
starttime = time.time()
part2_HMM.trainp2()
#test_tags = part2_HMM.sentiment_analysis(test_data)
part2_HMM.trainp3()
print(part2_HMM.viterbi(test_data))
#write_devp2("EN",test_data,test_tags)
elapsed = time.time()-starttime

#print ("time taken = "+str(elapsed)+"s")



['START', 'B-neutral', 'I-neutral', 'O', 'O', 'O', 'O', 'B-neutral', 'I-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP', 'START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'O', 'O', 'O', 'STOP', 'START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP', 'START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-negative', 'I-negative', 'I-negative', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP', 'START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'I-positive', 'STOP', 'START', 'O', 'O', 'B-neutral', 'I-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP', 'START', 'B-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'B-neutral', 'I-neutral', 'O', 'B-neutral', 'B-neutral', 'I-neutral', 'I-neutral', 'O', 'O', 'O', 'O', 'O', 'O', 'B-neutral', 'O', 'STOP', 'START', 'O', 'O', 'O', 'O', 

'\npart2_HMM = HMM(train_data[0],train_data[1])\n#print(part2_HMM.train())\n#print(part2_HMM.sentiment_analysis(test_data))\nstarttime = time.time()\npart2_HMM.train()\ntest_tags = part2_HMM.sentiment_analysis(test_data)\n#print(part2_HMM.viterbi(test_data))\nwrite_devp2("EN",test_data,test_tags)\nelapsed = time.time()-starttime'