In [1]:
import numpy as np
import pandas as pd

In [2]:
class Hmm:
    def __init__(self,file,k=3):
        self.words = None #set of unique words
        self.tag_word_count = None # dict((tag,word),count)
        self.transmissions = None # dict((tag_u,tag_v),count)
        self.count = None # dict(tag,count)
        self.read_file(file,k)
        self.tags = set(self.count.keys())
        
        self.word_ls = tuple(self.words)#tuple cause immutable
        self._tag_ls = tuple(self.tags)
        #move #START# to first row and #END# to last row
        ls = list(self._tag_ls)
        ls.remove('#START#')
        ls.remove('#END#')
        ls.insert(0,'#START#')
        ls.append('#END#')
        self.tag_ls = tuple(ls)
        
        self.make_matrix()
    
    def make_matrix(self):
        tag_length = len(self.tag_ls)
        transition_matrix = np.zeros((tag_length,tag_length))
        
        for i in range(tag_length):
            for j in range(tag_length):
                tag_u = self.tag_ls[i]
                tag_v = self.tag_ls[j]
                transition_matrix[i][j] = self.transmissions[(tag_u,tag_v)]/self.count[tag_u]
        self.transition_matrix = pd.DataFrame(transition_matrix,index=self.tag_ls,columns=self.tag_ls)
        
        word_length = len(self.word_ls)
        em_matrix = np.zeros((tag_length,word_length))
        for i in range(tag_length):
            for j in range(word_length):
                tag = self.tag_ls[i]
                word = self.word_ls[j]
                em_matrix[i][j] = self.tag_word_count[(tag,word)]/self.count[tag]
        self.em_matrix = pd.DataFrame(em_matrix,index=self.tag_ls,columns=self.word_ls)
        pass
    
    def read_file(self,file,k):
        from collections import defaultdict
        seq = ['#START#']
        f = open(file,'r',encoding='UTF-8')
        tag_word_ls = []
        word_count = defaultdict(int)
        for line in f:
            split = line.split(' ')
            if len(split)<2:
                #this is a line break
                seq.append('#END#')
                seq.append('#START#')
                continue
            word,tag = split
            word = word.strip()
            tag = tag.strip()
            tag_word_ls.append([tag,word])
            word_count[word]+=1
            seq.append(tag)
        f.close()
        
        #Emissions
        for i in range(len(tag_word_ls)):
            tag,word = tag_word_ls[i]
            if word_count[word]<k:
                tag_word_ls[i] = [tag,'#UNK#']
        tag_word_count = defaultdict(int)
        
        words = []
        for tag,word in tag_word_ls:
            tag_word_count[tag,word]+=1
            words.append(word)
        self.words = set(words)
        self.tag_word_count= tag_word_count
        
        #Transistions
        del seq[-1] #delete last item from the list
         #print(seq)
        trans_dict = defaultdict(int)
        count_u = defaultdict(int)
        for i in range(len(seq)-1):
            tag_u = seq[i]
            count_u[tag_u] += 1 # need to count #END# too
            if tag_u == "#END#":
                continue
            #if u is not #END# we count the transmission 
            tag_v = seq[i+1]
            if (tag_u =="#START#" and tag_v == "#END#"):
                #check for empty blank lines at the end and dont count them
                print('these are blank lines')
                count_u["#START#"] -= 1 #remove additional start
                break
            trans_dict[(tag_u,tag_v)] += 1
        self.transmissions = trans_dict
        self.count = count_u

In [3]:
EN = Hmm('./EN/train')

In [4]:
EN.transition_matrix.loc['#START#','B-VP']

0.018661098786376094

In [5]:
EN.transition_matrix.values[0,:]

array([0.00000000e+00, 1.08704163e-01, 5.42868328e-02, 2.60994389e-04,
       0.00000000e+00, 0.00000000e+00, 3.26242986e-03, 1.86610988e-02,
       1.41850450e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       6.48049067e-01, 0.00000000e+00, 1.04397755e-03, 0.00000000e+00,
       0.00000000e+00, 2.25760146e-02, 0.00000000e+00, 1.30497194e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [6]:
EN.transition_matrix

Unnamed: 0,#START#,B-PP,B-ADVP,B-CONJP,I-NP,I-INTJ,B-ADJP,B-VP,O,I-SBAR,...,I-CONJP,B-LST,I-ADVP,I-UCP,B-SBAR,B-PRT,B-INTJ,I-VP,I-PP,#END#
#START#,0.0,0.108704,0.054287,0.000261,0.0,0.0,0.003262,0.018661,0.14185,0.0,...,0.0,0.001044,0.0,0.0,0.022576,0.0,0.001305,0.0,0.0,0.0
B-PP,0.0,0.018491,0.003318,0.0,0.0,0.0,0.002611,0.026595,0.008484,0.0,...,0.0,0.0,0.0,0.0,0.000979,0.0,0.0,0.0,0.011258,0.000218
B-ADVP,0.0,0.170547,0.016269,0.000561,0.0,0.0,0.01655,0.215989,0.265358,0.0,...,0.0,0.0,0.086957,0.0,0.016269,0.000281,0.0,0.0,0.0,0.000842
B-CONJP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.061224,0.0,...,0.938776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-NP,0.0,0.156509,0.015332,0.000201,0.406679,0.0,0.004103,0.134912,0.227327,0.0,...,0.0,0.0,0.0,0.0,0.006375,0.000128,0.0,0.0,0.0,0.000788
I-INTJ,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.714286,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-ADJP,0.0,0.244432,0.015991,0.0,0.0,0.0,0.001142,0.110794,0.256996,0.0,...,0.0,0.0,0.0,0.0,0.037693,0.000571,0.0,0.0,0.0,0.000571
B-VP,0.0,0.098735,0.031214,0.000164,0.0,0.0,0.039209,0.007229,0.067411,0.0,...,0.0,0.0,0.0,0.0,0.025574,0.011171,0.00011,0.373912,0.0,5.5e-05
O,0.0,0.050142,0.029197,0.001047,0.0,0.0,0.008755,0.11503,0.113522,0.0,...,0.0,8.4e-05,0.0,0.0,0.016128,4.2e-05,0.000586,0.0,0.0,0.318281
I-SBAR,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
EN.transition_matrix.loc['#START#','B-NP']

0.6480490669450607

In [8]:
def log(m):
    m = np.clip(m, 1e-64, None)
    x = np.log(m)
    return x

In [25]:
def vertebi_k(word_arr,Hmm,k=7):
    """
    Followed pseudocode here
    https://en.wikipedia.org/wiki/Viterbi_algorithm#Pseudocode
    """
    S = Hmm.tag_ls[1:-1] #set of all possible tags remove #START# and #STOP#
    
    A = Hmm.transition_matrix.values[1:-1,1:-1] # A(tag_u_vector,tag_v)
    B = Hmm.em_matrix[1:-1] # B(tag_u->word)
    
    T = len(S) # Total number unique tags
    N = len(word_arr) # Length of sentence make sure no #START# and #STOP#
    
    T1 = np.full((T,N,k),-9999.) #probability table of most possible path to node i.e. store scores of each node
    T2 = np.full((T,N,k),-9999.) # Table of paths where the ith row stores highest scoring paths to T1[i,j]
    
    #Handle first word and base case at the same time
    word = word_arr[0]
    if word not in Hmm.words:
        word = '#UNK#'
    
    temp_matrix = np.full((T,k),-9999.)
    temp_matrix[:,k-1] = log(Hmm.transition_matrix.loc['#START#'][1:-1].values)+log(B[word].values)
    T1[:,0,:] = temp_matrix

#     T1[:,0,:] = (log(Hmm.transition_matrix.loc['#START#'][1:-1].values)+log(B[word].values))[:,None]
    #Note A is vector operation
    # Fill up each column by using previous column
    # j is position of word
    for j in range(1,N):
        # i is position of tag
        #ignore #START# and #END# tag when looping
        word = word_arr[j]
        if word not in Hmm.words:
            word = '#UNK#'
        
        for i in range(T):
            
            T1[i][j] = np.sort((T1[:,j-1]+(log(A[:,i])+log(B.loc[S[i],word]))[:,None]).ravel())[::-1][:k]
            T2[i][j] = np.argsort((T1[:,j-1]+log(A[:,i])[:,None]).ravel())[::-1][:k]

            
    last_col = np.argsort((T1[:,N-1]+log(Hmm.transition_matrix['#END#'].values[1:-1])[:,None]).ravel())[::-1][:k]
    ans = []
    for sol in last_col:
        ls = []
        index = np.unravel_index(sol,(T,k))
        a,b = index
        ls.append(S[a])
        for j in range(N-1,0,-1):
            if j != N-1:
                a,b = np.unravel_index(int(index),(T,k))
            prev_index = T2[a][j][b]
            c,d = np.unravel_index(int(prev_index),(T,k))
            ls.append(S[int(c)])
            index = prev_index
        final = ls[::-1]
        ans.append(final)
        
    return ans

In [32]:
word = "".split(' ')
len(word)

6

In [33]:
vertebi_k(word,EN,k=7)

[['B-NP', 'I-NP', 'B-VP', 'B-ADJP', 'I-ADJP', 'O'],
 ['B-NP', 'I-NP', 'B-VP', 'B-ADJP', 'O', 'O'],
 ['B-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'O'],
 ['B-NP', 'I-NP', 'B-VP', 'B-ADJP', 'I-ADJP', 'I-ADJP'],
 ['B-NP', 'I-NP', 'B-VP', 'B-NP', 'O', 'O'],
 ['B-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP'],
 ['B-NP', 'I-NP', 'B-VP', 'B-ADJP', 'O', 'B-NP']]

In [214]:
LANG = ['AL','CN','EN','SG']
eval_params = lambda lang: {'devin':f'./{lang}/dev.in','devout':f'./{lang}/dev.p3.out','ground_truth':f'./{lang}/dev.out','trainfile':f'./{lang}/train'}

In [215]:
import os
def pred_out(devin,devout,ground_truth,trainfile):
    H = Hmm(trainfile)
    file_object = open(devin, "r",encoding='UTF-8',)
    ls=[[]]
    index=0
    test=[]
    for line in file_object:
        test.append(line.strip())
        if (line.strip()==""):
            ls.append([])
            index+=1
        else:
            ls[index].append(line.strip())
    ls.pop(-1)
    df = pd.DataFrame(test, columns = ['Word'])
    
    from tqdm.notebook import tqdm
    predict=[]
    for i in tqdm(ls):
        for j in vertebi_k(i,H):
            predict.append(j)
        predict.append("")
    df['Tag'] = predict
    
    df.to_csv(devout, sep=" ", index=False, header=False)
    
    if os.name == 'nt':#if it is on windows
        !python ./EvalScript/evalResult.py {ground_truth} {devout}
    else:
        !python3 ./EvalScript/evalResult.py {ground_truth} {devout}

In [216]:
for lang in LANG:
    print(lang)
    pred_out(**eval_params(lang))
    print('---------------------------------')

AL


HBox(children=(IntProgress(value=0, max=1492), HTML(value='')))



#Entity in gold data: 8408
#Entity in prediction: 8491

#Correct Entity : 5438
Entity  precision: 0.6404
Entity  recall: 0.6468
Entity  F: 0.6436

#Correct Sentiment : 4841
Sentiment  precision: 0.5701
Sentiment  recall: 0.5758
Sentiment  F: 0.5729
---------------------------------
CN


HBox(children=(IntProgress(value=0, max=642), HTML(value='')))



#Entity in gold data: 1478
#Entity in prediction: 708

#Correct Entity : 294
Entity  precision: 0.4153
Entity  recall: 0.1989
Entity  F: 0.2690

#Correct Sentiment : 200
Sentiment  precision: 0.2825
Sentiment  recall: 0.1353
Sentiment  F: 0.1830
---------------------------------
EN


HBox(children=(IntProgress(value=0, max=1094), HTML(value='')))

KeyboardInterrupt: 

In [62]:
a = np.random.rand(3,3)

In [63]:
a

array([[0.58554262, 0.15672213, 0.05409078],
       [0.25433011, 0.13347871, 0.17217374],
       [0.37576197, 0.75958957, 0.09796061]])

In [79]:
np.repeat(a[:,:,None],7,axis=2)

array([[[0.58554262, 0.58554262, 0.58554262, 0.58554262, 0.58554262,
         0.58554262, 0.58554262],
        [0.15672213, 0.15672213, 0.15672213, 0.15672213, 0.15672213,
         0.15672213, 0.15672213],
        [0.05409078, 0.05409078, 0.05409078, 0.05409078, 0.05409078,
         0.05409078, 0.05409078]],

       [[0.25433011, 0.25433011, 0.25433011, 0.25433011, 0.25433011,
         0.25433011, 0.25433011],
        [0.13347871, 0.13347871, 0.13347871, 0.13347871, 0.13347871,
         0.13347871, 0.13347871],
        [0.17217374, 0.17217374, 0.17217374, 0.17217374, 0.17217374,
         0.17217374, 0.17217374]],

       [[0.37576197, 0.37576197, 0.37576197, 0.37576197, 0.37576197,
         0.37576197, 0.37576197],
        [0.75958957, 0.75958957, 0.75958957, 0.75958957, 0.75958957,
         0.75958957, 0.75958957],
        [0.09796061, 0.09796061, 0.09796061, 0.09796061, 0.09796061,
         0.09796061, 0.09796061]]])

In [204]:
b= np.arange(28,1,-1)

In [205]:
b = b.reshape(3,3,3)

In [206]:
for i in b:
    print('\n')
    for j in i:
        print('',end= '\t')
        for k in j:
            print(k,end= ' ')



	28 27 26 	25 24 23 	22 21 20 

	19 18 17 	16 15 14 	13 12 11 

	10 9 8 	7 6 5 	4 3 2 

In [207]:
b = np.max(b,axis=2)

In [221]:
b

array([[28, 25, 22],
       [19, 16, 13],
       [10,  7,  4]])

In [224]:
b.reshape(9)

array([28, 25, 22, 19, 16, 13, 10,  7,  4])

In [284]:
b[1][1] =100

In [270]:
np.dstack(np.unravel_index(b.ravel().argsort(),b.shape))

array([[[2, 2],
        [2, 1],
        [2, 0],
        [1, 2],
        [1, 0],
        [0, 2],
        [0, 1],
        [0, 0],
        [1, 1]]], dtype=int64)

In [217]:
a = np.array([[1,2],[3,4]])

array([[[1, 2]],

       [[3, 4]]])

In [258]:
a[:,0] = 3

In [377]:
a

array([[3, 2],
       [3, 4]])

In [378]:
b = np.array([10,10])

In [381]:
a+b[:,None]

array([[13, 12],
       [13, 14]])

In [382]:
a+b

array([[13, 12],
       [13, 14]])