In [5]:
import numpy as np
import pandas as pd

In [6]:
class Hmm:
    def __init__(self,file,k=3):
        self.words = None #set of unique words
        self.tag_word_count = None # dict((tag,word),count)
        self.transmissions = None # dict((tag_u,tag_v),count)
        self.count = None # dict(tag,count)
        self.read_file(file,k)
        self.tags = set(self.count.keys())
        
        self.word_ls = tuple(self.words)#tuple cause immutable
        self._tag_ls = tuple(self.tags)
        #move #START# to first row and #END# to last row
        ls = list(self._tag_ls)
        ls.remove('#START#')
        ls.remove('#END#')
        ls.insert(0,'#START#')
        ls.append('#END#')
        self.tag_ls = tuple(ls)
        
        self.make_matrix()
    
    def make_matrix(self):
        tag_length = len(self.tag_ls)
        transition_matrix = np.zeros((tag_length,tag_length))
        
        for i in range(tag_length):
            for j in range(tag_length):
                tag_u = self.tag_ls[i]
                tag_v = self.tag_ls[j]
                transition_matrix[i][j] = self.transmissions[(tag_u,tag_v)]/self.count[tag_u]
        self.transition_matrix = pd.DataFrame(transition_matrix,index=self.tag_ls,columns=self.tag_ls)
        
        word_length = len(self.word_ls)
        em_matrix = np.zeros((tag_length,word_length))
        for i in range(tag_length):
            for j in range(word_length):
                tag = self.tag_ls[i]
                word = self.word_ls[j]
                em_matrix[i][j] = self.tag_word_count[(tag,word)]/self.count[tag]
        self.em_matrix = pd.DataFrame(em_matrix,index=self.tag_ls,columns=self.word_ls)
        pass
    
    def read_file(self,file,k):
        from collections import defaultdict
        seq = ['#START#']
        f = open(file,'r',encoding='UTF-8')
        tag_word_ls = []
        word_count = defaultdict(int)
        for line in f:
            split = line.split(' ')
            if len(split)<2:
                #this is a line break
                seq.append('#END#')
                seq.append('#START#')
                continue
            word,tag = split
            word = word.strip()
            tag = tag.strip()
            tag_word_ls.append([tag,word])
            word_count[word]+=1
            seq.append(tag)
        f.close()
        
        #Emissions
        for i in range(len(tag_word_ls)):
            tag,word = tag_word_ls[i]
            if word_count[word]<k:
                tag_word_ls[i] = [tag,'#UNK#']
        tag_word_count = defaultdict(int)
        
        words = []
        for tag,word in tag_word_ls:
            tag_word_count[tag,word]+=1
            words.append(word)
        self.words = set(words)
        self.tag_word_count= tag_word_count
        
        #Transistions
        del seq[-1] #delete last item from the list
         #print(seq)
        trans_dict = defaultdict(int)
        count_u = defaultdict(int)
        for i in range(len(seq)-1):
            tag_u = seq[i]
            count_u[tag_u] += 1 # need to count #END# too
            if tag_u == "#END#":
                continue
            #if u is not #END# we count the transmission 
            tag_v = seq[i+1]
            if (tag_u =="#START#" and tag_v == "#END#"):
                #check for empty blank lines at the end and dont count them
                print('these are blank lines')
                count_u["#START#"] -= 1 #remove additional start
                break
            trans_dict[(tag_u,tag_v)] += 1
        self.transmissions = trans_dict
        self.count = count_u

In [7]:
EN = Hmm('./EN/train')

In [8]:
EN.transition_matrix.loc['#START#','B-VP']

0.018661098786376094

In [9]:
EN.transition_matrix.values[0,:]

array([0.00000000e+00, 5.42868328e-02, 1.08704163e-01, 2.60994389e-04,
       0.00000000e+00, 2.25760146e-02, 6.48049067e-01, 3.26242986e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       1.30497194e-03, 1.86610988e-02, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.41850450e-01, 0.00000000e+00,
       1.04397755e-03, 0.00000000e+00, 0.00000000e+00])

In [10]:
EN.transition_matrix

Unnamed: 0,#START#,B-ADVP,B-PP,B-CONJP,I-VP,B-SBAR,B-NP,B-ADJP,I-SBAR,I-ADVP,...,B-VP,I-ADJP,I-CONJP,I-PP,I-NP,O,I-INTJ,B-LST,B-PRT,#END#
#START#,0.0,0.054287,0.108704,0.000261,0.0,0.022576,0.648049,0.003262,0.0,0.0,...,0.018661,0.0,0.0,0.0,0.0,0.14185,0.0,0.001044,0.0,0.0
B-ADVP,0.0,0.016269,0.170547,0.000561,0.0,0.016269,0.210379,0.01655,0.0,0.086957,...,0.215989,0.0,0.0,0.0,0.0,0.265358,0.0,0.0,0.000281,0.000842
B-PP,0.0,0.003318,0.018491,0.0,0.0,0.000979,0.928047,0.002611,0.0,0.0,...,0.026595,0.0,0.0,0.011258,0.0,0.008484,0.0,0.0,0.0,0.000218
B-CONJP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.938776,0.0,0.0,0.061224,0.0,0.0,0.0,0.0
I-VP,0.0,0.037602,0.148341,0.000197,0.327887,0.01319,0.35535,0.029038,0.0,0.0,...,0.008269,0.0,0.0,0.0,0.0,0.056698,0.0,0.0,0.023329,9.8e-05
B-SBAR,0.0,0.008952,0.014218,0.0,0.0,0.008425,0.872565,0.00316,0.025276,0.0,...,0.038441,0.0,0.0,0.0,0.0,0.028436,0.0,0.000527,0.0,0.0
B-NP,0.0,0.009809,0.058007,8.5e-05,0.0,0.003403,0.028898,0.003213,0.0,0.0,...,0.130303,0.0,0.0,0.0,0.684706,0.080964,0.0,0.0,0.000359,0.000233
B-ADJP,0.0,0.015991,0.244432,0.0,0.0,0.037693,0.05197,0.001142,0.0,0.0,...,0.110794,0.27984,0.0,0.0,0.0,0.256996,0.0,0.0,0.000571,0.000571
I-SBAR,0.0,0.0,0.020833,0.0,0.0,0.0,0.958333,0.0,0.0,0.0,...,0.020833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-ADVP,0.0,0.030303,0.15427,0.0,0.0,0.07989,0.187328,0.016529,0.0,0.146006,...,0.057851,0.0,0.0,0.0,0.0,0.327824,0.0,0.0,0.0,0.0


In [11]:
EN.transition_matrix.loc['#START#','B-NP']

0.6480490669450607

In [72]:
def log(m):
    m = np.clip(m, 1e-32, None)
    x = np.log(m)
    return x

In [205]:
def vertebi_k(word_arr,Hmm,k=7):
    """
    Followed pseudocode here
    https://en.wikipedia.org/wiki/Viterbi_algorithm#Pseudocode
    """
    S = Hmm.tag_ls[1:-1] #set of all possible tags remove #START# and #STOP#
    
    A = Hmm.transition_matrix.values[1:-1,1:-1] # A(tag_u_vector,tag_v)
    B = Hmm.em_matrix[1:-1] # B(tag_u->word)
    
    T = len(S) # Total number unique tags
    N = len(word_arr) # Length of sentence make sure no #START# and #STOP#
    
    T1 = np.zeros((T,N,k)) #probability table of most possible path to node i.e. store scores of each node
    T2 = np.zeros((T,N,2)) # Table of paths where the ith row stores highest scoring paths to T1[i,j]
    
    #Handle first word and base case at the same time
    word = word_arr[0]
    if word not in Hmm.words:
        word = '#UNK#'
        
    T1[:,0,:] = log(Hmm.transition_matrix.loc['#START#'][1:-1].values*B[word].values)[:,None]
    
    #Note A is vector operation
    # Fill up each column by using previous column
    # j is position of word
    for j in range(1,N):
        # i is position of tag
        #ignore #START# and #END# tag when looping
        word = word_arr[j]
        if word not in Hmm.words:
            word = '#UNK#'
        for i in range(T):
#             print((T1[:,j-1]+log(A[:,i]*B.loc[S[i],word])[:,None]).shape)
#             print(np.sort(T1[:,j-1]+log(A[:,i]*B.loc[S[i],word])[:,None],axis=0)[::-1].shape)
            T1[i][j] = np.sort((T1[:,j-1]+log(A[:,i]*B.loc[S[i],word])[:,None]).ravel())[::-1][:k]
            T2[i,j,:] = np.dstack(np.unravel_index(np.argsort((T1[:,j-1]+log(A[:,i])[:,None]).ravel()),(T,k)))[0][::-1][:k]
            
#             T1[i][j] = np.sort(T1[:,j-1]+log(A[:,i]*B.loc[S[i],word]))[::-1][:k]
#             T2[i][j] = np.argsort(T1[:,j-1]+log(A[:,i]))[::-1][:k]
        
    #handle last word to #END#
    #no emission of #END# 
#     print(T1)
#     print(T2)

    best_row = np.argmax(T1[:,N-1]+log(Hmm.transition_matrix['#END#'].values[1:-1]))
    ans=[]
    curr_index = best_row
    ans.append(S[curr_index])
    for j in range(N-1,0,-1):
        prev_index = T2[int(curr_index)][j]
        ans.append(S[int(prev_index)])
        curr_index = prev_index
#         print(S[j])
    ans = ans[::-1]
#     print(T1)
    return ans

In [206]:
word = "Trump is the best president in the world".split(' ')
len(word)

8

In [207]:
vertebi_k(word,EN)

ValueError: could not broadcast input array from shape (7,2) into shape (2)

In [40]:
LANG = ['AL','CN','EN','SG']
eval_params = lambda lang: {'devin':f'./{lang}/dev.in','devout':f'./{lang}/dev.p3.out','ground_truth':f'./{lang}/dev.out','trainfile':f'./{lang}/train'}

In [41]:
import os
def pred_out(devin,devout,ground_truth,trainfile):
    H = Hmm(trainfile)
    file_object = open(devin, "r",encoding='UTF-8',)
    ls=[[]]
    index=0
    test=[]
    for line in file_object:
        test.append(line.strip())
        if (line.strip()==""):
            ls.append([])
            index+=1
        else:
            ls[index].append(line.strip())
    ls.pop(-1)
    df = pd.DataFrame(test, columns = ['Word'])
    
    from tqdm.notebook import tqdm
    predict=[]
    for i in tqdm(ls):
        for j in vertebi_k(i,H):
            predict.append(j)
        predict.append("")
    df['Tag'] = predict
    
    df.to_csv(devout, sep=" ", index=False, header=False)
    
    if os.name == 'nt':#if it is on windows
        !python ./EvalScript/evalResult.py {ground_truth} {devout}
    else:
        !python3 ./EvalScript/evalResult.py {ground_truth} {devout}

In [42]:
for lang in LANG:
    print(lang)
    pred_out(**eval_params(lang))
    print('---------------------------------')

AL


HBox(children=(IntProgress(value=0, max=1492), HTML(value='')))



#Entity in gold data: 8408
#Entity in prediction: 8498

#Correct Entity : 6740
Entity  precision: 0.7931
Entity  recall: 0.8016
Entity  F: 0.7974

#Correct Sentiment : 6087
Sentiment  precision: 0.7163
Sentiment  recall: 0.7240
Sentiment  F: 0.7201
---------------------------------
CN


HBox(children=(IntProgress(value=0, max=642), HTML(value='')))

KeyboardInterrupt: 

In [62]:
a = np.random.rand(3,3)

In [63]:
a

array([[0.58554262, 0.15672213, 0.05409078],
       [0.25433011, 0.13347871, 0.17217374],
       [0.37576197, 0.75958957, 0.09796061]])

In [79]:
np.repeat(a[:,:,None],7,axis=2)

array([[[0.58554262, 0.58554262, 0.58554262, 0.58554262, 0.58554262,
         0.58554262, 0.58554262],
        [0.15672213, 0.15672213, 0.15672213, 0.15672213, 0.15672213,
         0.15672213, 0.15672213],
        [0.05409078, 0.05409078, 0.05409078, 0.05409078, 0.05409078,
         0.05409078, 0.05409078]],

       [[0.25433011, 0.25433011, 0.25433011, 0.25433011, 0.25433011,
         0.25433011, 0.25433011],
        [0.13347871, 0.13347871, 0.13347871, 0.13347871, 0.13347871,
         0.13347871, 0.13347871],
        [0.17217374, 0.17217374, 0.17217374, 0.17217374, 0.17217374,
         0.17217374, 0.17217374]],

       [[0.37576197, 0.37576197, 0.37576197, 0.37576197, 0.37576197,
         0.37576197, 0.37576197],
        [0.75958957, 0.75958957, 0.75958957, 0.75958957, 0.75958957,
         0.75958957, 0.75958957],
        [0.09796061, 0.09796061, 0.09796061, 0.09796061, 0.09796061,
         0.09796061, 0.09796061]]])

In [204]:
b= np.arange(28,1,-1)

In [205]:
b = b.reshape(3,3,3)

In [206]:
for i in b:
    print('\n')
    for j in i:
        print('',end= '\t')
        for k in j:
            print(k,end= ' ')



	28 27 26 	25 24 23 	22 21 20 

	19 18 17 	16 15 14 	13 12 11 

	10 9 8 	7 6 5 	4 3 2 

In [207]:
b = np.max(b,axis=2)

In [221]:
b

array([[28, 25, 22],
       [19, 16, 13],
       [10,  7,  4]])

In [224]:
b.reshape(9)

array([28, 25, 22, 19, 16, 13, 10,  7,  4])

In [284]:
b[1][1] =100

In [270]:
np.dstack(np.unravel_index(b.ravel().argsort(),b.shape))

array([[[2, 2],
        [2, 1],
        [2, 0],
        [1, 2],
        [1, 0],
        [0, 2],
        [0, 1],
        [0, 0],
        [1, 1]]], dtype=int64)

In [285]:
b

array([[  4,   7,  10],
       [ 13, 100,  22],
       [ 25,  28, 100]])

In [297]:
np.sort(b.ravel())

array([  4,   7,  10,  13,  22,  25,  28, 100, 100])

In [279]:
np.sort(b.ravel())

array([  4,   7,  10,  13,  19,  22,  25,  28, 100])

In [289]:
b.ravel().argsort()

array([0, 1, 2, 3, 5, 6, 7, 4, 8], dtype=int64)

In [295]:
np.sort(b.ravel())

array([  4,   7,  10,  13,  22,  25,  28, 100, 100])

In [52]:
a = np.array([[1,2],[3,4]])

In [53]:
b =  np.array([1,2])

In [54]:
a+b

array([[2, 4],
       [4, 6]])

In [55]:
b.shape

(2,)

In [56]:
a.shape

(2, 2)

In [97]:
a = np.random.rand(21,7)

In [98]:
b = np.random.rand(21,)

In [99]:
a

array([[0.43763193, 0.2942224 , 0.14604834, 0.14865289, 0.10056121,
        0.63419056, 0.82718944],
       [0.32016971, 0.9008598 , 0.62396608, 0.37911076, 0.45469811,
        0.79266094, 0.25555683],
       [0.76534203, 0.78968928, 0.32983063, 0.21812949, 0.20154187,
        0.79341467, 0.33641871],
       [0.35947107, 0.74394   , 0.08892166, 0.8499492 , 0.37100268,
        0.66028738, 0.01629692],
       [0.53681829, 0.17292918, 0.99911889, 0.33802716, 0.13687957,
        0.96098373, 0.2357746 ],
       [0.07010023, 0.17386769, 0.55485796, 0.99067451, 0.64811139,
        0.44932112, 0.43694332],
       [0.27465857, 0.89330206, 0.3862332 , 0.24831904, 0.28223968,
        0.14811862, 0.74228863],
       [0.16324043, 0.32356391, 0.63227476, 0.13206381, 0.07465467,
        0.65916669, 0.55699379],
       [0.69473633, 0.37262897, 0.12420884, 0.39907379, 0.71729547,
        0.08297081, 0.10463308],
       [0.4181538 , 0.78176689, 0.59216056, 0.9019243 , 0.31953277,
        0.4287263 , 0.4

In [100]:
b

array([0.90603144, 0.04727357, 0.63450097, 0.21101138, 0.7411254 ,
       0.32596287, 0.58573164, 0.49107245, 0.24514114, 0.91590703,
       0.75862623, 0.37531515, 0.86220542, 0.85173559, 0.71639988,
       0.05787269, 0.95064001, 0.77071938, 0.86489043, 0.13326636,
       0.88307819])

In [114]:
(a + b[:,None]).shape

(21, 7)