In [9]:
import pandas as pd
import numpy as np

In [10]:
class Emission:
    def __init__(self,file,k=3):
        self.df = None
        self.k = k
        self.count_series = None
        self.word_ls=None
        self.unk_ls=None
        self.dic=None
        self.read_file(file)
        pass
    def read_file(self,file):
        file_object = open(file, "r",encoding='UTF-8')
        dataset=[]
        for line in file_object:
            dataset.append(line)
        data=[]
        for i in range(len(dataset)):
            temp = dataset[i].split()
            if (len(temp)==2):
                data.append(temp)
        self.df = self.smoothen(pd.DataFrame(data, columns = ['Word', 'Tag']),self.k)
        self.count_series = self.df.groupby(['Word', 'Tag']).size()
        tag_count = self.df['Tag'].value_counts()
        self.count_series = self.count_series / (pd.Series(self.count_series.index.get_level_values('Tag').tolist(), index=self.count_series.index).apply(lambda x: tag_count[x]))
        self.word_ls = self.count_series.index.get_level_values('Word').tolist()
        self.unk_ls = self.count_series['#UNK#'].index.tolist()
        self.dic=self.count_series.to_dict()
        file_object.close()
    
    def emission(self,tag,word):
        if (word,tag) in self.dic:
            return self.dic[word,tag]
        else:
            if tag not in self.unk_ls or word in self.word_ls:
                return 0.
            return self.dic['#UNK#',tag]

    @staticmethod
    def smoothen(df,k):
        ls = df['Word'].value_counts()[df['Word'].value_counts()<k].index.tolist()
        df.loc[df['Word'].isin(ls), 'Word'] = "#UNK#"
        return df
    def argmax(self,word):
        """
        This part not needed in part 3?
        produces the tag with highest probability for the word in the sequence.
        """
        if (word==""):
            return ("",None)
        try:
            value = self.count_series.loc[word].idxmax()
            probability = self.count_series.loc[word].max()
        except:
            value = self.count_series.loc["#UNK#"].idxmax()
            probability = self.count_series.loc["#UNK#"].max()
        return (value, probability)

In [11]:
EN_Emission = Emission('./EN/train')
print(EN_Emission.emission('I-NP', 'Trump'))
# print(EN_Emission.argmax('are'))

0.0001282262644025572


In [12]:
AL_emission = Emission('./AL/train')
print(AL_emission.count_series)
print(AL_emission.emission('I-SUBROAD','龙'))
AL_emission.unk_ls

Word   Tag        
#UNK#  B-ASSIST       0.004994
       B-CELLNO       0.003771
       B-CITY         0.003160
       B-COMMUNITY    0.031778
       B-DEVZONE      0.004988
                        ...   
龙      I-SUBROAD      0.002801
       I-TOWN         0.002512
龚      B-POI          0.000680
       I-POI          0.000035
       I-ROAD         0.000066
Length: 13781, dtype: float64
0.0028011204481792717


['B-ASSIST',
 'B-CELLNO',
 'B-CITY',
 'B-COMMUNITY',
 'B-DEVZONE',
 'B-DISTRICT',
 'B-FLOORNO',
 'B-HOUSENO',
 'B-PERSON',
 'B-POI',
 'B-REDUNDANT',
 'B-ROAD',
 'B-ROADNO',
 'B-ROOMNO',
 'B-SUBPOI',
 'B-SUBROAD',
 'B-SUBROADNO',
 'B-TOWN',
 'I-ASSIST',
 'I-CITY',
 'I-COMMUNITY',
 'I-DEVZONE',
 'I-DISTRICT',
 'I-FLOORNO',
 'I-HOUSENO',
 'I-PERSON',
 'I-POI',
 'I-REDUNDANT',
 'I-ROAD',
 'I-ROADNO',
 'I-ROOMNO',
 'I-SUBPOI',
 'I-SUBROAD',
 'I-SUBROADNO',
 'I-TOWN']

In [13]:
class Transmission:    
    def __init__(self,file):
        self.transmissions = None #dictionary where KEY is a tuple (tag_u,tag_v) where tag_u -> tag_v and VALUE = count
        self.count = None #dictionary where KEY is the tag and VALUE = count
        self.read_file(file)
    def read_file(self,file):
        """
        read training file and returns 2 dictionaries
        RETURNS trans_dict,count_u
        trans_dict - > dictionary where KEY is a tuple (tag_u,tag_v) where tag_u _> tag_v and VALUE = count
        count_u -> KEY is the tag and VALUE = count
        """
        from collections import defaultdict
        seq = ['#START#']
        f = open(file,'r',encoding='UTF-8')
        for line in f:
            split = line.split(' ')
            if len(split)<2:
                #this is a line break
                seq.append('#END#')
                seq.append('#START#')
                continue
            word,tag = split
            word = word.strip()
            tag = tag.strip()
            seq.append(tag)
        f.close()
        del seq[-1] #delete last item from the list
#         print(seq)
        trans_dict = defaultdict(int)
        count_u = defaultdict(int)
        for i in range(len(seq)-1):
            tag_u = seq[i]
            count_u[tag_u] += 1 # need to count #END# too
            if tag_u == "#END#":
                continue
            #if u is not #END# we count the transmission 
            tag_v = seq[i+1]
            if (tag_u =="#START#" and tag_v == "#END#"):
                #check for empty blank lines at the end and dont count them
                print('these are blank lines')
                count_u["#START#"] -= 1 #remove additional start
                break
            trans_dict[(tag_u,tag_v)] += 1
        self.transmissions = trans_dict
        self.count = count_u
        return trans_dict,count_u
    def transmission_proba(self,tag_u,tag_v):
        """
        Auv = Count(tag_u->tag_v)/Count(tag_u)
        """
        return self.transmissions[(tag_u,tag_v)]/self.count[tag_u]
    
    def vector_proba(self,ls,tag_v):
        #lazy calculation instead of using transistion matrix(in slides)
        """
        Returns vectorized formed when a list of tag_u is given 
        """
        ans = []
        for word in ls:
            ans.append(self.transmission_proba(word,tag_v))
        return np.array(ans)

In [14]:
EN_transmission = Transmission('./EN/train')
print(EN_transmission.transmission_proba('#START#','B-VP'))
print(EN_transmission.transmission_proba('B-NP','#END#'))
print(EN_transmission.transmission_proba('#START#','#START#'))

0.018661098786376094
0.00023253355882042067
0.0


In [15]:
AL_transmission = Transmission('./AL/train')
print(AL_transmission.transmission_proba('#START#','#START#'))
AL_transmission.count

0.0


defaultdict(int,
            {'#START#': 10448,
             'B-DISTRICT': 6856,
             'I-DISTRICT': 13364,
             'B-POI': 7348,
             'I-POI': 28472,
             'B-HOUSENO': 3455,
             'I-HOUSENO': 3045,
             'B-CELLNO': 1326,
             'I-CELLNO': 1488,
             '#END#': 10447,
             'B-CITY': 5697,
             'I-CITY': 10430,
             'B-ROAD': 6324,
             'I-ROAD': 15178,
             'B-REDUNDANT': 4137,
             'I-REDUNDANT': 3668,
             'B-PROV': 4488,
             'I-PROV': 8666,
             'B-SUBROAD': 387,
             'I-SUBROAD': 714,
             'B-SUBROADNO': 202,
             'I-SUBROADNO': 186,
             'B-ROADNO': 5061,
             'I-ROADNO': 4666,
             'B-SUBPOI': 1650,
             'I-SUBPOI': 4557,
             'B-TOWN': 4612,
             'I-TOWN': 10748,
             'B-ROOMNO': 3107,
             'B-COMMUNITY': 1479,
             'I-COMMUNITY': 3148,
             'B-FLO

In [25]:
def vertebi(word_arr,Transmission,Emission):
    """
    Followed pseudocode here
    https://en.wikipedia.org/wiki/Viterbi_algorithm#Pseudocode
    """
    S = list(Transmission.count.keys()) #set of all possible tags remove #START# and #STOP#
    S.remove('#START#')
    S.remove('#END#')
    A = Transmission.vector_proba # A(tag_u_vector,tag_v)
    B = Emission.emission # B(tag_u->word)
    T = len(S) # Total number unique tags
    N = len(word_arr) # Length of sentence make sure no #START# and #STOP#
    
    T1 = np.zeros((T,N)) #probability table of most possible path to node i.e. store scores of each node
    T2 = np.zeros((T,N)) # Table of paths where the ith row stores highest scoring paths to T1[i,j]
    
    #Handle first word and base case at the same time
    for i in range(T):
        T1[i,0] = 1 * Transmission.transmission_proba('#START#',S[i]) * B(S[i],word_arr[0])
        T2[i,0] = 0 #Path for first column is set to 0 same for all
    #Note A is vector operation
    # Fill up each column by using previous column
    # j is position of word
    for j in range(1,N):
        # i is position of tag
        #ignore #START# and #END# tag when looping
        for i in range(T):
            tag = S[i]
            #note A(S,tag_u gives a vector)
            T1[i][j] = np.max(T1[:,j-1]*A(S,tag)*B(tag,word_arr[j])) 
            T2[i][j] = np.argmax(T1[:,j-1]*A(S,tag))
    #handle last word to #END#
    #no emission of #END# 
#     print(T1[:,N-1]*A(S,'#END#'))
    best_row = np.argmax(T1[:,N-1]*A(S,'#END#'))
    ans=[]
    for j in range(1,N):
        index = int(T2[best_row][j])
#         print(S[j])
        ans.append(S[index])
#     print(T1)
#     print(T2)
    ans.append(S[best_row])
    return ans

In [26]:
word = ['The','dog','is','good']

In [27]:
word1 = ['杭','州','市','西','湖','区','古','荡','新','c','西','34','幢','八','单','元','1375']
# word1 = ['杭','州','市','西','湖','区']
# vertebi(word1,AL_transmission,AL_emission)

In [28]:
# %%timeit
word = "Trump is the best president in the world".split(' ')
vertebi(word,EN_transmission,EN_Emission)

['B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP']

In [29]:
EN_Emission.emission('O',"Trump")

0.0

In [30]:
EN_Emission.count_series

Word     Tag   
!        O         0.000628
#        B-ADJP    0.001713
         B-NP      0.000423
         I-NP      0.000183
         O         0.000042
                     ...   
young    I-NP      0.000128
younger  B-NP      0.000085
         I-NP      0.000092
your     B-NP      0.000803
         I-NP      0.000037
Length: 12171, dtype: float64

In [31]:
print(word)

['Trump', 'is', 'the', 'best', 'president', 'in', 'the', 'world']


In [32]:
LANG = ['AL','CN','EN','SG']
eval_params = lambda lang: {'devin':f'./{lang}/dev.in','devout':f'./{lang}/dev.p3.out','ground_truth':f'./{lang}/dev.out','trainfile':f'./{lang}/train'}

In [33]:
import os
def pred_out(devin,devout,ground_truth,trainfile):
    Em = Emission(trainfile)
    Trans = Transmission(trainfile)
    
    
    file_object = open(devin, "r",encoding='UTF-8',)
    ls=[[]]
    index=0
    test=[]
    for line in file_object:
        test.append(line.strip())
        if (line.strip()==""):
            ls.append([])
            index+=1
        else:
            ls[index].append(line.strip())
    ls.pop(-1)
    df = pd.DataFrame(test, columns = ['Word'])
    
    from tqdm.notebook import tqdm
    predict=[]
    for i in tqdm(ls):
        for j in vertebi(i,Trans,Em):
            predict.append(j)
        predict.append("")
    df['Tag'] = predict
    
    df.to_csv(devout, sep=" ", index=False, header=False)
    
    if os.name == 'nt':#if it is on windows
        !python ./EvalScript/evalResult.py {ground_truth} {devout}
    else:
        !python3 ./EvalScript/evalResult.py {ground_truth} {devout}

In [None]:
for lang in LANG:
    pred_out(**eval_params(lang))

HBox(children=(IntProgress(value=0, max=1492), HTML(value='')))



#Entity in gold data: 8408
#Entity in prediction: 17746

#Correct Entity : 2706
Entity  precision: 0.1525
Entity  recall: 0.3218
Entity  F: 0.2069

#Correct Sentiment : 1634
Sentiment  precision: 0.0921
Sentiment  recall: 0.1943
Sentiment  F: 0.1250


HBox(children=(IntProgress(value=0, max=642), HTML(value='')))



#Entity in gold data: 1478
#Entity in prediction: 886

#Correct Entity : 175
Entity  precision: 0.1975
Entity  recall: 0.1184
Entity  F: 0.1481

#Correct Sentiment : 119
Sentiment  precision: 0.1343
Sentiment  recall: 0.0805
Sentiment  F: 0.1007


HBox(children=(IntProgress(value=0, max=1094), HTML(value='')))



#Entity in gold data: 13179
#Entity in prediction: 13730

#Correct Entity : 10567
Entity  precision: 0.7696
Entity  recall: 0.8018
Entity  F: 0.7854

#Correct Sentiment : 9537
Sentiment  precision: 0.6946
Sentiment  recall: 0.7237
Sentiment  F: 0.7088


HBox(children=(IntProgress(value=0, max=3107), HTML(value='')))

In [None]:
pred_out(**eval_params('AL'))

In [None]:
file_object = open("./EN/dev.in", "r")
ls=[[]]
index=0
test=[]
for line in file_object:
    test.append(line.strip())
    if (line.strip()==""):
        ls.append([])
        index+=1
    else:
        ls[index].append(line.strip())
ls.pop(-1)
EN_test_df = pd.DataFrame(test, columns = ['Word'])
EN_test_df

In [None]:
vertebi(ls[1],EN_transmission,EN_Emission)
# print(len(ls))

In [None]:
from tqdm.notebook import tqdm
predict=[]
for i in tqdm(ls):
    for j in vertebi(i,EN_transmission,EN_Emission):
        predict.append(j)
    predict.append("")
EN_test_df['Tag'] = predict
EN_test_df

In [None]:
EN_test_df.to_csv("./EN/dev.p3.out", sep=" ", index=False, header=False)

In [None]:
!python ./EvalScript/evalResult.py ./EN/dev.out ./EN/dev.p3.out

In [7]:
import os

In [8]:
os.name

'nt'

0