In [241]:
import pandas as pd
import numpy as np

In [242]:
class Emission:
    def __init__(self,file,k=3):
        self.df = None
        self.k = k
        self.count_series = None
        self.word_ls=None
        self.unk_ls=None
        self.read_file(file)
        pass
    def read_file(self,file):
        file_object = open(file, "r",encoding='UTF-8')
        dataset=[]
        for line in file_object:
            dataset.append(line)
        data=[]
        for i in range(len(dataset)):
            temp = dataset[i].split()
            if (len(temp)==2):
                data.append(temp)
        self.df = self.smoothen(pd.DataFrame(data, columns = ['Word', 'Tag']),self.k)
        self.count_series = self.df.groupby(['Word', 'Tag']).size()
        tag_count = self.df['Tag'].value_counts()
        self.count_series = self.count_series / (pd.Series(self.count_series.index.get_level_values('Tag').tolist(), index=self.count_series.index).apply(lambda x: tag_count[x]))
        self.word_ls = self.count_series.index.get_level_values('Word').tolist()
        self.unk_ls = self.count_series['#UNK#'].index.tolist()
        file_object.close()
    
    def emission(self,tag,word):
        try:
            return self.count_series.loc[word,tag]
        except:
            if word in self.word_ls or tag not in self.unk_ls:
                return 0.
            return self.count_series.loc['#UNK#',tag]

    @staticmethod
    def smoothen(df,k):
        ls = df['Word'].value_counts()[df['Word'].value_counts()<k].index.tolist()
        df.loc[df['Word'].isin(ls), 'Word'] = "#UNK#"
        return df
    def argmax(self,word):
        """
        This part not needed in part 3?
        produces the tag with highest probability for the word in the sequence.
        """
        if (word==""):
            return ("",None)
        try:
            value = self.count_series.loc[word].idxmax()
            probability = self.count_series.loc[word].max()
        except:
            value = self.count_series.loc["#UNK#"].idxmax()
            probability = self.count_series.loc["#UNK#"].max()
        return (value, probability)

In [243]:
EN_Emission = Emission('./EN/train')
print(EN_Emission.emission('B-VP', 'the'))
print(EN_Emission.argmax('are'))

0.0
('B-VP', 0.03707354471277586)


In [244]:
class Transmission:    
    def __init__(self,file):
        self.transmissions = None #dictionary where KEY is a tuple (tag_u,tag_v) where tag_u -> tag_v and VALUE = count
        self.count = None #dictionary where KEY is the tag and VALUE = count
        self.read_file(file)
    def read_file(self,file):
        """
        read training file and returns 2 dictionaries
        RETURNS trans_dict,count_u
        trans_dict - > dictionary where KEY is a tuple (tag_u,tag_v) where tag_u _> tag_v and VALUE = count
        count_u -> KEY is the tag and VALUE = count
        """
        from collections import defaultdict
        seq = ['#START#']
        f = open(file,'r',encoding='UTF-8')
        for line in f:
            split = line.split(' ')
            if len(split)<2:
                #this is a line break
                seq.append('#END#')
                seq.append('#START#')
                continue
            word,tag = split
            word = word.strip()
            tag = tag.strip()
            seq.append(tag)
        f.close()
        del seq[-1] #delete last item from the list
#         print(seq)
        trans_dict = defaultdict(int)
        count_u = defaultdict(int)
        for i in range(len(seq)-1):
            tag_u = seq[i]
            count_u[tag_u] += 1 # need to count #END# too
            if tag_u == "#END#":
                continue
            #if u is not #END# we count the transmission 
            tag_v = seq[i+1]
            if (tag_u =="#START#" and tag_v == "#END#"):
                #check for empty blank lines at the end and dont count them
                print('these are blank lines')
                count_u["#START#"] -= 1 #remove additional start
                break
            trans_dict[(tag_u,tag_v)] += 1
        self.transmissions = trans_dict
        self.count = count_u
        return trans_dict,count_u
    def transmission_proba(self,tag_u,tag_v):
        """
        Auv = Count(tag_u->tag_v)/Count(tag_u)
        """
        return self.transmissions[(tag_u,tag_v)]/self.count[tag_u]
    
    def vector_proba(self,ls,tag_v):
        #lazy calculation instead of using transistion matrix(in slides)
        """
        Returns vectorized formed when a list of tag_u is given 
        """
        ans = []
        for word in ls:
            ans.append(self.transmission_proba(word,tag_v))
        return np.array(ans)

In [245]:
EN_transmission = Transmission('./EN/train')
print(EN_transmission.transmission_proba('#START#','B-NP'))
print(EN_transmission.transmission_proba('B-NP','#END#'))
print(EN_transmission.transmission_proba('#START#','#START#'))

0.6480490669450607
0.00023253355882042067
0.0


In [246]:
def vertebi(word_arr,Transmission,Emission):
    """
    Followed pseudocode here
    https://en.wikipedia.org/wiki/Viterbi_algorithm#Pseudocode
    """
    S = list(Transmission.count.keys()) #set of all possible tags remove #START# and #STOP#
    S.remove('#START#')
    S.remove('#END#')
    A = Transmission.vector_proba # A(tag_u_vector,tag_v)
    B = Emission.emission # B(tag_u->word)
    T = len(S) # Total number unique tags
    N = len(word_arr) # Length of sentence make sure no #START# and #STOP#
    
    T1 = np.zeros((T,N)) #probability table of most possible path to node i.e. store scores of each node
    T2 = np.zeros((T,N)) # Table of paths where the ith row stores highest scoring paths to T1[i,j]
    
    #Handle first word and base case at the same time
    for i in range(T):
        T1[i,0] = 1 * Transmission.transmission_proba('#START#',S[i]) * B(S[i],word_arr[0])
        T2[i,0] = 0 #Path for first column is set to 0 same for all
    #Note A is vector operation
    # Fill up each column by using previous column
    # j is position of word
    for j in range(1,N):
        # i is position of tag
        #ignore #START# and #END# tag when looping
        for i in range(T):
            tag = S[i]
            #note A(S,tag_u gives a vector)
            T1[i][j] = np.max(T1[:,j-1]*A(S,tag)*B(tag,word_arr[j])) 
            T2[i][j] = np.argmax(T1[:,j-1]*A(S,tag))
    #handle last word to #END#
    #no emission of #END# 
#     print(T1)
#     print(T2)
#     print(T1[:,N-1]*A(S,'#END#'))
    best_row = np.argmax(T1[:,N-1]*A(S,'#END#'))
    ans=[]
    for j in range(1,N):
        index = int(T2[best_row][j])
#         print(S[j])
        ans.append(S[index])
    ans.append(S[best_row])
    return ans

In [247]:
word = ['The','dog','is','a','good','boy', '.']

In [248]:
# %%timeit
vertebi(word,EN_transmission,EN_Emission)

['B-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP', 'O']

In [249]:
EN_Emission.emission('O',"Trump")

0.0

In [250]:
EN_Emission.count_series

Word     Tag   
!        O         0.000628
#        B-ADJP    0.001713
         B-NP      0.000423
         I-NP      0.000183
         O         0.000042
                     ...   
young    I-NP      0.000128
younger  B-NP      0.000085
         I-NP      0.000092
your     B-NP      0.000803
         I-NP      0.000037
Length: 12171, dtype: float64

In [251]:
print(word)

['The', 'dog', 'is', 'a', 'good', 'boy', '.']


In [252]:
a = [1,2,3,4,5]

In [253]:
a[1:-1]

[2, 3, 4]

In [254]:
file_object = open("./EN/dev.in", "r")
ls=[[]]
index=0
test=[]
for line in file_object:
    test.append(line.strip())
    if (line.strip()==""):
        ls.append([])
        index+=1
    else:
        ls[index].append(line.strip())
ls.pop(-1)
EN_test_df = pd.DataFrame(test, columns = ['Word'])
EN_test_df

Unnamed: 0,Word
0,HBO
1,has
2,close
3,to
4,24
...,...
27220,were
27221,in
27222,Congress
27223,.


In [255]:
vertebi(ls[1],EN_transmission,EN_Emission)
# print(len(ls))

['B-NP',
 'I-NP',
 'I-NP',
 'B-PP',
 'B-NP',
 'I-NP',
 'I-NP',
 'B-NP',
 'I-NP',
 'I-NP',
 'O']

In [256]:
from tqdm.notebook import tqdm
predict=[]
for i in tqdm(ls):
    for j in vertebi(i,EN_transmission,EN_Emission):
        predict.append(j)
    predict.append("")
EN_test_df['Tag'] = predict
EN_test_df

HBox(children=(IntProgress(value=0, max=1094), HTML(value='')))




Unnamed: 0,Word,Tag
0,HBO,B-NP
1,has,B-VP
2,close,I-VP
3,to,I-VP
4,24,B-NP
...,...,...
27220,were,B-VP
27221,in,B-PP
27222,Congress,B-NP
27223,.,O


In [257]:
EN_test_df.to_csv("./EN/dev.p3.out", sep=" ", index=False, header=False)

In [258]:
!python3 ./EvalScript/evalResult.py ./EN/dev.out ./EN/dev.p3.out


#Entity in gold data: 13179
#Entity in prediction: 13730

#Correct Entity : 10567
Entity  precision: 0.7696
Entity  recall: 0.8018
Entity  F: 0.7854

#Correct Sentiment : 9537
Sentiment  precision: 0.6946
Sentiment  recall: 0.7237
Sentiment  F: 0.7088
