In [1]:
from collections import defaultdict

import pandas as pd

In [2]:
class Emission:
    """Word/tag co-occurrence statistics read from a training file.

    Attributes (all populated by ``read_file``):
        df: DataFrame with columns 'Word' and 'Tag', one row per accepted
            training line, after rare words were replaced by "#UNK#".
        k: words occurring fewer than ``k`` times are replaced by "#UNK#".
        count_series: Series of row counts indexed by (Word, Tag).
    """

    def __init__(self, file, k=3):
        """Load ``file`` and build the count tables.

        Args:
            file: path of the training file.
            k: rare-word threshold passed to ``smoothen``.
        """
        self.df = None
        self.k = k
        self.count_series = None
        self.read_file(file)

    def read_file(self, file):
        """Parse ``file`` into ``self.df`` and ``self.count_series``.

        Each line is split on whitespace; only lines with exactly two fields
        (word, tag) are kept. Blank lines and lines with any other number of
        fields are ignored.
        """
        # The context manager closes the file even if parsing raises.
        with open(file, "r", encoding='UTF-8') as file_object:
            data = []
            for line in file_object:
                fields = line.split()
                if len(fields) == 2:
                    data.append(fields)
        self.df = self.smoothen(pd.DataFrame(data, columns=['Word', 'Tag']), self.k)
        self.count_series = self.df.groupby(['Word', 'Tag']).size()

    def emission(self, word, tag):
        """Return Count(tag -> word) / Count(tag) over the training rows.

        ``word`` is looked up as given; it is not remapped to "#UNK#" here,
        so a word that was smoothed away (or never seen) yields 0.

        Returns:
            The ratio as a float, or 0.0 when ``tag`` never occurs in the
            training data (instead of dividing zero by zero).
        """
        df = self.df
        is_tag = (df.Tag == tag)
        tag_total = is_tag.sum()
        if tag_total == 0:
            return 0.0
        pair_total = (df.Word[is_tag] == word).sum()
        return pair_total / tag_total

    @staticmethod
    def smoothen(df, k):
        """Replace every word occurring fewer than ``k`` times by "#UNK#".

        The replacement is done in place on ``df``; the same frame is returned
        for convenience.
        """
        word_counts = df['Word'].value_counts()  # computed once, reused below
        rare_words = word_counts[word_counts < k].index
        df.loc[df['Word'].isin(rare_words), 'Word'] = "#UNK#"
        return df

    def argmax(self, word):
        """Return the most frequent tag for ``word`` and its share.

        Note from the original author: possibly not needed in part 3.

        Args:
            word: the observed word. A word that is absent from the count
                table is treated as "#UNK#".

        Returns:
            "" when ``word`` is the empty string; otherwise a tuple
            ``(tag, probability)`` where ``probability`` is
            Count(word, tag) / Count(word).

        Raises:
            KeyError: if ``word`` is unknown and the training data contains
                no "#UNK#" rows to fall back on.
        """
        if word == "":
            return ""
        counts = self.count_series
        try:
            # Selecting on the first index level leaves a Series indexed by Tag.
            tag_counts = counts.loc[word]
        except KeyError:
            tag_counts = counts.loc["#UNK#"]
        tag = tag_counts.idxmax()
        probability = tag_counts[tag] / tag_counts.sum()
        return (tag, probability)

In [5]:
# Build the emission statistics from the English training split.
EN_Emission = Emission('./EN/train')
# Spot-check one frequent word: its emission ratio for B-VP, then its most frequent tag.
print(EN_Emission.emission(word='are', tag='B-VP'))
print(EN_Emission.argmax(word='are'))

0.03707354471277586
('B-VP', 0.9941262848751835)


In [60]:
class Transmission:
    """Tag-to-tag transition counts read from a word/tag training file.

    Every sentence is wrapped as ``#START# tag_1 ... tag_n #END#`` before
    counting.

    Attributes (populated by ``read_file``):
        transmissions: defaultdict(int) mapping (tag_u, tag_v) to the number
            of times tag_u is immediately followed by tag_v.
        count: defaultdict(int) mapping each tag (including "#START#" and
            "#END#") to its number of occurrences.
    """

    def __init__(self, file):
        """Load ``file`` and build the transition tables."""
        self.transmissions = None
        self.count = None
        self.read_file(file)

    def read_file(self, file):
        """Read the training file and rebuild both count tables.

        A blank (whitespace-only) line ends the current sentence. Non-blank
        lines are split on whitespace and must have exactly two fields
        (word, tag); other lines are ignored, which mirrors the filtering in
        ``Emission.read_file``. Runs of blank lines do not create empty
        sentences, and a final sentence without a trailing blank line is
        still counted.

        Returns:
            (trans_dict, count_u): the same two dictionaries that are stored
            on ``self.transmissions`` and ``self.count``.
        """
        trans_dict = defaultdict(int)
        count_u = defaultdict(int)
        tags = []
        with open(file, 'r', encoding='UTF-8') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Sentence boundary: flush only if something was collected.
                    if tags:
                        self._count_sentence(tags, trans_dict, count_u)
                        tags = []
                    continue
                if len(fields) != 2:
                    continue
                tags.append(fields[1])
        if tags:
            # File did not end with a blank line; the last sentence still counts.
            self._count_sentence(tags, trans_dict, count_u)
        self.transmissions = trans_dict
        self.count = count_u
        return trans_dict, count_u

    @staticmethod
    def _count_sentence(tags, trans_dict, count_u):
        """Add one sentence's tags and transitions, START/END included, to the tables."""
        path = ['#START#'] + tags + ['#END#']
        for tag in path:
            count_u[tag] += 1
        for tag_u, tag_v in zip(path, path[1:]):
            trans_dict[(tag_u, tag_v)] += 1

    def transmission_proba(self, tag_u, tag_v):
        """Return A(u, v) = Count(tag_u -> tag_v) / Count(tag_u).

        Returns 0.0 when ``tag_u`` was never seen. Lookups use ``get`` so that
        querying an unseen tag or pair does not insert a zero entry into the
        defaultdicts.
        """
        total_u = self.count.get(tag_u, 0)
        if total_u == 0:
            return 0.0
        return self.transmissions.get((tag_u, tag_v), 0) / total_u

In [61]:
# Build the transition statistics from the English training split.
EN_transmission = Transmission('./EN/train')
# Spot-checks: sentence-initial B-NP, sentence-final B-NP, and an impossible START -> START.
print(EN_transmission.transmission_proba(tag_u='#START#', tag_v='B-NP'))
print(EN_transmission.transmission_proba(tag_u='B-NP', tag_v='#END#'))
print(EN_transmission.transmission_proba(tag_u='#START#', tag_v='#START#'))

0.6480490669450607
0.00023253355882042067
0.0
