In [84]:
import pandas as pd
import numpy as np 


## Define functions

In [85]:
def load_data(path):
    """Read a token-per-line file into per-sentence tag and observation lists.

    Each non-blank line is split on single spaces. A line with one field is
    treated as an unlabelled observation; otherwise the first field is the
    observation and the second field is its tag. A line consisting only of a
    newline ends the current sentence.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    (tags, observations) : tuple of list of list of str
        Parallel lists with one inner list per sentence. For unlabelled
        files the inner tag lists are empty.
    """
    tags = []
    observations = []
    tag_temp = []
    obs_temp = []

    with open(path) as f:
        # Iterate lazily instead of materialising the file with readlines().
        for line in f:
            if line == '\n':
                tags.append(tag_temp)
                observations.append(obs_temp)
                tag_temp = []
                obs_temp = []
                continue

            tag_obs_lst = line.split(' ')
            if len(tag_obs_lst) == 1:
                obs_temp.append(tag_obs_lst[0].split('\n')[0])
            else:
                tag_temp.append(tag_obs_lst[1].split('\n')[0])
                obs_temp.append(tag_obs_lst[0])

    # A file that does not end with a blank line still has a pending
    # sentence; flush it so it is not silently dropped.
    if obs_temp:
        tags.append(tag_temp)
        observations.append(obs_temp)

    return tags, observations




In [86]:
def tag_lib(tags):
    """Return the distinct tags found in ``tags`` as a sorted list.

    Parameters
    ----------
    tags : iterable of iterable
        One inner sequence of tags per sentence.

    Returns
    -------
    list
        Every distinct tag exactly once. The list is sorted so that the
        order (and therefore any index mapping built from it) is the same
        on every run; plain ``list(set(...))`` order is not stable for
        strings across interpreter sessions.
    """
    return sorted({t for tag in tags for t in tag})
def obs_lib(observations):
    """Return the distinct observations found in ``observations``, sorted.

    Parameters
    ----------
    observations : iterable of iterable
        One inner sequence of observations per sentence.

    Returns
    -------
    list
        Every distinct observation exactly once. The list is sorted so that
        the order (and any index mapping built from it) is reproducible
        between runs; plain ``list(set(...))`` order is not stable for
        strings across interpreter sessions.
    """
    return sorted({obs for observation in observations for obs in observation})

def map(lib):
    """Build a lookup from each element of ``lib`` to its position.

    Note that this definition shadows Python's built-in ``map`` for the
    rest of the notebook.

    Parameters
    ----------
    lib : list
        Elements to index.

    Returns
    -------
    dict
        ``{element: position}``; if an element occurs more than once, the
        position of its last occurrence is kept.
    """
    return {item: position for position, item in enumerate(lib)}

def label(data, dictionary):
    """Translate every element of every sequence through ``dictionary``.

    Parameters
    ----------
    data : iterable of iterable
        Sequences whose elements are keys of ``dictionary``.
    dictionary : dict
        Lookup applied to each element.

    Returns
    -------
    list of list
        Same nesting as ``data`` with each element replaced by
        ``dictionary[element]``.

    Raises
    ------
    KeyError
        If an element is not a key of ``dictionary``.
    """
    return [[dictionary[item] for item in sequence] for sequence in data]


In [87]:
def get_emission_para(obslabel, taglabel, obslib, taglib, unk_count=0.5):
    """Estimate row-normalised emission parameters from labelled sequences.

    Parameters
    ----------
    obslabel : list of list of int
        Observation indices, one inner list per sentence.
    taglabel : list of list of int
        Tag indices aligned element-by-element with ``obslabel``.
    obslib : list
        Known observations; only its length is used.
    taglib : list
        Known tags; only its length is used.
    unk_count : float, optional
        Pseudo-count placed in the extra last column for every tag
        (default 0.5, the value previously hard-coded). It should be
        positive so that every row has a non-zero total.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(taglib), len(obslib) + 1)``. Entry
        ``[tag, obs]`` is the count of that pair divided by the row total;
        the last column holds the share given to ``unk_count``.
    """
    emission_matrix = np.zeros(shape=(len(taglib), len(obslib) + 1))

    for tag_seq, obs_seq in zip(taglabel, obslabel):
        for tag_idx, obs_idx in zip(tag_seq, obs_seq):
            emission_matrix[tag_idx, obs_idx] += 1

    # The extra last column is the slot looked up for unseen observations.
    emission_matrix[:, -1] = unk_count

    return emission_matrix / emission_matrix.sum(axis=1, keepdims=True)


In [88]:
def get_transition_para(taglabel, taglib):
    """Estimate row-normalised tag transition parameters.

    The matrix has one extra row and one extra column: the last row holds
    transitions out of START and the last column holds transitions into
    STOP.

    Parameters
    ----------
    taglabel : list of list of int
        Tag indices, one inner list per sentence.
    taglib : list
        Known tags; only its length is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(taglib) + 1, len(taglib) + 1)`` where entry
        ``[u, v]`` is count(u -> v) divided by the total count of row ``u``.
        Rows without any counts are returned as all zeros rather than NaN.
    """
    size = len(taglib) + 1  # +1 for START (last row) / STOP (last column)
    transition_matrix = np.zeros(shape=(size, size))

    for tagseq in taglabel:
        if len(tagseq) == 0:
            # Nothing to count for an empty sentence.
            continue
        # START -> first tag.
        transition_matrix[-1, tagseq[0]] += 1
        # Every adjacent pair, including the one that ends on the last tag.
        for prev_tag, next_tag in zip(tagseq, tagseq[1:]):
            transition_matrix[prev_tag, next_tag] += 1
        # Last tag -> STOP (also counted for one-tag sentences).
        transition_matrix[tagseq[-1], -1] += 1

    row_totals = transition_matrix.sum(axis=1, keepdims=True)
    # Only divide rows that have counts; this avoids 0/0 -> NaN and the
    # accompanying RuntimeWarning for tags that never start a transition.
    return np.divide(
        transition_matrix,
        row_totals,
        out=np.zeros_like(transition_matrix),
        where=row_totals > 0,
    )



In [89]:
def log(x, inf_replace=-100):
    """Element-wise natural log with non-finite results replaced.

    Parameters
    ----------
    x : numpy.ndarray
        Values to take the logarithm of.
    inf_replace : number, optional
        Value written wherever ``np.log`` yields ``inf``, ``-inf`` or NaN
        (default -100).

    Returns
    -------
    numpy.ndarray
        A new array holding the logarithms; ``x`` itself is not modified.
    """
    logged = np.log(x)
    non_finite = np.logical_not(np.isfinite(logged))
    logged[non_finite] = inf_replace
    return logged

In [90]:
def topk(observation, transition_matrix, emission_matrix, k, obs_dict):
    """Fill the score table used to decode the k-th ranked tag sequence.

    Scores are combined by addition, so the matrices are presumably
    log-probabilities (confirm against the caller).

    Parameters
    ----------
    observation : list
        The observations of one sentence.
    transition_matrix : numpy.ndarray
        Transition scores; the last row is read as START and the last
        column as STOP.
    emission_matrix : numpy.ndarray
        Emission scores, one row per tag; the last column is used for
        observations missing from ``obs_dict``.
    k : int
        Rank picked at every interior position (1 = largest).
    obs_dict : dict
        Maps a known observation to its emission column.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(number_of_tags, len(observation) + 1)``.
        Column 0 is START-transition plus emission of the first
        observation. For ``0 < t < len(observation)``, entry ``[j, t]`` is
        the k-th largest, over previous tags, of previous score +
        transition into ``j`` + emission of observation ``t`` under ``j``.
        The final column is the previous column plus the transition to
        STOP.
    """
    n_obs = len(observation)
    n_tags = emission_matrix.shape[0]
    scores = np.zeros(shape=(n_tags, n_obs + 1))

    # Emission column per observation; -1 selects the unknown-word column.
    columns = [obs_dict.get(word, -1) for word in observation]

    # First position: leave START and emit the first observation.
    scores[:, 0] = transition_matrix[-1, :-1] + emission_matrix[:, columns[0]]

    # Interior positions: candidates[prev, cur] for every tag pair at once,
    # then keep the k-th largest candidate for each current tag.
    for t in range(1, n_obs):
        emit = emission_matrix[:, columns[t]]
        candidates = (
            scores[:, t - 1][:, None] + transition_matrix[:-1, :n_tags]
        ) + emit[None, :]
        scores[:, t] = np.sort(candidates, axis=0)[-k, :]

    # Final column: move from the last observation into STOP.
    scores[:, n_obs] = scores[:, n_obs - 1] + transition_matrix[:-1, -1]

    return scores


In [91]:
import copy
def find_path(observation, pi, emission_matrix, transition_matrix, k, tag_lib, tag_dict):
    """Backtrack through a score table and return the chosen tags.

    Scores are combined by addition, matching how ``pi`` is accumulated
    when the inputs are log-scores (the caller passes log-transformed
    matrices). Multiplying two log-scores would turn the most negative
    candidates into the largest products and invert the ranking.

    Parameters
    ----------
    observation : list
        The observations of one sentence; only its length is used.
    pi : numpy.ndarray
        Score table with one row per tag and at least
        ``len(observation)`` columns.
    emission_matrix : numpy.ndarray
        Unused; kept for interface compatibility.
    transition_matrix : numpy.ndarray
        Transition scores; the last column is read as STOP.
    k : int
        Rank selected at each position (1 = highest score).
    tag_lib : object
        Unused; kept for interface compatibility. The index-to-tag lookup
        is derived from ``tag_dict`` so the function does not depend on
        any notebook-level variable.
    tag_dict : dict
        Maps each tag to its row index.

    Returns
    -------
    list
        Tags ordered from the LAST observation to the first; reverse the
        list to get sentence order. Empty for an empty ``observation``.
    """
    index_to_tag = {index: tag for tag, index in tag_dict.items()}
    path = []

    # Column of the transition target: STOP for the last observation, then
    # the tag chosen for the following position.
    next_column = -1
    for i in range(len(observation) - 1, -1, -1):
        layer = pi[:, i] + transition_matrix[:-1, next_column]
        # np.sort returns a copy, so ``layer`` keeps its tag order.
        kth_score = np.sort(layer)[-k]
        # On ties, take the first tag index holding that score.
        chosen = int(np.flatnonzero(layer == kth_score)[0])
        path.append(index_to_tag[chosen])
        next_column = chosen

    return path

## Get Parameters

In [92]:
# Read the training sentences.
tags, observations = load_data('./EN/train')

# Vocabularies of distinct tags and observations.
taglib = tag_lib(tags)
obslib = obs_lib(observations)

# Element -> index lookups (``map`` is the helper defined in this
# notebook, not the built-in).
tag_dict = map(taglib)
obs_dict = map(obslib)

# Training sequences re-expressed as index sequences.
tag_label = label(tags, tag_dict)
obs_label = label(observations, obs_dict)

# Row-normalised parameter matrices.
transition_matrix = get_transition_para(tag_label, taglib)
emission_matrix = get_emission_para(obs_label, tag_label, obslib, taglib)

# Log-transformed copies used for decoding.
emission_transform = log(emission_matrix)
transition_transform = log(transition_matrix)

## Generate tag sequence

In [93]:
# Read the sentences to tag and build one score table per sentence.
test_tags, test_observations = load_data('./EN/dev.in')
k = 3
value = []
pilst = []
for obs in test_observations:
    pi = topk(obs, transition_transform, emission_transform, k, obs_dict)
    pilst.append(pi)
    # np.sort returns a sorted copy. Sorting the slice pi[:, -1] in place
    # would reorder the last column of the table just stored in pilst.
    last = np.sort(pi[:, -1])
    value.append(last[-k])



In [94]:
k = 3
pathlst = []
for obs, pi in zip(test_observations, pilst):
    # Pass the tag list ``taglib`` (not the ``tag_lib`` function).
    path = find_path(obs, pi, emission_transform, transition_transform, k, taglib, tag_dict)
    # find_path returns tags from the last observation to the first.
    path.reverse()
    pathlst.append(path)

# One "observation tag" pair per line, with a blank line after each
# sentence. The ``with`` block closes the file, so no explicit close().
with open('./EN/dev.p4.out', 'w') as f:
    for sentence_obs, sentence_tags in zip(test_observations, pathlst):
        for o, p in zip(sentence_obs, sentence_tags):
            f.write(o + ' ' + p + '\n')
        f.write('\n')