## Training to get emission parameters

In [12]:
import pandas as pd

# inputFile is the path of the data
def getEmissionParameters(inputFile,k):
    """Estimate emission parameters from a tagged training file.

    Every non-empty line of ``inputFile`` must hold a word and its state
    separated by a space. Empty lines are kept as rows with an empty word
    and an empty state, so row indices keep matching the file's line order.

    Args:
        inputFile: Path of the UTF-8 encoded training file.
        k: Count given to the ``#UNK#`` token of every state; it is also
            added to each state's total count in the denominator.

    Returns:
        A tuple ``(b_uo, wordsDict)``: ``b_uo`` maps ``"word state"`` keys to
        emission values, ``wordsDict`` maps each state to the list of words
        that were observed with it (one entry per occurrence).

    Raises:
        ValueError: If a non-empty line contains no space separator.
    """
    # The context manager closes the file even if reading fails.
    with open(inputFile, "r", encoding="utf8") as f:
        lines = f.read().split('\n')

    trainingDataList = []
    for line_number, line in enumerate(lines, start=1):
        if not line:
            trainingDataList.append(['', ''])
            continue
        # Split on the LAST space only: the state is the final token, so a
        # word that itself contains spaces still produces exactly two columns.
        fields = line.rsplit(' ', 1)
        if len(fields) != 2:
            raise ValueError(
                f"{inputFile}: line {line_number} is not 'word state': {line!r}"
            )
        trainingDataList.append(fields)

    df = pd.DataFrame(trainingDataList)
    df.columns = ['word', 'state']
    states, dfArray = stateCounts(df)
    wordsDict = getWords(states, dfArray)
    numerator = wordCounts(wordsDict, k)
    b_uo = emissionParameters(numerator, states, k)
    return b_uo, wordsDict

In [13]:
# Count how often each state occurs and record the row indices where it appears.
# The per-word counts of each state are derived later from these row indices.
def stateCounts(df):
    """Count the occurrences of every state and remember where they occur.

    Args:
        df: DataFrame whose second column holds the state of each row; rows
            with an empty-string state are skipped.

    Returns:
        A tuple ``(states, trainingDataArray)``. ``states`` maps each state to
        ``[count, rows]`` where ``rows`` is a space-separated string of the
        row indices the state was found in. ``trainingDataArray`` is the
        result of ``df.to_numpy()``.
    """
    trainingDataArray = df.to_numpy()

    states = {}
    for row_index, record in enumerate(trainingDataArray):
        tag = record[1]
        if tag == '':
            continue

        entry = states.get(tag)
        if entry is None:
            # First sighting of this state.
            states[tag] = [1, str(row_index)]
        else:
            seen, rows = entry
            states[tag] = [seen + 1, rows + " " + str(row_index)]

    return states, trainingDataArray


In [14]:
# get all the words in the state
def getWords(states,trainingDataArray):
    """Collect, for every state, the words found at its recorded rows.

    Args:
        states: Mapping of state to ``[count, rows]`` where ``rows`` is a
            space-separated string of row indices.
        trainingDataArray: Indexable rows whose first element is the word.

    Returns:
        A dict mapping each state to the list of words at its rows, in row
        order (repeated words appear once per occurrence).
    """
    return {
        state: [
            trainingDataArray[int(position)][0]
            for position in info[1].split(' ')
        ]
        for state, info in states.items()
    }


In [15]:
# get count(state --> word), i.e. how many times each state is seen with each word
# all_the_words maps each state to the list of words observed with that state
def wordCounts(all_the_words, k):
    """Count how often every word occurs with every state.

    Args:
        all_the_words: Mapping of state to the list of words seen with it.
        k: Value stored for the ``#UNK#`` token of each state.

    Returns:
        A dict keyed by ``"word state"`` holding the occurrence count, plus
        one ``"#UNK# state"`` entry per state set to ``k``.
    """
    numerator = {}
    for state, words in all_the_words.items():
        for word in words:
            key = word + " " + state
            numerator[key] = numerator.get(key, 0) + 1
        # The unknown-word token is written after the state's real words.
        numerator['#UNK# ' + state] = k
    return numerator


In [16]:
# calculate emission parameter: word value/state value
def emissionParameters(numerator,states,k):
    """Turn word/state counts into emission parameters.

    Each value is ``count / (state_total + k)`` where ``state_total`` is the
    first element of ``states[state]``.

    Args:
        numerator: Mapping of ``"word state"`` keys to counts.
        states: Mapping of state to a sequence whose first element is the
            total count of that state.
        k: Smoothing count added to every state's total.

    Returns:
        A dict with the same keys (and order) as ``numerator`` holding the
        emission value of each key.

    Raises:
        KeyError: If the state parsed from a key is missing from ``states``.
    """
    b_uo = {}
    for header, numerator_count in numerator.items():
        # The state is the text after the LAST space, so a word part that
        # itself contains spaces cannot be mistaken for the state.
        cur_state = header.rsplit(' ', 1)[-1]
        # Look the total up per key instead of caching it across iterations;
        # the result then does not depend on keys being grouped by state.
        b_uo[header] = numerator_count / (states[cur_state][0] + k)
    return b_uo


In [17]:
# Alternative datasets (uncomment one and adjust the output path below):
# b_uo,wordsInTrainingData = getEmissionParameters('./EN(2)/train',0.5)
# b_uo,wordsInTrainingData = getEmissionParameters('./SG(1)/train',0.5)
b_uo,wordsInTrainingData = getEmissionParameters('./CN/train',0.5)

# Persist every training word, one per line, so the testing step can tell
# known words from unknown ones. The context manager guarantees the file is
# flushed and closed even if a write fails.
with open('./CN/wordsInTrain.txt','w', encoding="utf8") as f:  # change file path here
    for words_of_state in wordsInTrainingData.values():
        f.writelines(word + '\n' for word in words_of_state)
print("Training has finished.")

## Testing

In [18]:
import numpy as np
# inputFile is the path of the data
def testForEmissionParameters(inputFile,train_Words,b_uo):
    k=0.5
    result=[]
    f = open(inputFile,"r", encoding="utf8")
    f2 = open(train_Words,'r', encoding="utf8")
    data = f.read()
    train_words = f2.read()
    f.close()
    f2.close()

    data = data.split('\n')
    train_words = train_words.split('\n')
    w_data = wordChecker(data,train_words)
    # get estimated y value
    final_wordlist=[]
    for word in w_data:
        possible_y = {}
        if len(word)==0:
            result.append('')
        else:
            unknown_check = word.find("#UNK#")
            for key,value in b_uo.items():
                cur_word,state = key.split(' ')
                if word==cur_word:
                    possible_y[value]=state
                elif unknown_check!=-1:
                    possible_y[value]=state
            if len(possible_y)!=0:
                optimal_y = max(possible_y.keys())
                result.append(possible_y[optimal_y])
        if unknown_check!=-1:
            final_wordlist.append(word[:len(word)-5])
            
        else:
            final_wordlist.append(word)
            
    return final_wordlist,result


In [19]:
# if a word is not in the training set, mark it by appending the "#UNK#" suffix
def wordChecker(data,train_words):
    """Mark the words of ``data`` that do not occur in the training words.

    Args:
        data: Iterable of words; empty strings are passed through unchanged.
        train_words: Collection of the words seen during training.

    Returns:
        A list as long as ``data`` in which every non-empty word missing from
        ``train_words`` has ``"#UNK#"`` appended.
    """
    # Build the lookup set once: testing membership against the raw list
    # costs a full scan for every word of ``data``.
    known_words = set(train_words)

    word_data = []
    for word in data:
        if not word or word in known_words:
            word_data.append(word)
        else:
            word_data.append(word + "#UNK#")

    return word_data


In [20]:

# Alternative datasets (uncomment one and adjust the output path below):
# data,result=testForEmissionParameters('./EN(2)/dev.in','./EN(2)/wordsInTrain.txt',b_uo)
# data,result=testForEmissionParameters('./SG(1)/dev.in','./SG(1)/wordsInTrain.txt',b_uo)
data,result=testForEmissionParameters('./CN/dev.in','./CN/wordsInTrain.txt',b_uo)

# Every input line needs exactly one prediction; fail loudly rather than
# write a file whose words and states are shifted against each other.
if len(data) != len(result):
    raise ValueError(
        "got " + str(len(result)) + " predictions for " + str(len(data)) + " input lines"
    )

# The context manager closes the output file even if a write fails.
with open('./CN/dev.p2.out','w', encoding="utf8") as f:    # Change file path here
    for word, state in zip(data, result):
        # Empty input lines stay truly empty instead of becoming a line that
        # holds a single space.
        f.write((word + " " + state + '\n') if word else '\n')
print("dev.p2.out has been created.")