In [83]:
import pandas as pd
from tqdm.auto import tqdm

In [84]:
def train_emission(filename):
    """
    Returns - a dictionary containing emission parameters
    """
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    
    # for each state y, keep track of each observation count i.e. count (y -> x)
    # before eg: {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4}}
    emission_dict = {}
    
    # update emission_dict for state with count(y -> x) = 0
    # after eg: {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4, obs2: 0}}
    observations = set()
    
    for line in lines:
        split_line = line.split()
        
        # process only valid lines
        if len(split_line) == 2:
            obs, state = split_line[0], split_line[1]
            
            observations.add(obs)
            
            if state not in emission_dict:
                emission_dict[state] = {}
                
            if obs not in emission_dict[state]:
                emission_dict[state][obs] = 1
            else:
                emission_dict[state][obs] += 1

    for k, v in emission_dict.items():
        for obs in observations:
            if obs not in v:
                emission_dict[k][obs] = 0
    
    return emission_dict

In [85]:
def get_emission_params_fixed(emission_dict, state, obs, k=0.5):
    
    if state not in emission_dict:
        return 0
    
    state_data = emission_dict[state]
    count_y = sum(state_data.values()) # count(y)
    
    if obs == "#UNK#":
        count_y_to_x = k
    else:
        count_y_to_x = state_data[obs] # count(y -> x)
    
    return count_y_to_x / (count_y + k)

In [86]:
def train_transition(filename):
    """
    Returns - a dictionary containing transition parameters
    """
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    
    # for each state u, keep track of each state count i.e. count (u,v)
    # before eg: {START: {y1: 1, y2: 5}, y1: {y1: 3, y2: 4, STOP: 1}, y2: {y1: 1, STOP: 3}}
    transition_dict = {}
    
    # after eg: {START: {y1: 1, y2: 5, STOP: 0}, y1: {y1: 3, y2: 4, STOP: 1}, y2: {y1: 1, y2: 0, STOP: 3}}
    states = set()
    states.add('STOP')
    states.add('PREVSTART')
    states.add('START')
    
    prev_prev_state = 'PREVSTART'
    prev_state = 'START'
        
    for line in lines:
        split_line = line.split()
                
        # Start new sequence
        if len(split_line) < 2:
            if (prev_prev_state, prev_state, 'STOP') not in transition_dict.keys():
                transition_dict[(prev_prev_state, prev_state, 'STOP')] = 1
            else:
                transition_dict[(prev_prev_state, prev_state, 'STOP')] += 1
            prev_prev_state = 'PREVSTART'
            prev_state = 'START'

        # processing the sentence
        elif len(split_line) == 2:
            curr_state = split_line[1]
            states.add(curr_state)
            
            if (prev_prev_state, prev_state, curr_state) not in transition_dict.keys():
                transition_dict[(prev_prev_state, prev_state, curr_state)] = 1
            
            else:
                transition_dict[(prev_prev_state, prev_state, curr_state)] += 1
            
            prev_prev_state = prev_state
            prev_state = curr_state
            
    
    # Store all transition counts from state t to u
    transition_count = {}
    
    for (t, u ,v), count in transition_dict.items():
        if (t, u) not in transition_count.keys():
            transition_count[(t, u)] = count
        else:
            transition_count[(t, u)] += count
    
    return transition_dict, transition_count, states

In [87]:
def get_transition_params(transition_dict, transition_count, t, u, v):
    
    if (t, u, v) not in transition_dict.keys():
        return 0
    
    count_t_u_v = transition_dict[(t, u ,v)]
    count_t_u = transition_count[(t, u)]
            
    return count_t_u_v / count_t_u

In [88]:
transition_dict, transition_count, states = train_transition('../dataset/EN/train')
print(get_transition_params(transition_dict, transition_count, 'PREVSTART', 'START', 'B-NP'))
print(transition_count)

0.6480490669450607
{('PREVSTART', 'START'): 7663, ('START', 'B-NP'): 4966, ('B-NP', 'I-NP'): 32390, ('I-NP', 'B-VP'): 7365, ('B-VP', 'B-ADVP'): 570, ('B-ADVP', 'B-ADJP'): 59, ('B-ADJP', 'I-ADJP'): 490, ('I-ADJP', 'I-ADJP'): 84, ('I-ADJP', 'B-PP'): 164, ('B-PP', 'B-NP'): 17064, ('I-NP', 'B-PP'): 8544, ('I-NP', 'O'): 12410, ('O', 'O'): 2710, ('O', 'B-ADJP'): 209, ('B-NP', 'B-VP'): 6164, ('B-VP', 'B-PP'): 1803, ('I-NP', 'I-NP'): 22201, ('B-VP', 'B-SBAR'): 467, ('B-SBAR', 'B-NP'): 1657, ('B-VP', 'B-NP'): 6304, ('B-VP', 'O'): 1231, ('O', 'B-NP'): 8288, ('I-NP', 'B-NP'): 2601, ('B-ADVP', 'B-PP'): 608, ('B-VP', 'I-VP'): 6828, ('I-VP', 'B-NP'): 3610, ('O', 'B-VP'): 2746, ('I-VP', 'I-VP'): 3331, ('B-NP', 'O'): 3830, ('B-NP', 'B-PP'): 2744, ('I-VP', 'B-PP'): 1507, ('B-PP', 'B-PP'): 340, ('B-NP', 'B-NP'): 1367, ('I-NP', 'B-SBAR'): 348, ('B-SBAR', 'B-VP'): 73, ('B-NP', 'B-ADVP'): 464, ('B-ADVP', 'B-NP'): 750, ('B-ADVP', 'I-ADVP'): 310, ('I-ADVP', 'B-NP'): 68, ('START', 'B-PP'): 833, ('I-VP', 'B-SB

In [89]:
def obtain_all_obs(emission_dict):
    """
    Obtain all distinct observations words in the emission_dict.
    Purpose: This helps us identify words in Test Set that do not exist in the Training Set (or the emission_dict)
    Returns - Set of Strings.
    """
    all_observations = set()
    
    for s_to_obs_dict in emission_dict.values():
        for obs in s_to_obs_dict.keys():
            all_observations.add(obs)
            
    return all_observations

def preprocess_sentence(sentence, training_set_words):
    """
    sentence - a list of Strings (words or observations)
    Returns - a list of Strings, where Strings not in training_set_words are replaced by "#UNK#"
    """
    return [ word if word in training_set_words else "#UNK#" for word in sentence ]

In [126]:
def second_order_vertibi(emission_dict, transition_dict, transition_count, states, sentence, is_preprocessed):
    # Helper functions
    a = lambda t, u, v: get_transition_params(transition_dict, transition_count, t, u, v)
    b = lambda state, obs: get_emission_params_fixed(emission_dict, state, obs, k=0.5)
    
    # Add all possible state combinations
    all_states = set()
    for u in states:
        for v in states:
            all_states.add((u, v))
    
    proc_sent = sentence
    if not is_preprocessed:
        training_set_words = obtain_all_obs(emission_dict)
        proc_sent = preprocess_sentence(sentence, training_set_words)
    
    n = len(proc_sent) + 2
    
    # Pi Table
#     P = np.zeros((len(all_states), n))
    P = pd.DataFrame(index=all_states, columns=range(n)).fillna(0)
    # Backtrace Table
#     B = np.zeros((len(all_states), n))
    B = pd.DataFrame(index=all_states, columns=range(n))
    
    # Initialise the starting states
    P[0][('PREVSTART', 'START')] = 1
#     P.loc[('PREVSTART', 'START'), 0] = 1
    
    # Recursive forward step
    for j in range(1, n - 1):
        x = proc_sent[j - 1]
        
        for v in states:  # Current state
            for u in states:  # Previous state
                for t in states: # Previous previous state
                    p = P.loc[(t, u), j - 1] * a(t, u, v) * b(v, x)
                    if p > P.loc[(u, v), j]:
                        P.loc[(u, v), j] = p  # Update probability
                        B.loc[(u, v), j] = t  # Update backpointer 

    
    # Termination
    j = n - 1
    v = 'STOP'
    for u in states: # Previous state
        for t in states: # Previous previous stat
            p = P.loc[(t, u), j - 1] * a(t, u, v)
            if p > P.loc[(u, v), j]:
                P.loc[(u, v), j] = p  # Update probability
                B.loc[(u, v), j] = t  # Update backpointer

    
    
    
    

    # Backtrace
    state_combi = P[n-1].idxmax()

    state_seq = []
    for i in range(n-1, 0, -1):
        prev_state = B.loc[state_combi, i]
        if isinstance(prev_state, str): # Check if previous state is of type str
            state_seq.append(state_combi[1])
            state_combi = (prev_state, state_combi[0])
        else: # No possible transition to START
            state_seq = ['O'] * n
            break
    state_seq = state_seq[::-1][:-1]  # reverse and drop STOP
    
    return P, B, state_seq

In [None]:
train_file = '../dataset/EN/train'
emission_dict = train_emission(train_file)
transition_dict, transition_count, states = train_transition(train_file)
sentence = "He added that the stress-related compensation claims is about twice the average for all injury claims ."
sentence = sentence.split(' ')

_, _, seq = second_order_vertibi(emission_dict, transition_dict, transition_count, states, sentence, is_preprocessed=False)
print(seq)

In [108]:
sets = ['EN', 'SG', 'CN']

for dataset in tqdm(sets):
    
    print(f"Evaluating on {dataset}.")
    
    in_file = f"../dataset/{dataset}/dev.in"
    train_file = f"../dataset/{dataset}/train"
    out_file = f"../dataset/{dataset}/dev.p5.out"
    
    # Train
    emission_dict = train_emission(train_file)
    transition_dict, transition_count, states = train_transition(train_file)
    
    # Obtain all distinct words in Training Set
    training_set_words = obtain_all_obs(emission_dict)
    
    # Create file handler to write to /dev.p5.out
    outf_h = open(out_file, "w", encoding="utf8")
    
    # Read in file
    with open(in_file, encoding="utf8") as f:
        lines = f.readlines()
        
    sent = [] # initialise array to store 1 sentence at a time.
    for word in tqdm(lines):
        
        if word != "\n":
            sent.append(word.strip())
            
        # We reached end of sentence - time to predict sentence's sequence of states (aka tags)
        else:
            # preprocess sentence (change unknown words to "#UNK#")
            sent_proc = preprocess_sentence(sent, training_set_words)
            # obtain processed sentence's predicted state seq (list of corresponding predicted states for each word in sent)
            _, _, sent_state_sequence = second_order_vertibi(emission_dict, transition_dict, transition_count, states, sent_proc, is_preprocessed=True)
            
            for word, state in zip(sent, sent_state_sequence):
                outf_h.write(word + ' ' + state)
                outf_h.write("\n") # newline for each word
            outf_h.write("\n") # another newline when end of sentence

            # Reset sentence list
            sent = []
            
    outf_h.close()  

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Evaluating on EN.


HBox(children=(FloatProgress(value=0.0, max=27225.0), HTML(value='')))





TypeError: '>' not supported between instances of 'float' and 'str'