In [1]:
from tqdm.auto import tqdm
import numpy as np

# Emissions
Copied from part 2

In [2]:
def train_emission(filename):
    """
    Returns - a dictionary containing emission parameters
    """
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    
    # for each state y, keep track of each observation count i.e. count (y -> x)
    # before eg: {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4}}
    emission_dict = {}
    
    # update emission_dict for state with count(y -> x) = 0
    # after eg: {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4, obs2: 0}}
    observations = set()
    
    for line in lines:
        split_line = line.split()
        
        # process only valid lines
        if len(split_line) == 2:
            obs, state = split_line[0], split_line[1]
            observations.add(obs)
            
            if state not in emission_dict:
                emission_dict[state] = {}
                
            if obs not in emission_dict[state]:
                emission_dict[state][obs] = 1
            else:
                emission_dict[state][obs] += 1

    for k, v in emission_dict.items():
        for obs in observations:
            if obs not in v:
                emission_dict[k][obs] = 0
    
    return emission_dict

In [3]:
def get_emission_params_fixed(emission_dict, state, obs, k=0.5):
    
    if state not in emission_dict:
        return 0
    
    state_data = emission_dict[state]
    count_y = sum(state_data.values()) # count(y)
    
    if obs == "#UNK#":
        count_y_to_x = k
    else:
        count_y_to_x = state_data[obs] # count(y -> x)
    
    return count_y_to_x / (count_y + k)

# Transitions
Function is changed from previous parts to count for transitions from t -> u -> v

In [4]:
def train_transition(filename):
    """
    Returns - 2 dictionaries containing transition parameters and a set of all unique states
    
    """
    with open(filename, encoding="utf8") as f:
        lines = f.readlines()
    
    # Store all set of 3 state transitions
    # eg: {(PREVSTART, START, state1): 2, (PREVSTART, START, state2): 3, ...}
    transition_dict = {}

    states = set()
    states.add('STOP')
    states.add('PREVSTART')
    states.add('START')
    
    prev_prev_state = 'PREVSTART'
    prev_state = 'START'
        
    for line in lines:
        split_line = line.split()
                
        # Start new sequence
        if len(split_line) < 2:
            if (prev_prev_state, prev_state, 'STOP') not in transition_dict.keys():
                transition_dict[(prev_prev_state, prev_state, 'STOP')] = 1
            else:
                transition_dict[(prev_prev_state, prev_state, 'STOP')] += 1
            prev_prev_state = 'PREVSTART'
            prev_state = 'START'

        # Processing the current sequence
        elif len(split_line) == 2:
            curr_state = split_line[1]
            states.add(curr_state)
            
            if (prev_prev_state, prev_state, curr_state) not in transition_dict.keys():
                transition_dict[(prev_prev_state, prev_state, curr_state)] = 1
            
            else:
                transition_dict[(prev_prev_state, prev_state, curr_state)] += 1
            
            prev_prev_state = prev_state
            prev_state = curr_state
            
    
    # Store all transition counts from state t to u
    transition_count = {}
    
    for (t, u ,v), count in transition_dict.items():
        if (t, u) not in transition_count.keys():
            transition_count[(t, u)] = count
        else:
            transition_count[(t, u)] += count
    
    return transition_dict, transition_count, states

In [5]:
def get_transition_params(transition_dict, transition_count, t, u, v):
    
    if (t, u, v) not in transition_dict.keys():
        return 0
    
    count_t_u_v = transition_dict[(t, u ,v)]
    count_t_u = transition_count[(t, u)]
            
    return count_t_u_v / count_t_u

## Helper functions

In [6]:
def obtain_all_obs(emission_dict):
    """
    Obtain all distinct observations words in the emission_dict.
    Purpose: This helps us identify words in Test Set that do not exist in the Training Set (or the emission_dict)
    Returns - Set of Strings.
    """
    all_observations = set()
    
    for s_to_obs_dict in emission_dict.values():
        for obs in s_to_obs_dict.keys():
            all_observations.add(obs)
            
    return all_observations

def preprocess_sentence(sentence, training_set_words):
    """
    sentence - a list of Strings (words or observations)
    Returns - a list of Strings, where Strings not in training_set_words are replaced by "#UNK#"
    """
    return [ word if word in training_set_words else "#UNK#" for word in sentence ]

def log(m):
    if isinstance(m, float) or isinstance(m, int):
        return -np.inf if m == 0 else np.log(m)
    
    m = np.clip(m, 1e-32, None)
    x = np.log(m)
    
    x[x <= np.log(1e-32)] = -np.inf
    
    return x

# Training

In [36]:
def second_order_vertibi(emission_dict, transition_dict, transition_count, states, sentence, is_preprocessed):
    # Helper functions
    a = lambda t, u, v: get_transition_params(transition_dict, transition_count, t, u, v)
    b = lambda state, obs: get_emission_params_fixed(emission_dict, state, obs, k=0.5)
    
    # Add all possible state combinations
    all_states = set()
    for u in states:
        for v in states:
            all_states.add((u, v))
            
    all_states = list(all_states)
    
    proc_sent = sentence
    if not is_preprocessed:
        training_set_words = obtain_all_obs(emission_dict)
        proc_sent = preprocess_sentence(sentence, training_set_words)
    proc_sent = ["start"] + proc_sent + ["stop"]
    
    n = len(proc_sent)
    
    # Pi Table
#     P = [{state: -np.inf for state in all_states} for x in range(n)]
    P = np.ones((n, len(all_states))) * -np.inf
    
    # Backtrace Table
#     B = [{state: None for state in all_states} for x in range(n)]
    B = [[ None for x in all_states ] for y in range(n)]
    
    # Base case is log(1) = 0
#     P[0][('PREVSTART', 'START')] = 0
    
    # Base case at j = 1
    trans = np.array([a('PREVSTART', 'START', v) for u, v in all_states])
    emis = np.array([b(v, proc_sent[1]) for u, v in all_states])
    P[1, :] = log(trans) + log(emis)
    B[1] = ['PREVSTART' for row in B[1]] # t is PREVSTART at j = 1
    
    # Base case at j = 2
    trans = np.array([a('START', u, v) for u, v in all_states])
    emis = np.array([b(v, proc_sent[1]) for u, v in all_states])
    P[2, :] = log(trans) + log(emis)
    B[2] = ['START' for row in B[2]]
    
    # Recursive forward step
    for j in range(3, n - 1):
        x = proc_sent[j]
        for t, _ in all_states:
            for row_no, (u , v) in enumerate(all_states):
                transitions = np.array([a(t,u,v) for t, _ in all_states])
                prev_scores = P[j-1, :] + log(transitions) + log(b(v,x))
                top = prev_scores.argmax()
                P[j,row_no] = prev_scores[top]
                B[j][row_no] = all_states[top][0] # store t
                if P[j,row_no] == -np.inf:
                    B[j][row_no] = None
#         for v in states:  # Current state
#             for t, u in all_states: # All state combis, t is the grandfather and u is the parent
#                 p = P[j - 1][(t,u)] + log(a(t, u, v)) + log(b(v, x))
#                 if p > P[j][(u, v)]:
#                     P[j][(u, v)] = p # Update probability [{(u,v): p,... }, ...]
#                     B[j][(u, v)] = t # Update backpointer [{(u,v): t, ...}, ...]
    
    # Termination
#     j = n - 1
#     v = 'STOP'
#     for t, u in list(all_states):
#         p = P[j - 1][(t, u)] + log(a(t, u, v))
#         if p > P[j][(u, v)]:
#             P[j][(u, v)] = p # Update probability
#             B[j][(u, v)] = t # Update backpointer
    
    # Termination: j=n-1. Note that proc_sent[n-1] give us the last word in sentence.
    j = n - 1
    transitions = np.array([ a(t, u, "STOP") for t,u in all_states ])
    previous_scores = P[j-1] + log(transitions)
    u, v = all_states[previous_scores.argmax()]

    assert(v == 'STOP')
    return

    # Backtrace
    
    state_seq = []
    for j in range(n-1, 0 ,-1):
        uv_row_no = all_states.index((u, v))
        t = B[j][uv_row_no]
        
        v = state_seq[-1]
        u = t
        
        t = B[j][uv_row_no]
        
        if t == None:
            return ['O'] * (n-2)
            
        state_seq.append(u)
        
    state_seq = state_seq[::-1][1:-1]  # reverse and drop STOP
    return state_seq
        
    # Backtrace
#     u, v = max(P[n - 1], key=P[n - 1].get) # argmax (u, v)
#     state_seq = []
#     for i in range(n - 1, 0, -1):
#         t = B[i][(u, v)]
#         if t != None:       # Check if t is not None
#             state_seq.append(v) # v
#             u, v = t, u     # set t, u as the next u, v
#         else:               # No possible transition to START
#             state_seq = ['O'] * n-2
#             break
#     state_seq = state_seq[::-1][:-1]  # reverse and drop STOP
    
    
    
    
    return P, B, state_seq

# Testing function

In [37]:
train_file = '../dataset/EN/train'
emission_dict = train_emission(train_file)
transition_dict, transition_count, states = train_transition(train_file)
sentence = 'He added that the stress-related compensation claims is about twice the average for all injury claims .'

sentence = sentence.split(' ')

seq = second_order_vertibi(emission_dict, transition_dict, transition_count, states, sentence, is_preprocessed=False)
seq

AssertionError: 

In [24]:
len(sentence)

29

# Evaluation on dev.in

In [None]:
sets = ['EN', 'SG', 'CN']

for dataset in tqdm(sets):
    
    print(f"Evaluating on {dataset}.")
    
    in_file = f"../dataset/{dataset}/dev.in"
    train_file = f"../dataset/{dataset}/train"
    out_file = f"../dataset/{dataset}/dev.p5.out"
    
    # Train
    emission_dict = train_emission(train_file)
    transition_dict, transition_count, states = train_transition(train_file)
    
    # Obtain all distinct words in Training Set
    training_set_words = obtain_all_obs(emission_dict)
    
    # Create file handler to write to /dev.p5.out
    outf_h = open(out_file, "w", encoding="utf8")
    
    # Read in file
    with open(in_file, encoding="utf8") as f:
        lines = f.readlines()
        
    sent = [] # initialise array to store 1 sentence at a time.
    for word in tqdm(lines):
        
        if word != "\n":
            sent.append(word.strip())
            
        # We reached end of sentence - time to predict sentence's sequence of states (aka tags)
        else:
            # preprocess sentence (change unknown words to "#UNK#")
            sent_proc = preprocess_sentence(sent, training_set_words)
            # obtain processed sentence's predicted state seq (list of corresponding predicted states for each word in sent)
            _, _, sent_state_sequence = second_order_vertibi(emission_dict, transition_dict, transition_count, states, sent_proc, is_preprocessed=True)
            
            for word, state in zip(sent, sent_state_sequence):
                outf_h.write(word + ' ' + state)
                outf_h.write("\n") # newline for each word
            outf_h.write("\n") # another newline when end of sentence

            # Reset sentence list
            sent = []
            
    outf_h.close()  

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Evaluating on EN.


HBox(children=(FloatProgress(value=0.0, max=27225.0), HTML(value='')))

# Running EvalScript

In [None]:
%cd ../EvalScript

In [None]:
datasets = ['EN', 'SG', 'CN']

for st in datasets:
    gold = f"../dataset/{st}/dev.out"
    pred = f"../dataset/{st}/dev.p5.out"
    print(st)
    !python evalResult.py $gold $pred
    print("=" * 20, end="\n\n")