## Estimate emission parameters from the training set using MLE

In [1]:
def train_emission(filename):
    """Count how often each state emits each observation in a training file.

    Every line is split on whitespace. Only lines that yield exactly two
    tokens are used: the first token is the observation and the second is
    the state. All other lines (blank lines, malformed lines) are skipped.

    Returns a dict mapping state -> {observation: count}. Every state holds
    an entry for every observation seen anywhere in the file; pairs that
    never occurred are stored with a count of 0, e.g.
    {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4, obs2: 0}}.
    """
    emission_dict = {}
    vocabulary = set()  # every observation seen, regardless of state

    with open(filename) as f:
        for raw_line in f:
            tokens = raw_line.split()
            if len(tokens) != 2:
                # not an "observation state" pair
                continue

            word, tag = tokens
            vocabulary.add(word)

            tag_counts = emission_dict.setdefault(tag, {})
            tag_counts[word] = tag_counts.get(word, 0) + 1

    # give every state an explicit zero for observations it never emitted
    for tag_counts in emission_dict.values():
        for word in vocabulary:
            tag_counts.setdefault(word, 0)

    return emission_dict

In [2]:
def get_emission_params(emission_dict, state, obs):
    """Return the MLE emission estimate count(state -> obs) / count(state).

    emission_dict maps state -> {observation: count}. count(state) is the
    sum of all observation counts stored for that state.

    Instead of a number, a message string is returned when the state is not
    a key of emission_dict, or when obs has no entry under that state.
    """
    if state in emission_dict:
        counts_for_state = emission_dict[state]

        if obs in counts_for_state:
            # count(y -> x) / count(y)
            return counts_for_state[obs] / sum(counts_for_state.values())

        return "Word did not appear in training data"

    return "State not in emission dict"

In [3]:
# Build the state -> {observation: count} table from the training file.
emission_dict = train_emission('../dataset/EN/train')

# Bare expression so the notebook cell displays the resulting dict.
emission_dict

{'B-NP': {'Municipal': 1,
  'corporate': 28,
  'a': 3589,
  'bonds': 21,
  'the': 7756,
  'He': 153,
  'stress-related': 5,
  'about': 203,
  'all': 152,
  'The': 1255,
  'Saturday': 6,
  'We': 110,
  'no': 102,
  'that': 676,
  'us': 35,
  'Walter': 4,
  'SNET': 2,
  'Innovative': 1,
  'our': 50,
  'revenue': 29,
  'three': 52,
  'whom': 10,
  'solo': 1,
  'they': 394,
  'their': 359,
  'hold': 5,
  'another': 63,
  'heightened': 1,
  'two': 111,
  'Stocks': 6,
  'good': 17,
  'data': 4,
  'less': 25,
  'Those': 14,
  'trading': 26,
  'yesterday': 123,
  '1.2': 2,
  'U.S.': 89,
  'this': 313,
  'one': 180,
  'Bernard': 4,
  'PaineWebber': 5,
  'New': 133,
  "'s": 1374,
  'Monday': 46,
  'last': 140,
  'which': 386,
  'precious': 7,
  'gold': 32,
  'particular': 7,
  'You': 35,
  'she': 51,
  'stereotypical': 1,
  'armadillos': 1,
  'I': 255,
  'my': 69,
  'Shattuck': 2,
  'it': 833,
  'Refcorp': 2,
  'August': 33,
  'ailing': 1,
  'FARMERS': 1,
  'abundant': 1,
  'This': 77,
  'an': 5

In [4]:
# Emission estimate for the observation 'corporate' under state 'B-NP'.
get_emission_params(emission_dict, 'B-NP', 'corporate')

0.0005919036042701618

## Modify the computation of emission probabilities
This is to account for words that appear in the test set but do not appear in the training set. Before running this function, such words should have been replaced by the `#UNK#` token during the testing phase.

In [5]:
def get_emission_params_fixed(emission_dict, state, obs, k=0.5):
    """Return the emission estimate for obs given state, with unknown-word smoothing.

    emission_dict maps state -> {observation: count}. With count(y) the sum
    of all counts stored for the state:

    * obs == "#UNK#"  ->  k / (count(y) + k)
    * otherwise       ->  count(y -> obs) / (count(y) + k)

    k is the pseudo-count reserved for the "#UNK#" token.

    Instead of a number, a message string is returned when the state is not
    a key of emission_dict, or when obs is neither "#UNK#" nor recorded
    under that state (such words are expected to have been replaced by
    "#UNK#" before calling this function).
    """
    if state not in emission_dict:
        return "State not in emission dict"

    state_data = emission_dict[state]
    count_y = sum(state_data.values())  # count(y)

    if obs == "#UNK#":
        count_y_to_x = k
    elif obs in state_data:
        count_y_to_x = state_data[obs]  # count(y -> x)
    else:
        # Report the unreplaced unseen word the same way
        # get_emission_params does, rather than failing with a bare KeyError.
        return "Word did not appear in training data"

    return count_y_to_x / (count_y + k)