In [1]:
from tqdm.auto import tqdm

## Estimate emission parameters from the training set using MLE

In [2]:
def train_emission(filename):
    """
    Count how often each state emits each observation in a training file.

    filename - path to a UTF-8 text file. A line is used only if it splits on
               whitespace into exactly two tokens, read as "<observation> <state>";
               every other line (blank separators, malformed rows) is ignored.
    Returns - a dictionary {state: {observation: count}}. Every state maps every
              observation seen anywhere in the file, with a count of 0 for the
              (state, observation) pairs that never occurred together.
    """
    with open(filename, encoding="utf8") as f:
        raw_lines = f.readlines()

    emission_dict = {}   # state -> {observation -> count(state -> observation)}
    vocabulary = set()   # every distinct observation seen in the file

    for raw_line in raw_lines:
        tokens = raw_line.split()

        # skip anything that is not an "<observation> <state>" pair
        if len(tokens) != 2:
            continue

        obs, state = tokens
        vocabulary.add(obs)

        per_state = emission_dict.setdefault(state, {})
        per_state[obs] = per_state.get(obs, 0) + 1

    # zero-fill so that each state has an explicit count for every observation
    # eg: {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4}}
    #  -> {state1: {obs1: 1, obs2: 5}, state2: {obs1: 4, obs2: 0}}
    for per_state in emission_dict.values():
        for obs in vocabulary:
            per_state.setdefault(obs, 0)

    return emission_dict

In [3]:
def get_emission_params(emission_dict, state, obs):
    """
    Unsmoothed MLE emission probability: count(state -> obs) / count(state).

    emission_dict - dictionary {state: {observation: count}}
    state - the state (tag) y
    obs - the observation (word) x
    Returns - float, count(y -> x) divided by the sum of all counts stored for y
    Raises - Exception if state is not a key of emission_dict, or if obs is not
             a key of that state's count dictionary
    """
    if state in emission_dict:
        obs_counts = emission_dict[state]
    else:
        raise Exception("State not in emission dict")

    if obs in obs_counts:
        numerator = obs_counts[obs]  # count(y -> x)
    else:
        raise Exception("Word did not appear in training data")

    denominator = sum(obs_counts.values())  # count(y)

    return numerator / denominator

### Testing with example 

In [4]:
# Fit the emission counts on the English training split.
emission_dict = train_emission('../dataset/EN/train')

# Spot check: the cell displays the MLE estimate for 'corporate' under state 'I-NP'.
get_emission_params(emission_dict, state='I-NP', obs='corporate')

0.0003846787932076716

## Modify the computation of emission probabilities
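Words that appear in the test set but never in the training set have no emission counts. To handle them, every such word must be replaced by the special `#UNK#` token before this function is called (see `preprocess_sentence` below), and the estimate is smoothed with a constant $k$ (default 0.5): for a word $x$ seen in training, $e(x \mid y) = \frac{\text{Count}(y \to x)}{\text{Count}(y) + k}$, and for the unknown token, $e(\text{\#UNK\#} \mid y) = \frac{k}{\text{Count}(y) + k}$.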

In [5]:
def get_emission_params_fixed(emission_dict, state, obs, k=0.5):
    """
    Smoothed emission probability that reserves mass k for the "#UNK#" token.

    emission_dict - dictionary {state: {observation: count}}
    state - the state (tag) y
    obs - the observation (word) x, or the literal "#UNK#" for an unknown word
    k - pseudo-count given to "#UNK#" (also added to the denominator)
    Returns - float: k / (count(y) + k) when obs is "#UNK#",
              otherwise count(y -> x) / (count(y) + k)
    Raises - Exception if state is not a key of emission_dict;
             KeyError if obs is neither "#UNK#" nor a key of that state's counts
    """
    if state not in emission_dict:
        raise Exception("State not in emission dict")

    obs_counts = emission_dict[state]

    # count(y -> x), with the unknown token treated as if it had been seen k times
    numerator = k if obs == "#UNK#" else obs_counts[obs]
    # count(y) + k
    denominator = sum(obs_counts.values()) + k

    return numerator / denominator

## Implement a simple system that produces the tag for each word `x` in the sequence

In [6]:
def obtain_all_obs(emission_dict):
    """
    Collect every distinct observation (word) stored in the emission_dict.
    Purpose: the result is the known vocabulary, used to spot Test Set words
    that never occurred in the Training Set.
    Returns - Set of Strings.
    """
    return {
        obs
        for obs_counts in emission_dict.values()
        for obs in obs_counts
    }

def preprocess_sentence(sentence, training_set_words):
    """
    sentence - a list of Strings (words or observations)
    training_set_words - collection of known words, queried with `in`
    Returns - a new list of the same length in which every String that is not
              in training_set_words has been swapped for "#UNK#"
    """
    processed = []
    for token in sentence:
        if token in training_set_words:
            processed.append(token)
        else:
            processed.append("#UNK#")
    return processed

In [7]:
def label_sequence(sentence, emission_dict):
    """
    Tag each word independently with the state that maximises its smoothed
    emission probability (get_emission_params_fixed with its default k).

    sentence - a list of Strings (words or observations); unknown words are
               expected to have been replaced by "#UNK#" already.
    emission_dict - a dictionary containing emission parameters
    Returns - list of Strings, one predicted state per word, in sentence order
    """
    states = list(emission_dict)  # all distinct states, in dictionary order

    tags = []
    for word in sentence:
        # score every candidate state for this word
        scores = {}
        for state in states:
            scores[state] = get_emission_params_fixed(emission_dict, state, word)

        # on ties max() keeps the first state encountered
        tags.append(max(scores, key=scores.get))

    return tags

### Testing with example

In [8]:
# Known vocabulary of the currently loaded emission_dict.
training_set_words = obtain_all_obs(emission_dict)

sentence = "The quick brown fox jumps over the lazy dog ."
# sentence = "The incident occurred Saturday night ."

# Tokenise on single spaces, then mask out-of-vocabulary words with "#UNK#".
sentence = preprocess_sentence(sentence.split(' '), training_set_words)

print(sentence)
print(label_sequence(sentence, emission_dict))

['The', 'quick', '#UNK#', 'fox', 'jumps', 'over', 'the', '#UNK#', 'dog', '.']
['B-NP', 'B-NP', 'B-UCP', 'B-NP', 'I-NP', 'B-PRT', 'B-NP', 'B-UCP', 'I-NP', 'O']


## Evaluate on dev.in

In [9]:
sets = ['EN', 'SG', 'CN']

for dataset in tqdm(sets):

    print(f"Evaluating on {dataset}.")

    in_file = f"../dataset/{dataset}/dev.in"
    train_file = f"../dataset/{dataset}/train"
    out_file = f"../dataset/{dataset}/dev.p2.out"

    # Train
    emission_dict = train_emission(train_file)
    # Obtain all distinct words in Training Set
    training_set_words = obtain_all_obs(emission_dict)

    # Read the input first, so that a missing dev.in does not leave a freshly
    # truncated dev.p2.out behind.
    with open(in_file, encoding="utf8") as f:
        lines = f.readlines()

    # A sentence is only written out when its terminating blank line is reached.
    # If the file does not end with one, add it so the last sentence is not dropped.
    if lines and lines[-1] != "\n":
        lines.append("\n")

    sent = []  # words of the sentence currently being read

    # The context manager closes the output file even if tagging raises part-way.
    with open(out_file, "w", encoding="utf8") as outf_h:
        for line in tqdm(lines):

            if line != "\n":
                sent.append(line.strip())
                continue

            # We reached end of sentence - time to predict sentence's sequence of states (aka tags)
            # preprocess sentence (change unknown words to "#UNK#")
            sent_proc = preprocess_sentence(sent, training_set_words)
            # one predicted state per word of the processed sentence
            sent_state_sequence = label_sequence(sent_proc, emission_dict)

            # Write the ORIGINAL words (not the "#UNK#" placeholders) next to their tags.
            for token, state in zip(sent, sent_state_sequence):
                outf_h.write(token + ' ' + state)
                outf_h.write("\n")  # newline for each word
            outf_h.write("\n")  # another newline when end of sentence

            # Reset sentence list
            sent = []

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Evaluating on EN.


HBox(children=(FloatProgress(value=0.0, max=27225.0), HTML(value='')))


Evaluating on SG.


HBox(children=(FloatProgress(value=0.0, max=36841.0), HTML(value='')))


Evaluating on CN.


HBox(children=(FloatProgress(value=0.0, max=13414.0), HTML(value='')))





### Testing with Example

In [10]:
# Retrain on SG explicitly. Without this, emission_dict is whatever the evaluation
# loop left behind (the last dataset it processed), and the SG input below would be
# tagged with another language's emission counts and vocabulary.
emission_dict = train_emission("../dataset/SG/train")
training_set_words = obtain_all_obs(emission_dict)

# Raw lines of the SG development input, consumed by the next cell.
with open("../dataset/SG/dev.in", encoding="utf8") as f:
    lines = f.readlines()

In [11]:
sent = []  # words of the sentence currently being read

# A sentence is only written out when its terminating blank line is reached.
# If the input does not end with one, add it (on a copy, leaving `lines` untouched)
# so that the last sentence is not dropped.
demo_lines = lines if (not lines or lines[-1] == "\n") else lines + ["\n"]

# The context manager closes demo.txt even if tagging raises part-way.
with open("demo.txt", "w", encoding="utf8") as outf:
    for line in tqdm(demo_lines):

        if line != "\n":
            sent.append(line.strip())
            continue

        # We reached end of sentence - time to predict sentence's sequence of states (aka tags)
        # preprocess sentence (change unknown words to "#UNK#")
        sent_proc = preprocess_sentence(sent, training_set_words)
        # one predicted state per word of the processed sentence
        sent_state_sequence = label_sequence(sent_proc, emission_dict)

        # Write the ORIGINAL words (not the "#UNK#" placeholders) next to their tags.
        for token, state in zip(sent, sent_state_sequence):
            outf.write(token + ' ' + state)
            outf.write("\n")  # newline for each word
        outf.write("\n")  # another newline when end of sentence

        # Reset sentence list
        sent = []

HBox(children=(FloatProgress(value=0.0, max=36841.0), HTML(value='')))




# Gold Evaluation

In [12]:
# Switch the kernel's working directory to the evaluation-script folder so the
# next cell can invoke evalResult.py by its bare file name.
%cd ../EvalScript

C:\Users\kting\Documents\GitHub\50.007-design-project\EvalScript


In [13]:
datasets = ['EN', 'SG', 'CN']

# Run evalResult.py once per dataset, passing the dev.out file as `gold`
# and the dev.p2.out file written earlier in this notebook as `pred`.
for dataset in datasets:
    gold = f"../dataset/{dataset}/dev.out"
    pred = f"../dataset/{dataset}/dev.p2.out"
    print(dataset)
    # IPython shell escape: $gold and $pred are expanded from the Python variables above.
    !python evalResult.py $gold $pred
    print("=" * 20, end="\n\n")

EN

#Entity in gold data: 13179
#Entity in prediction: 18650

#Correct Entity : 9542
Entity  precision: 0.5116
Entity  recall: 0.7240
Entity  F: 0.5996

#Correct Sentiment : 8456
Sentiment  precision: 0.4534
Sentiment  recall: 0.6416
Sentiment  F: 0.5313

SG

#Entity in gold data: 4301
#Entity in prediction: 12237

#Correct Entity : 2386
Entity  precision: 0.1950
Entity  recall: 0.5548
Entity  F: 0.2885

#Correct Sentiment : 1531
Sentiment  precision: 0.1251
Sentiment  recall: 0.3560
Sentiment  F: 0.1851

CN

#Entity in gold data: 700
#Entity in prediction: 4248

#Correct Entity : 345
Entity  precision: 0.0812
Entity  recall: 0.4929
Entity  F: 0.1395

#Correct Sentiment : 167
Sentiment  precision: 0.0393
Sentiment  recall: 0.2386
Sentiment  F: 0.0675

