In [1]:
import re
import json
from collections import defaultdict
import numpy as np

### Task 1: Vocabulary Creation (20 points)

What is the selected threshold for unknown words replacement? What is the total size of your
vocabulary and what is the total occurrences of the special token ‘< unk >’
after replacement?

In [2]:
n_threshold = 3
train_vocab = defaultdict(int)

# File importing
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()
 
    # Create vocab
    for line in Lines:
        if line.strip():
            word = re.split(r'\t', line)[1]
            cleaned_word = re.sub(r'\W+', '', word)     

        if word not in train_vocab:
            train_vocab[cleaned_word] = 0
        train_vocab[cleaned_word] += 1

# Handle <unk> tokens  
unk_count = sum(v for k, v in train_vocab.items() if v <= n_threshold)
new_vocab = {k: v for k, v in train_vocab.items() if v > n_threshold}
new_vocab['<unk>'] = unk_count
indexed_vocab = {word: (index, count) for index, (word, count) in enumerate(sorted(new_vocab.items(), key = lambda item: item[1], reverse=True), start = 1)}

# File Writing
f = open("../../data/outputs/train_vocab.txt", "a")
for k,v in indexed_vocab.items():
    # word index count
    new_line = f"{k}\t{v[0]}\t{v[1]}\n"
    f.write(new_line)
f.close()


## HMM Model with Emission & Transition Probabilities

In [3]:
transition_counts = defaultdict(int)
emission_counts = defaultdict(int)
state_counts = defaultdict(int)

# Open training data
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()
    prev_state = None

    # Process each line
    for line in Lines:
        line = line.strip()
        parts = line.split('\t')
        if len(parts) >= 2:
            word, state = parts[1], parts[2]
            cleaned_word = re.sub(r'\W+', '', word)

            # Emission and transition counts
            emission_counts[(state, cleaned_word)] += 1
            state_counts[state] += 1
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
            prev_state = state
        else:
            # Calculate emission probability of new sentence to a word type
            word_type = '/n'
            state = '<new_line>'
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
                state_counts[state] += 1

            prev_state = state

In [4]:
transition_counts

defaultdict(int,
            {('NNP', 'NNP'): 33139,
             ('NNP', ','): 12131,
             (',', 'CD'): 987,
             ('CD', 'NNS'): 5502,
             ('NNS', 'JJ'): 995,
             ('JJ', ','): 1717,
             (',', 'MD'): 490,
             ('MD', 'VB'): 7541,
             ('VB', 'DT'): 5661,
             ('DT', 'NN'): 37299,
             ('NN', 'IN'): 31554,
             ('IN', 'DT'): 31088,
             ('DT', 'JJ'): 17200,
             ('JJ', 'NN'): 26472,
             ('NN', 'NNP'): 1214,
             ('NNP', 'CD'): 1680,
             ('CD', '.'): 2530,
             ('.', '<new_line>'): 35255,
             ('<new_line>', 'NNP'): 7562,
             ('NNP', 'VBZ'): 3434,
             ('VBZ', 'NN'): 751,
             ('IN', 'NNP'): 14091,
             (',', 'DT'): 6211,
             ('DT', 'NNP'): 8757,
             ('NNP', 'VBG'): 155,
             ('VBG', 'NN'): 1819,
             ('NN', '.'): 13890,
             ('JJ', 'CC'): 1003,
             ('CC', 'JJ'): 252

In [5]:
# Calculate probabilities
transition_probs = {k: v / state_counts[k[0]] for k, v in transition_counts.items()}
emission_probs = {k: v / state_counts[k[0]] for k, v in emission_counts.items()}

In [6]:
# HMM Model for JSON
hmm_model = {
    "transition": {f"({k[0]},{k[1]})": v for k, v in transition_probs.items()},
    "emission": {f"({k[0]},{k[1]})": v for k, v in emission_probs.items()}
}

with open("../../data/outputs/hmm.json", "w") as f:
    json.dump(hmm_model, f, indent = 4)

In [7]:
print(len(hmm_model['transition']))
print(len(hmm_model['emission']))

1416
48892


## Greedy HMM Decoding

In [8]:
# Get set of all possible tags
states = set()
for transition_key in hmm_model["transition"].keys():
    state_pair = transition_key.strip("()").split(',')
    for state in state_pair:
        cleaned_state = state.strip("\"' ")
        if cleaned_state:
            states.add(cleaned_state)

states_list = list(states)
print(states_list)

['LS', 'EX', 'JJ', 'VBG', 'WP', 'VBD', 'VB', 'PRP', '-LRB-', 'UH', ':', 'FW', 'TO', '-RRB-', 'VBP', 'WRB', 'NNPS', 'RBS', 'NNP', '$', 'IN', 'CD', 'DT', 'VBZ', 'RBR', 'POS', 'MD', 'VBN', 'SYM', 'NNS', '<new_line>', 'RP', 'WP$', 'JJR', 'JJS', 'CC', '.', 'NN', 'RB', '#', 'PRP$', 'PDT', 'WDT', '``']


In [21]:
output_file_path = '../../data/outputs/greedy.out'

with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
    Lines = tr_file.readlines()
    max = -np.inf
    prev_state = '<new_line>'
    
    for line in Lines:
        line = line.split('\t')
        if len(line) == 3:
            _, word, index = line[2].replace('\n',''), line[1], line[0]
            
            prev_state = '<new_line>' if index == '1' else prev_state
                
            for state in states_list:
                trans_indexing = f'({prev_state},{state})'
                emiss_indexing = f'({state},{word})'
                
                try:
                    trans = hmm_model['transition'][trans_indexing]
                    emiss = hmm_model['emission'][emiss_indexing]
                except KeyError:
                    continue

                s_prob = trans * emiss
                if s_prob > max:
                    max = s_prob
                    optim_state = state
                    # print(f'Previous {prev_state}, State: {state}, word: {word}')

            out_file.write(f'{index}\t{word}\t{optim_state}\n')
            
            prev_state = state 
            max = -np.inf


In [24]:
! python ../eval.py -p ../../data/outputs/greedy.out -g ../../data/vocab-data/dev


total: 131768, correct: 97430, accuracy: 73.94%


In [None]:
# python eval.py -p greedy.out -g dev
# python POS-Sequence-Labeling\eval.py -p ./data/outputs/greedy.out -g ./data/vocab-data/dev
# There should be two greedy.out files, one for dev data, the other for test data. You need compute the accuracy on dev set and submit the greedy.out of test data.


## Viterbi Decoding 