In [1]:
import re
import json
from collections import defaultdict
import numpy as np
import pandas as pd 

### Task 1: Vocabulary Creation (20 points)

What is the selected threshold for unknown words replacement? What is the total size of your
vocabulary and what is the total occurrences of the special token ‘< unk >’
after replacement?

In [2]:
n_threshold = 3
train_vocab = defaultdict(int)

vocab_df  = pd.read_csv('../../data/vocab-data/train', sep='\t', skip_blank_lines = False, header = None)
vocab_df.columns = ['Index', 'Word', 'POS']

print(vocab_df.columns)

# File importing
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()

    # Create vocab
    for line in Lines:
        if line.strip():
            word = re.split(r'\t', line)[1]
            cleaned_word = re.sub(r'\W+', '', word)     

        if word not in train_vocab:
            train_vocab[cleaned_word] = 0
        train_vocab[cleaned_word] += 1


Index(['Index', 'Word', 'POS'], dtype='object')


If some word 'xyz' has frequency 3 and my threshold for categorizing as '<unk>' is 4. Then  we should add 3 to the frequency occurrence count of '<unk>'

In [3]:
# Handle <unk> tokens  
unk_count = sum(v for k, v in train_vocab.items() if v <= n_threshold)
new_vocab = {k: v for k, v in train_vocab.items() if v > n_threshold}
new_vocab['<unk>'] = unk_count
indexed_vocab = {word: (index, count) for index, (word, count) in enumerate(sorted(new_vocab.items(), key = lambda item: item[1], reverse=True), start = 1)}

In [4]:
# File Writing
f = open("../../data/outputs/train_vocab.txt", "a")
for k,v in indexed_vocab.items():
    # word index count
    new_line = f"{k}\t{v[0]}\t{v[1]}\n"
    f.write(new_line)
f.close()

## HMM Model with Emission & Transition Probabilities

In [5]:
transition_counts = defaultdict(int)
emission_counts = defaultdict(int)
state_counts = defaultdict(int)

# Open training data
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()
    prev_state = None

    # Process each line
    for line in Lines:
        line = line.strip()
        parts = line.split('\t')
        
        if len(parts) >= 2:
            word, state = parts[1], parts[2]
            cleaned_word = word # re.sub(r'\W+', '', word)

            # Emission and transition counts
            emission_counts[(state, cleaned_word)] += 1
            state_counts[state] += 1
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
            prev_state = state

        else:
            # Calculate emission probability of new sentence to a word type
            word_type = '/n'
            state = '<new_line>'
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
                state_counts[state] += 1

            prev_state = state

In [6]:
# Calculate probabilities
transition_probs = {k: v / state_counts[k[0]] for k, v in transition_counts.items()}
emission_probs = {k: v / state_counts[k[0]] for k, v in emission_counts.items()}

In [7]:
# HMM Model for JSON
hmm_model = {
    "transition": {f"({k[0]},{k[1]})": v for k, v in transition_probs.items()},
    "emission": {f"({k[0]},{k[1]})": v for k, v in emission_probs.items()}
}

with open("../../data/outputs/hmm.json", "w") as f:
    json.dump(hmm_model, f, indent = 4)

In [8]:
print(len(hmm_model['transition']))
print(len(hmm_model['emission']))

1416
50286


## Greedy HMM Decoding

In [9]:
# output_file_path = '../../data/outputs/greedy.out'
# states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

# with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
#     Lines = tr_file.readlines()
#     max = -np.inf
#     prev_state = '<new_line>'
    
#     for line in Lines:
#         line = line.split('\t')

#         if len(line) == 3: # if not new line 
#             _, word, index = line[2].replace('\n',''), line[1], line[0]
#             prev_state = '<new_line>' if index == '1' else prev_state
                
#             for state in states:
#                 trans_indexing = f'({prev_state},{state})'
#                 emiss_indexing = f'({state},{word})'
                
#                 try:
#                     trans = hmm_model['transition'][trans_indexing]
#                     emiss = hmm_model['emission'][emiss_indexing]
                    
#                 except KeyError:
#                     continue

#                 s_prob = trans * emiss
#                 if s_prob > max:
#                     max = s_prob
#                     optim_state = state
#                     # print(f'Previous {prev_state}, State: {state}, word: {word}')
#             out_file.write(f'{index}\t{word}\t{optim_state}\n')
#         else:
#             out_file.write('\n')

                
#         prev_state = state 
#         max = -np.inf


In [10]:
output_file_path = '../../data/outputs/greedy.out'
states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
    Lines = tr_file.readlines()
    prev_state = '<new_line>'
    
    for line in Lines:
        line = line.split('\t')
        max = -np.inf  # Reset max for each new word

        if len(line) == 3:  # if not new line 
            _, word, index = line[2].replace('\n',''), line[1], line[0]
            prev_state = '<new_line>' if index == '1' else prev_state
                
            for state in states:
                trans_indexing = f'({prev_state},{state})'
                emiss_indexing = f'({state},{word})'
                
                try:
                    trans = hmm_model['transition'][trans_indexing]
                    emiss = hmm_model['emission'][emiss_indexing]
                except KeyError:
                    continue

                s_prob = trans * emiss
                if s_prob > max:
                    max = s_prob
                    optim_state = state

            out_file.write(f'{index}\t{word}\t{optim_state}\n')
            prev_state = optim_state  # Update prev_state correctly within the loop

        else:
            out_file.write('\n')
            prev_state = '<new_line>'  # Reset prev_state for a new sentence/line


In [11]:
! python ../eval.py -p ../../data/outputs/greedy.out -g ../../data/vocab-data/dev


total: 131768, correct: 114991, accuracy: 87.27%


In [12]:
# python eval.py -p greedy.out -g dev
# python POS-Sequence-Labeling\eval.py -p ./data/outputs/greedy.out -g ./data/vocab-data/dev
# There should be two greedy.out files, one for dev data, the other for test data. You need compute the accuracy on dev set and submit the greedy.out of test data.


## Viterbi Decoding 

In [13]:
def viterbi(obs, states, start_p, trans_p, emit_p):
    pass


with open('../../data/outputs/hmm.json') as model_file:
    hmm_model = json.load(model_file)

# Extract states and start probabilities if available; otherwise, initialize uniformly
states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))
start_probabilities = {state: 1/len(states) for state in states}  # Uniform start probabilities


with open('../../data/vocab-data/dev', 'r') as dev_file:
    observations = [line.strip().split('\t')[1] for line in dev_file if len(line.strip().split('\t')) > 1]

# states = ['NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB']  # Update based on your model
# observations = ['Pierre', 'Vinken'] 

print(len(observations), len(states))
# Since start_probabilities are not provided, assume uniform distribution
predicted_tags = viterbi(observations, states, start_probabilities, hmm_model['transition'], hmm_model['emission'])

print(start_probabilities)

131768 46
{'': 0.021739130434782608, 'VB': 0.021739130434782608, 'WRB': 0.021739130434782608, 'CC': 0.021739130434782608, 'DT': 0.021739130434782608, '#': 0.021739130434782608, 'WDT': 0.021739130434782608, 'FW': 0.021739130434782608, 'MD': 0.021739130434782608, '.': 0.021739130434782608, 'VBG': 0.021739130434782608, "''": 0.021739130434782608, '<new_line>': 0.021739130434782608, 'PRP': 0.021739130434782608, 'SYM': 0.021739130434782608, 'EX': 0.021739130434782608, 'PDT': 0.021739130434782608, 'RB': 0.021739130434782608, 'NNS': 0.021739130434782608, ':': 0.021739130434782608, '``': 0.021739130434782608, 'VBZ': 0.021739130434782608, 'VBP': 0.021739130434782608, 'NN': 0.021739130434782608, '$': 0.021739130434782608, 'IN': 0.021739130434782608, 'NNPS': 0.021739130434782608, 'JJR': 0.021739130434782608, 'LS': 0.021739130434782608, 'JJ': 0.021739130434782608, 'RBR': 0.021739130434782608, '-LRB-': 0.021739130434782608, 'WP$': 0.021739130434782608, 'TO': 0.021739130434782608, 'RBS': 0.021739130