In [1]:
import re
import json
from collections import defaultdict
import numpy as np
import pandas as pd 
from tqdm import tqdm 

### Task 1: Vocabulary Creation (20 points)

What is the selected threshold for unknown words replacement? What is the total size of your
vocabulary and what is the total occurrences of the special token ‘< unk >’
after replacement?

In [2]:
n_threshold = 2
train_vocab = defaultdict(int)

vocab_df  = pd.read_csv('../../data/vocab-data/train', sep='\t', skip_blank_lines = False, header = None)
vocab_df.columns = ['Index', 'Word', 'POS']

print(vocab_df.columns)

# File importing
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()

    # Create vocab
    for line in Lines:
        if line.strip():
            word = re.split(r'\t', line)[1]
            cleaned_word = re.sub(r'\W+', '', word)     

        if word not in train_vocab:
            train_vocab[cleaned_word] = 0
        train_vocab[cleaned_word] += 1


Index(['Index', 'Word', 'POS'], dtype='object')


If some word 'xyz' has frequency 3 and my threshold for categorizing as '<unk>' is 4. Then  we should add 3 to the frequency occurrence count of '<unk>'

In [3]:
# Handle <unk> tokens  
unk_count = sum(v for k, v in train_vocab.items() if v <= n_threshold)
new_vocab = {k: v for k, v in train_vocab.items() if v > n_threshold}
new_vocab['<unk>'] = unk_count
indexed_vocab = {word: (index, count) for index, (word, count) in enumerate(sorted(new_vocab.items(), key = lambda item: item[1], reverse=True), start = 1)}

In [None]:
# File Writing
f = open("../../data/outputs/train_vocab.txt", "a")
for k,v in indexed_vocab.items():
    # word index count
    new_line = f"{k}\t{v[0]}\t{v[1]}\n"
    f.write(new_line)
f.close()

## HMM Model with Emission & Transition Probabilities

In [7]:
def load_word_frequencies(vocab_counts_file):
    word_frequencies = {}
    with open(vocab_counts_file, 'r') as file:
        for line in file:
            word, _, count = line.strip().split('\t')
            word_frequencies[word] = int(count)
    return word_frequencies

In [8]:
vocab_counts_file = '../../data/outputs/train_vocab.txt'  
word_frequencies = load_word_frequencies(vocab_counts_file)

transition_counts = defaultdict(int)
emission_counts = defaultdict(int)
state_counts = defaultdict(int)

with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()
    prev_state = None

    # Process each line
    for line in Lines:
        line = line.strip()
        parts = line.split('\t')
        
        if len(parts) >= 2:
            word, state = parts[1], parts[2]
            # Replace the word with '<unk>' if its frequency is below the threshold
            cleaned_word = word if word_frequencies.get(word, 0) >= n_threshold else '<unk>'

            # Emission and transition counts
            emission_counts[(state, cleaned_word)] += 1
            state_counts[state] += 1
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
            prev_state = state

        else:
            word_type = '/n'
            state = '<new_line>'
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
                state_counts[state] += 1

            prev_state = state


In [9]:
# Calculate probabilities
transition_probs = {k: v / state_counts[k[0]] for k, v in transition_counts.items()}
emission_probs = {k: v / state_counts[k[0]] for k, v in emission_counts.items()}

In [10]:
# HMM Model for JSON
hmm_model = {
    "transition": {f"({k[0]},{k[1]})": v for k, v in transition_probs.items()},
    "emission": {f"({k[0]},{k[1]})": v for k, v in emission_probs.items()}
}

with open("../../data/outputs/hmm.json", "w") as f:
    json.dump(hmm_model, f, indent = 4)

In [11]:
print(len(hmm_model['transition']))
print(len(hmm_model['emission']))

1416
21219


## Greedy HMM Decoding

In [22]:
output_file_path = '../../data/outputs/greedy.out'
states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
    Lines = tr_file.readlines()
    prev_state = '<new_line>'
    
    for line in Lines:
        line = line.split('\t')
        max_prob = -np.inf  # Reset max for each new word

        if len(line) == 3:  # if not new line 
            _, word, index = line[2].replace('\n',''), line[1], line[0]
            word = word if word_frequencies.get(word, 0) >= n_threshold else '<unk>'

            prev_state = '<new_line>' if index == '1' else prev_state
                
            for state in states:
                trans_indexing = f'({prev_state},{state})'
                emiss_indexing = f'({state},{word})'
                # print(emiss_indexing)
                try:
                    trans = hmm_model['transition'][trans_indexing]
                    emiss = hmm_model['emission'][emiss_indexing]
                except KeyError:
                    continue

                s_prob = trans * emiss
                if s_prob > max_prob:

                    max_prob = s_prob
                    optim_state = state

            out_file.write(f'{index}\t{word}\t{optim_state}\n')
            prev_state = optim_state  # Update prev_state correctly within the loop

        else:
            out_file.write('\n')
            prev_state = '<new_line>' 


In [23]:
! python ../eval.py -p ../../data/outputs/greedy.out -g ../../data/vocab-data/dev

total: 131768, correct: 102741, accuracy: 77.97%


## Viterbi Decoding 

In [32]:
def viterbi_decode(observations, states, hmm_model, output_file_path):
    num_obs = len(observations)
    num_states = len(states)
    viterbi_table = [[0.0 for _ in range(num_states)] for _ in range(num_obs)]
    backpointer = [[0 for _ in range(num_states)] for _ in range(num_obs)]
    
    default_probability = 0.0
    
    # Initialize the first column of the Viterbi table
    for s in range(num_states):
        state = states[s]
        emiss_indexing = f'({state},{observations[0]})' if observations[0] != '\n' else None
        emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability) if emiss_indexing else default_probability
        viterbi_table[0][s] = emission_prob
        backpointer[0][s] = 0
    
    # Fill the Viterbi table
    for t in range(1, num_obs): #, desc="Filling Viterbi table"):
        for s in range(num_states):
            state = states[s]
            max_tr_prob = None
            prev_st_selected = 0
            for prev_st in range(num_states):
                prev_state = states[prev_st]
                trans_indexing = f'({prev_state},{state})'
                tr_prob = viterbi_table[t-1][prev_st] * hmm_model['transition'].get(trans_indexing, default_probability)
                if max_tr_prob is None or tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st
                           

            emiss_indexing = f'({state},{observations[t]})'
            emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability)
            max_prob = max_tr_prob * emission_prob
            
            viterbi_table[t][s] = max_prob
            backpointer[t][s] = prev_st_selected

            assert max_tr_prob is not None, f"Max transition probability not found for t={t}, state={state}"
            # assert emission_prob > 0, f"Emission probability is zero or very low for t={t}, state={state}, observation={observations[t]}"
     
    
    # Decode the best path from back to front
    best_path = []
    max_prob = max(viterbi_table[-1])
    last_state = viterbi_table[-1].index(max_prob)
    best_path.append(states[last_state])

    for t in range(num_obs - 2, -1, -1): #, desc="Backtracking"):
        last_state = backpointer[t+1][last_state]
        best_path.insert(0, states[last_state])

    return best_path

In [33]:
def extract_sentences_from_dev_file(file_path, word_freq, n_threshold = 2):
    sentences = [] 
    current_sentence = []  
    
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():  
                parts = line.strip().split('\t')
                if len(parts) >= 2:  # Ensure we have both index and word in the line
                    word = parts[1]
                    # Replace the word with '<unk>' if its frequency is below the threshold
                    cleaned_word = word if word_freq.get(word, 0) >= n_threshold else '<unk>'
                    current_sentence.append(cleaned_word)
            else: 
                if current_sentence:  
                    sentences.append(current_sentence)
                    current_sentence = []  
        
        if current_sentence: # IF NOT NEW LINE
            sentences.append(current_sentence)
    
    return sentences

In [34]:
with open('../../data/outputs/hmm.json') as model_file:
    hmm_model = json.load(model_file)

states_list = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))
sentences = extract_sentences_from_dev_file('../../data/vocab-data/dev', word_freq = word_frequencies )

opt_seq = []
for i, sentence in tqdm(enumerate(sentences)):
    path = viterbi_decode(observations = sentences[i] , states = states_list, hmm_model = hmm_model, output_file_path = '../../data/outputs/viterbi.out' )
    opt_seq.append(path)

5527it [01:06, 82.81it/s] 


In [35]:
def write_viterbi_output(dev_file_path, predictions, output_file_path):
    with open(dev_file_path, 'r') as dev_file, open(output_file_path, 'w') as out_file:
        prediction_index = 0  # separate predictions counter 
        for line in dev_file:
            if line.strip():
                index, word, _ = line.strip().split('\t')
                if prediction_index < len(predictions) and predictions[prediction_index]:
                    tag = predictions[prediction_index].pop(0)
                    out_file.write(f'{index}\t{word}\t{tag}\n')
            else:
                out_file.write('\n')
                prediction_index += 1  


dev_file_path = '../../data/vocab-data/dev'  # Update this to your dev file path
output_file_path='./viterbi_new.out'
write_viterbi_output(dev_file_path, opt_seq, output_file_path)

In [36]:
! python ../eval.py -p viterbi_new.out -g ../../data/vocab-data/dev


total: 131768, correct: 107740, accuracy: 81.76%
