In [2]:
import re
import json
from collections import defaultdict
import numpy as np
import pandas as pd 
from tqdm import tqdm 
from pathlib import Path

### Task 1: Vocabulary Creation (20 points)

What threshold was selected for unknown-word replacement? What is the total size of the
vocabulary, and how many times does the special token `<unk>` occur
after replacement?

In [3]:
n_threshold = 2
train_vocab = defaultdict(int)

vocab_df  = pd.read_csv('../../data/vocab-data/train', sep='\t', skip_blank_lines = False, header = None)
vocab_df.columns = ['Index', 'Word', 'POS']

print(vocab_df.columns)

# Count how often every token occurs in the training file.
#
# Each non-blank line is "<index>\t<word>\t<tag>"; blank lines separate
# sentences. Three things differ from the previous version of this cell:
#   * counting happens only for non-blank lines (before, a blank line
#     re-counted the last word of the sentence that had just ended);
#   * the counter is keyed by the token exactly as it appears in the file.
#     The earlier `\W+` stripping mapped every pure-punctuation token to ''
#     and tested membership with the raw word while writing under the
#     stripped key, which reset such counts to zero on every occurrence;
#   * raw tokens are also what the later cells look up in the vocabulary
#     (`word_frequencies.get(word, 0)`), so the keys now agree.
with open('../../data/vocab-data/train', 'r') as tr_file:
    for line in tr_file:
        if not line.strip():
            continue  # sentence separator
        word = line.rstrip('\n').split('\t')[1]
        train_vocab[word] += 1  # defaultdict(int) starts missing keys at 0


Index(['Index', 'Word', 'POS'], dtype='object')


If a word 'xyz' has frequency 3 and the threshold for mapping words to '<unk>' is 4, then its 3 occurrences are added to the occurrence count of '<unk>'.

In [4]:
# Fold every word seen at most `n_threshold` times into one '<unk>' entry,
# keeping the remaining words with their own counts.
new_vocab = {}
unk_count = 0
for token, freq in train_vocab.items():
    if freq > n_threshold:
        new_vocab[token] = freq
    else:
        unk_count += freq
new_vocab['<unk>'] = unk_count

# Rank entries by descending count; ranks start at 1 and ties keep the
# order in which the words were first inserted (the sort is stable).
ranked_entries = sorted(new_vocab.items(), key=lambda entry: entry[1], reverse=True)
indexed_vocab = {
    token: (rank, freq)
    for rank, (token, freq) in enumerate(ranked_entries, start=1)
}

In [5]:
# Persist the vocabulary, one "<word>\t<index>\t<count>" record per line.
with open("../../data/outputs/train_vocab.txt", "w") as f:
    f.writelines(
        f"{token}\t{rank}\t{freq}\n"
        for token, (rank, freq) in indexed_vocab.items()
    )


## HMM Model with Emission & Transition Probabilities

In [6]:
def load_word_frequencies(vocab_counts_file):
    """Load a vocabulary file into a ``{word: count}`` dictionary.

    Each non-empty line must hold three tab-separated fields: the word, a
    middle field that is ignored, and an integer count.

    Args:
        vocab_counts_file: Path of the vocabulary file to read.

    Returns:
        dict mapping each word to its count as an ``int``.

    Raises:
        ValueError: If a non-empty line does not have exactly three
            tab-separated fields or its count is not an integer.
    """
    word_frequencies = {}
    with open(vocab_counts_file, 'r') as file:
        for line in file:
            # Remove only the line terminator: a full strip() would also eat
            # a leading tab and break the three-way unpack when the word
            # field is empty.
            record = line.rstrip('\r\n')
            if not record:
                continue  # tolerate blank lines
            word, _, count = record.split('\t')
            word_frequencies[word] = int(count)
    return word_frequencies

In [7]:
def read_wordsequences_tagsequences(path: str | Path) -> tuple[list[list[str]], list[list[str]]]:
    """Read a tagged corpus file (used for the train and dev sets).

    Every non-blank line is split on tabs; field 1 is taken as the word and
    field 2 as the tag. Blank lines separate sentences. Any number of
    consecutive blank lines, as well as leading or trailing ones, is
    tolerated, and an empty file yields two empty lists.

    Args:
        path: Location of the file to read.

    Returns:
        ``(word_sequences, tag_sequences)``: two parallel lists with one
        inner list per sentence.

    Raises:
        IndexError: If a non-blank line has fewer than three tab-separated
            fields.
    """
    word_sequences: list[list[str]] = []
    tag_sequences: list[list[str]] = []
    words: list[str] = []
    tags: list[str] = []

    with open(path) as f:
        for line in f:
            record = line.rstrip('\n')
            if not record.strip():
                # Sentence boundary: flush only if something was collected,
                # so repeated blank lines do not create empty sentences.
                if words:
                    word_sequences.append(words)
                    tag_sequences.append(tags)
                    words, tags = [], []
                continue
            fields = record.split('\t')
            words.append(fields[1])
            tags.append(fields[2])

    # The last sentence is not necessarily followed by a blank line.
    if words:
        word_sequences.append(words)
        tag_sequences.append(tags)

    return word_sequences, tag_sequences

from __future__ import annotations
from collections import Counter
from dataclasses import dataclass
from itertools import chain
from typing import Any
import itertools


class BaseTransform:
    """Base class that supplies ``fit_transform`` to its subclasses.

    Subclasses are expected to provide ``fit`` and ``transform``.
    """

    def fit_transform(self, *args, **kwargs) -> Any:
        """Call ``fit`` and then ``transform`` with the same arguments."""
        self.fit(*args, **kwargs)
        transformed = self.transform(*args, **kwargs)
        return transformed


@dataclass
class CountReplaceTransform(BaseTransform):
    """Replace words that were rare in the fitted sequences by a special token."""

    # Minimum number of occurrences a word needs in order to be kept.
    threshold: int
    # Token substituted for every word that is not kept.
    special_token: str

    def fit(self, sequences: list[list[str]], *_) -> CountReplaceTransform:
        """Record the words occurring at least ``threshold`` times.

        The kept words and their counts are stored in ``self.words``.
        Extra positional arguments are ignored. Returns ``self``.
        """
        kept = {}
        for word, count in Counter(chain.from_iterable(sequences)).items():
            if count >= self.threshold:
                kept[word] = count
        self.words = kept
        return self

    def transform(self, sequences: list[list[str]], *_) -> list[list[str]]:
        """Return copies of ``sequences`` with unrecorded words replaced.

        Words present in ``self.words`` are kept as they are; every other
        word becomes ``special_token``. Extra positional arguments are
        ignored.
        """
        replaced = []
        for sequence in sequences:
            row = []
            for word in sequence:
                row.append(word if word in self.words else self.special_token)
            replaced.append(row)
        return replaced


# Utilization
vocab_counts_file = '../../data/outputs/train_vocab.txt'
word_frequencies = load_word_frequencies(vocab_counts_file)

# Training sentences, with rare words replaced by '<unk>'.
word_sequences, tag_sequences = read_wordsequences_tagsequences('../../data/vocab-data/train')
transformer = CountReplaceTransform(2, '<unk>')
word_sequences_t = transformer.fit_transform(word_sequences)

# Raw counts: tags, (tag, word) pairs, sentence-initial tags, tag bigrams.
counts = Counter(itertools.chain.from_iterable(tag_sequences))
emission_temp = [
    list(zip(tags, words))
    for tags, words in zip(tag_sequences, word_sequences_t)
]
emission_counts = Counter(itertools.chain.from_iterable(emission_temp))

prior_counts = Counter(tags[0] for tags in tag_sequences)

bigrams = [list(zip(tags, tags[1:])) for tags in tag_sequences]
transition_counts = Counter(chain.from_iterable(bigrams))

# S: tag set, O: observed (transformed) word set, N: number of sentences.
S = set(counts.keys())
O = set(Counter(itertools.chain.from_iterable(word_sequences_t)).keys())
N = len(tag_sequences)

transition_probs = defaultdict(int)
emission_probs = defaultdict(int)
state_probs = defaultdict(int)

# Prior Probabilities: share of sentences that start with each tag
for tag in S:
    state_probs[tag] = prior_counts[tag] / N

# Transition probabilities: count(prev -> next) / count(prev), for every tag pair
for prev_tag, next_tag in itertools.product(S, S):
    transition_probs[prev_tag, next_tag] = transition_counts[prev_tag, next_tag] / counts[prev_tag]

# Emission probabilities: count(tag, word) / count(tag), for every word/tag pair
for observed_word, tag in itertools.product(O, S):
    emission_probs[tag, observed_word] = emission_counts[tag, observed_word] / counts[tag]


In [8]:
# Show only the (tag, word) pairs that have a non-zero emission probability.
list(filter(lambda entry: entry[1] != 0, emission_probs.items()))

[(('JJ', 'Central'), 8.482627578718784e-05),
 (('NNP', 'Central'), 0.00045657930782576936),
 (('NN', 'tooling'), 1.5682092618439004e-05),
 (('NNS', 'Taxes'), 3.456679168322992e-05),
 (('NN', 'masse'), 7.841046309219502e-06),
 (('FW', 'masse'), 0.004464285714285714),
 (('NN', 'litmus'), 1.5682092618439004e-05),
 (('NNS', 'Events'), 3.456679168322992e-05),
 (('NN', 'box'), 0.00011761569463829253),
 (('NN', 'seed'), 0.00018034406511204854),
 (('RB', 'clear'), 0.00013503933020492218),
 (('VBP', 'clear'), 8.112932013629726e-05),
 (('JJ', 'clear'), 0.001967969598262758),
 (('VB', 'clear'), 0.0006277217623288478),
 (('NN', 'Time'), 8.625150940141452e-05),
 (('NNP', 'Time'), 0.0007647703406081636),
 (('NNP', "O'Brien"), 0.0001027303442607981),
 (('CD', '1991'), 0.002637917192338571),
 (('NNS', 'aides'), 0.00031110112514906926),
 (('NNS', 'conservatives'), 0.00017283395841614962),
 (('CD', '320'), 0.000200711090721413),
 (('NN', 'rolling'), 1.5682092618439004e-05),
 (('JJ', 'rolling'), 6.786102

In [9]:
# Show only the (tag, tag) pairs that have a non-zero transition probability.
list(filter(lambda entry: entry[1] != 0, transition_probs.items()))

[(('VBD', 'VBD'), 0.002013493941855947),
 (('VBD', 'RB'), 0.08424882546186725),
 (('VBD', '$'), 0.015436786887562259),
 (('VBD', '-RRB-'), 0.0001059733653608393),
 (('VBD', 'UH'), 0.00014129782048111908),
 (('VBD', 'VBP'), 0.00014129782048111908),
 (('VBD', 'RBS'), 0.0003885690063230775),
 (('VBD', 'EX'), 0.0010950581087286729),
 (('VBD', '#'), 0.0001059733653608393),
 (('VBD', "''"), 0.0004238934614433572),
 (('VBD', 'NNPS'), 0.00014129782048111908),
 (('VBD', 'NN'), 0.032781094351619626),
 (('VBD', 'JJ'), 0.05510614998763644),
 (('VBD', 'JJR'), 0.0075594333957398705),
 (('VBD', 'VBZ'), 0.0004238934614433572),
 (('VBD', 'MD'), 0.0003885690063230775),
 (('VBD', 'PRP$'), 0.026140096789007028),
 (('VBD', 'CC'), 0.002967254230103501),
 (('VBD', 'IN'), 0.11498110141651065),
 (('VBD', 'WDT'), 0.0001059733653608393),
 (('VBD', 'DT'), 0.17012257585926738),
 (('VBD', 'RP'), 0.01540146243244198),
 (('VBD', 'NNP'), 0.0553534211734784),
 (('VBD', '``'), 0.0061111307358084005),
 (('VBD', 'TO'), 0.

Original Cell

In [10]:
# vocab_counts_file = '../../data/outputs/train_vocab.txt'  
# word_frequencies = load_word_frequencies(vocab_counts_file)

# transition_counts = defaultdict(int)
# emission_counts = defaultdict(int)
# state_counts = defaultdict(int)

# with open('../../data/vocab-data/train', 'r') as tr_file:
#     Lines = tr_file.readlines()
#     prev_state = None

#     # Process each line
#     for line in Lines:
#         line = line.strip()
#         parts = line.split('\t')
        
#         if len(parts) >= 2:
#             word, state = parts[1], parts[2]
#             # Replace the word with '<unk>' if its frequency is below the threshold
#             cleaned_word = word if word in word_frequencies else '<unk>'

#             # Emission and transition counts
#             emission_counts[(state, cleaned_word)] += 1
#             state_counts[state] += 1
#             if prev_state is not None:
#                 transition_counts[(prev_state, state)] += 1
#             prev_state = state

#         else:
#             word_type = '/n'
#             state = '<new_line>'
#             if prev_state is not None:
#                 transition_counts[(prev_state, state)] += 1
#                 state_counts[state] += 1

#             prev_state = state


In [11]:
# # Calculate probabilities
# transition_probs = {k: v for k, v in transition_counts.items()}
# emission_probs = {k: v for k, v in emission_counts.items()}

In [12]:
# HMM Model for JSON: tuple keys are not valid JSON keys, so each pair
# (a, b) is flattened to the string "(a,b)".
hmm_model = {
    "transition": {
        f"({prev_tag},{next_tag})": prob
        for (prev_tag, next_tag), prob in transition_probs.items()
    },
    "emission": {
        f"({tag},{observed})": prob
        for (tag, observed), prob in emission_probs.items()
    },
}

with open("../../data/outputs/hmm_.json", "w") as f:
    json.dump(hmm_model, f, indent = 4)

In [13]:
# Number of stored transition and emission parameters, in that order.
for section in ('transition', 'emission'):
    print(len(hmm_model[section]))

2025
1043235


## Greedy HMM Decoding

In [14]:
from typing import  List


def decode_greedy(sentence: List[str], state_probs: dict, transition_probs: dict, emission_probs: dict, states=None) -> List[str]:
    """Tag a sentence greedily, one word at a time, left to right.

    The first word gets the state maximising ``prior * emission``; every
    later word gets the state maximising ``transition(previous, state) *
    emission``. If every candidate scores 0 for a word, the choice falls
    back to the state with the highest emission probability for that word,
    so the returned path never contains ``None``. Ties go to the state that
    comes first in ``states``.

    Args:
        sentence: Words to tag.
        state_probs: Mapping ``state -> prior probability``.
        transition_probs: Mapping ``(previous_state, state) -> probability``.
        emission_probs: Mapping ``(state, word) -> probability``.
        states: Optional iterable of candidate states. Defaults to the keys
            of ``state_probs`` (earlier versions read a module-level set).

    Returns:
        One state per word; ``[]`` for an empty sentence.

    Raises:
        ValueError: If there are no candidate states.
    """
    if not sentence:
        return []

    candidates = list(state_probs) if states is None else list(states)
    if not candidates:
        raise ValueError("decode_greedy needs at least one candidate state.")

    def pick(word, context_prob):
        # .get() is used instead of indexing so that lookups never insert
        # new keys into defaultdict arguments.
        emissions = {s: emission_probs.get((s, word), 0) for s in candidates}
        scores = {s: context_prob(s) * emissions[s] for s in candidates}
        best = max(candidates, key=scores.__getitem__)
        if scores[best] > 0:
            return best
        # No state is reachable with non-zero probability: ignore the
        # context and trust the emission model alone.
        return max(candidates, key=emissions.__getitem__)

    path = [pick(sentence[0], lambda s: state_probs.get(s, 0))]
    for word in sentence[1:]:
        prev_state = path[-1]
        path.append(pick(word, lambda s: transition_probs.get((prev_state, s), 0)))

    return path


def write_wordsequences_tagsequences(filepath: str, word_sequences: list[list[str]], tag_sequences: list[list[str]]):
    """Write parallel word/tag sequences to ``filepath``.

    Each sentence becomes a run of "<position>\\t<word>\\t<tag>" lines with
    positions starting at 1; sentences are separated by one blank line. The
    whole text is assembled before the file is opened, so a length mismatch
    raises without touching the file.

    Raises:
        ValueError: If a word sequence and its tag sequence differ in length.
    """
    sentence_blocks = []

    for seq_index, word_seq in enumerate(word_sequences):
        tag_seq = tag_sequences[seq_index]

        if len(word_seq) != len(tag_seq):
            raise ValueError("Word sequence and tag sequence lengths do not match.")

        sentence_blocks.append("".join(
            f"{position}\t{word}\t{tag}\n"
            for position, (word, tag) in enumerate(zip(word_seq, tag_seq), start=1)
        ))

    # Joining with "\n" puts an empty line between consecutive sentences.
    result_str = "\n".join(sentence_blocks)

    # Write the result string to the specified file
    with open(filepath, 'w') as file:
        file.write(result_str)



# Greedy decoding of the dev set, using the transformer fitted on train.
word_sequences_dev, tag_sequences_dev = read_wordsequences_tagsequences('../../data/vocab-data/dev')
word_sequences_dev_t = transformer.transform(word_sequences_dev)
pred_tagsequence = [
    decode_greedy(dev_sentence, state_probs, transition_probs, emission_probs)
    for dev_sentence in word_sequences_dev_t
]
write_wordsequences_tagsequences('../../data/outputs/greedy_.out', word_sequences_dev_t, pred_tagsequence)

!python ../eval.py -p ../../data/outputs/greedy_.out -g ../../data/vocab-data/dev

total: 131768, correct: 122793, accuracy: 93.19%


In [None]:
# Citation: https://en.wikipedia.org/wiki/Viterbi_algorithm
def viterbi(obs, states, start_p, trans_p, emit_p):
    """Find the most probable state sequence for ``obs`` (Viterbi algorithm).

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long observation sequences do not underflow to 0.0.
    Back-pointers are stored instead of copying a full path per state at
    every step.

    Args:
        obs: Sequence of observations.
        states: Iterable of candidate states.
        start_p: Mapping ``state -> initial probability``.
        trans_p: Mapping ``(previous_state, state) -> probability``.
        emit_p: Mapping ``(state, observation) -> probability``.
            Missing entries in any of the three mappings count as 0.

    Returns:
        ``(path, probability)``: the best state list and its probability.
        The probability is ``exp`` of the best log-score and may therefore
        be ``0.0`` for long sequences even though the path is well defined.
        An empty ``obs`` returns ``([], 1.0)``. If every path has
        probability 0, the returned path is arbitrary but deterministic.

    Raises:
        ValueError: If ``states`` is empty while ``obs`` is not.
    """
    import math

    if not obs:
        return [], 1.0

    states = list(states)
    if not states:
        raise ValueError("viterbi needs at least one state.")

    def log_prob(p):
        return math.log(p) if p > 0 else -math.inf

    # The transition table does not depend on t, so take its logs once.
    log_trans = {
        (prev_st, st): log_prob(trans_p.get((prev_st, st), 0))
        for prev_st in states
        for st in states
    }

    # Base case (t == 0)
    scores = {
        st: log_prob(start_p.get(st, 0)) + log_prob(emit_p.get((st, obs[0]), 0))
        for st in states
    }
    backpointers = []

    # Recursion (t > 0)
    for t in range(1, len(obs)):
        new_scores = {}
        pointers = {}
        for st in states:
            best_prev = max(states, key=lambda prev_st: scores[prev_st] + log_trans[prev_st, st])
            pointers[st] = best_prev
            new_scores[st] = (
                scores[best_prev]
                + log_trans[best_prev, st]
                + log_prob(emit_p.get((st, obs[t]), 0))
            )
        scores = new_scores
        backpointers.append(pointers)

    # Follow the back-pointers from the best final state.
    state = max(states, key=scores.__getitem__)
    best_log_prob = scores[state]
    path = [state]
    for pointers in reversed(backpointers):
        state = pointers[state]
        path.append(state)
    path.reverse()

    return path, math.exp(best_log_prob)

def dptable(V):
    """Yield the lines of a text table for a list of per-step score dicts.

    The first line holds the step numbers; each following line holds one
    state (taken from the keys of ``V[0]``) and its value at every step,
    formatted with five decimals.
    """
    step_labels = [f"{step:5d}" for step in range(len(V))]
    yield "    " + " ".join(step_labels)
    for state in V[0]:
        cells = [f"{column[state]:.5f}" for column in V]
        yield f"{state}: " + " ".join(cells)



# Viterbi decoding of the dev set, using the transformer fitted on train.
word_sequences_dev, tag_sequences_dev = read_wordsequences_tagsequences('../../data/vocab-data/dev')
word_sequences_dev_t = transformer.transform(word_sequences_dev)
pred_tagsequence = []
for sentence_index in tqdm(range(len(word_sequences_dev_t))):
    # viterbi returns (path, probability); only the path is kept.
    best_path, _best_prob = viterbi(word_sequences_dev_t[sentence_index], S, state_probs, transition_probs, emission_probs)
    pred_tagsequence.append(best_path)
write_wordsequences_tagsequences('../../data/outputs/viterbi_.out', word_sequences_dev_t, pred_tagsequence)

!python ../eval.py -p ../../data/outputs/viterbi_.out -g ../../data/vocab-data/dev

In [None]:
output_file_path = '../../data/outputs/greedy.out'


def _source_tag(model_key):
    """Return the first field of a "(first,second)" model key.

    A key whose inner text starts with a comma is read as having the tag
    ',' itself as first field; splitting on the first comma would return ''.
    """
    inner = model_key[1:-1]
    return ',' if inner.startswith(',') else inner.split(',', 1)[0]


# Sorted so that iteration order, and therefore tie-breaking, is reproducible.
states = sorted({_source_tag(k) for k in hmm_model['transition'].keys()})

with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
    prev_state = '<new_line>'

    for line in tr_file:
        fields = line.rstrip('\n').split('\t')

        if len(fields) != 3:  # blank line: sentence boundary
            out_file.write('\n')
            prev_state = '<new_line>'
            continue

        index, word = fields[0], fields[1]
        word = word if word_frequencies.get(word, 0) >= n_threshold else '<unk>'

        if index == '1':
            prev_state = '<new_line>'

        # Entries missing from the model count as probability 0.
        emissions = {
            state: hmm_model['emission'].get(f'({state},{word})', 0)
            for state in states
        }
        scores = {
            state: hmm_model['transition'].get(f'({prev_state},{state})', 0) * emissions[state]
            for state in states
        }

        optim_state = max(states, key=scores.__getitem__)
        if scores[optim_state] <= 0:
            # No state has a usable transition from prev_state (always the
            # case at sentence start when the model has no '<new_line>'
            # rows). Fall back to the best emission instead of reusing the
            # tag chosen for an earlier word.
            optim_state = max(states, key=emissions.__getitem__)

        out_file.write(f'{index}\t{word}\t{optim_state}\n')
        prev_state = optim_state


In [None]:
! python ../eval.py -p ../../data/outputs/greedy.out -g ../../data/vocab-data/dev

## Viterbi Decoding 

In [None]:
def viterbi_decode(observations, states, hmm_model, output_file_path, start_probs=None):
    """Viterbi decoding against a model whose keys are "(a,b)" strings.

    Scores are sums of log-probabilities, so long sentences do not underflow
    to 0.0. Ties are resolved in favour of the state that comes first in
    ``states``.

    Args:
        observations: Sequence of observed words.
        states: Sequence of candidate states.
        hmm_model: Dict with a ``'transition'`` mapping keyed by
            "(previous_state,state)" and an ``'emission'`` mapping keyed by
            "(state,word)". Missing keys count as probability 0.
        output_file_path: Unused; kept so existing calls keep working.
        start_probs: Optional mapping ``state -> initial probability``. When
            omitted, the first column is scored by emission probability
            alone, as before.

    Returns:
        List with one state per observation; ``[]`` for no observations.

    Raises:
        ValueError: If ``states`` is empty while ``observations`` is not.
    """
    import math

    if not observations:
        return []

    states = list(states)
    num_states = len(states)
    if num_states == 0:
        raise ValueError("viterbi_decode needs at least one state.")

    transition = hmm_model['transition']
    emission = hmm_model['emission']

    def log_prob(p):
        return math.log(p) if p > 0 else -math.inf

    def log_emit(state, observation):
        return log_prob(emission.get(f'({state},{observation})', 0.0))

    # log_trans[p][s] is the log-probability of moving from state p to state s.
    log_trans = [
        [log_prob(transition.get(f'({prev_state},{state})', 0.0)) for state in states]
        for prev_state in states
    ]

    # Initialise the first column.
    scores = []
    for state in states:
        log_prior = 0.0 if start_probs is None else log_prob(start_probs.get(state, 0.0))
        scores.append(log_prior + log_emit(state, observations[0]))

    # Fill the remaining columns, remembering the best predecessor of each cell.
    backpointer = []
    for observation in observations[1:]:
        new_scores = []
        pointers = []
        for s, state in enumerate(states):
            best_prev = max(range(num_states), key=lambda p: scores[p] + log_trans[p][s])
            pointers.append(best_prev)
            new_scores.append(scores[best_prev] + log_trans[best_prev][s] + log_emit(state, observation))
        scores = new_scores
        backpointer.append(pointers)

    # Decode the best path from back to front, then reverse it once.
    last_state = max(range(num_states), key=scores.__getitem__)
    best_indices = [last_state]
    for pointers in reversed(backpointer):
        last_state = pointers[last_state]
        best_indices.append(last_state)
    best_indices.reverse()

    return [states[i] for i in best_indices]

In [None]:
def extract_sentences_from_dev_file(file_path, word_freq, n_threshold = 1):
    """Read a tab-separated file and return its sentences as word lists.

    Blank lines end the current sentence. On other lines the second
    tab-separated field is taken as the word; lines with fewer than two
    fields are skipped. A word whose ``word_freq`` count (0 if absent) is
    below ``n_threshold`` is replaced by '<unk>'.

    Returns:
        List of sentences, each a list of words; empty sentences are never
        emitted.
    """
    sentences = []
    pending = []

    with open(file_path, 'r') as file:
        for raw_line in file:
            record = raw_line.strip()

            if not record:
                # Sentence boundary: flush whatever has been collected.
                if pending:
                    sentences.append(pending)
                    pending = []
                continue

            fields = record.split('\t')
            if len(fields) < 2:
                continue

            token = fields[1]
            is_frequent = word_freq.get(token, 0) >= n_threshold
            pending.append(token if is_frequent else '<unk>')

        # The file may end without a trailing blank line.
        if pending:
            sentences.append(pending)

    return sentences

In [None]:
# Reload the serialized model; this rebinds `hmm_model`.
with open('../../data/outputs/hmm.json') as model_file:
    hmm_model = json.load(model_file)

# Distinct source states, taken from the part of each "(a,b)" key before the first comma.
states_list = list({k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()})
sentences = extract_sentences_from_dev_file('../../data/vocab-data/dev', word_freq = word_frequencies )

# Decode every dev sentence; opt_seq[i] is the tag path of sentences[i].
opt_seq = []
for position, sentence in tqdm(enumerate(sentences)):
    opt_seq.append(
        viterbi_decode(sentence, states_list, hmm_model, '../../data/outputs/viterbi.out')
    )

In [None]:
def write_viterbi_output(dev_file_path, predictions, output_file_path):
    """Write predicted tags next to the tokens of a dev file.

    Every non-blank dev line "<index>\\t<word>\\t<tag>" is copied as
    "<index>\\t<word>\\t<predicted tag>"; blank lines are copied as blank
    lines. ``predictions[i]`` holds the tags of the i-th non-empty sentence
    and is only read, never modified. Runs of blank lines therefore do not
    shift the alignment: the sentence index advances only after a sentence
    that contained at least one token.

    A token for which no prediction is available (too few sentences, or too
    few tags in a sentence) is left out of the output, as before.

    Raises:
        ValueError: If a non-blank dev line does not have exactly three
            tab-separated fields.
    """
    with open(dev_file_path, 'r') as dev_file, open(output_file_path, 'w') as out_file:
        sentence_index = 0  # which entry of `predictions` is in use
        token_index = 0     # position inside the current sentence
        for line in dev_file:
            record = line.strip()
            if record:
                index, word, _ = record.split('\t')
                if sentence_index < len(predictions) and token_index < len(predictions[sentence_index]):
                    tag = predictions[sentence_index][token_index]
                    out_file.write(f'{index}\t{word}\t{tag}\n')
                token_index += 1
            else:
                out_file.write('\n')
                if token_index:
                    sentence_index += 1
                token_index = 0


dev_file_path = '../../data/vocab-data/dev'  # Update this to your dev file path
output_file_path = './viterbi_new.out'
write_viterbi_output(
    dev_file_path=dev_file_path,
    predictions=opt_seq,
    output_file_path=output_file_path,
)

In [None]:
! python ../eval.py -p viterbi_new.out -g ../../data/vocab-data/dev
