In [40]:
import re
import json
from collections import defaultdict
import numpy as np
import pandas as pd 
from tqdm import tqdm 

### Task 1: Vocabulary Creation (20 points)

What is the selected threshold for unknown words replacement? What is the total size of your
vocabulary and what is the total occurrences of the special token ‘< unk >’
after replacement?

In [41]:
n_threshold = 3
train_vocab = defaultdict(int)

vocab_df  = pd.read_csv('../../data/vocab-data/train', sep='\t', skip_blank_lines = False, header = None)
vocab_df.columns = ['Index', 'Word', 'POS']

print(vocab_df.columns)

# File importing
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()

    # Create vocab
    for line in Lines:
        if line.strip():
            word = re.split(r'\t', line)[1]
            cleaned_word = re.sub(r'\W+', '', word)     

        if word not in train_vocab:
            train_vocab[cleaned_word] = 0
        train_vocab[cleaned_word] += 1


Index(['Index', 'Word', 'POS'], dtype='object')


If some word 'xyz' has frequency 3 and my threshold for categorizing as '<unk>' is 4. Then  we should add 3 to the frequency occurrence count of '<unk>'

In [42]:
# Handle <unk> tokens  
unk_count = sum(v for k, v in train_vocab.items() if v <= n_threshold)
new_vocab = {k: v for k, v in train_vocab.items() if v > n_threshold}
new_vocab['<unk>'] = unk_count
indexed_vocab = {word: (index, count) for index, (word, count) in enumerate(sorted(new_vocab.items(), key = lambda item: item[1], reverse=True), start = 1)}

In [43]:
# File Writing
f = open("../../data/outputs/train_vocab.txt", "a")
for k,v in indexed_vocab.items():
    # word index count
    new_line = f"{k}\t{v[0]}\t{v[1]}\n"
    f.write(new_line)
f.close()

## HMM Model with Emission & Transition Probabilities

In [44]:
transition_counts = defaultdict(int)
emission_counts = defaultdict(int)
state_counts = defaultdict(int)

# Open training data
with open('../../data/vocab-data/train', 'r') as tr_file:
    Lines = tr_file.readlines()
    prev_state = None

    # Process each line
    for line in Lines:
        line = line.strip()
        parts = line.split('\t')
        
        if len(parts) >= 2:
            word, state = parts[1], parts[2]
            cleaned_word = word # re.sub(r'\W+', '', word)

            # Emission and transition counts
            emission_counts[(state, cleaned_word)] += 1
            state_counts[state] += 1
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
            prev_state = state

        else:
            # Calculate emission probability of new sentence to a word type
            word_type = '/n'
            state = '<new_line>'
            if prev_state is not None:
                transition_counts[(prev_state, state)] += 1
                state_counts[state] += 1

            prev_state = state

In [45]:
# Calculate probabilities
transition_probs = {k: v / state_counts[k[0]] for k, v in transition_counts.items()}
emission_probs = {k: v / state_counts[k[0]] for k, v in emission_counts.items()}

In [46]:
# HMM Model for JSON
hmm_model = {
    "transition": {f"({k[0]},{k[1]})": v for k, v in transition_probs.items()},
    "emission": {f"({k[0]},{k[1]})": v for k, v in emission_probs.items()}
}

with open("../../data/outputs/hmm.json", "w") as f:
    json.dump(hmm_model, f, indent = 4)

In [47]:
print(len(hmm_model['transition']))
print(len(hmm_model['emission']))

1416
50286


## Greedy HMM Decoding

In [48]:
# output_file_path = '../../data/outputs/greedy.out'
# states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

# with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
#     Lines = tr_file.readlines()
#     max = -np.inf
#     prev_state = '<new_line>'
    
#     for line in Lines:
#         line = line.split('\t')

#         if len(line) == 3: # if not new line 
#             _, word, index = line[2].replace('\n',''), line[1], line[0]
#             prev_state = '<new_line>' if index == '1' else prev_state
                
#             for state in states:
#                 trans_indexing = f'({prev_state},{state})'
#                 emiss_indexing = f'({state},{word})'
                
#                 try:
#                     trans = hmm_model['transition'][trans_indexing]
#                     emiss = hmm_model['emission'][emiss_indexing]
                    
#                 except KeyError:
#                     continue

#                 s_prob = trans * emiss
#                 if s_prob > max:
#                     max = s_prob
#                     optim_state = state
#                     # print(f'Previous {prev_state}, State: {state}, word: {word}')
#             out_file.write(f'{index}\t{word}\t{optim_state}\n')
#         else:
#             out_file.write('\n')

                
#         prev_state = state 
#         max = -np.inf


In [49]:
output_file_path = '../../data/outputs/greedy.out'
states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
    Lines = tr_file.readlines()
    prev_state = '<new_line>'
    
    for line in Lines:
        line = line.split('\t')
        max_prob = -np.inf  # Reset max for each new word

        if len(line) == 3:  # if not new line 
            _, word, index = line[2].replace('\n',''), line[1], line[0]
            prev_state = '<new_line>' if index == '1' else prev_state
                
            for state in states:
                trans_indexing = f'({prev_state},{state})'
                emiss_indexing = f'({state},{word})'
                # print(emiss_indexing)
                try:
                    trans = hmm_model['transition'][trans_indexing]
                    emiss = hmm_model['emission'][emiss_indexing]
                except KeyError:
                    continue

                s_prob = trans * emiss
                if s_prob > max_prob:

                    max_prob = s_prob
                    optim_state = state

            out_file.write(f'{index}\t{word}\t{optim_state}\n')
            prev_state = optim_state  # Update prev_state correctly within the loop

        else:
            out_file.write('\n')
            prev_state = '<new_line>' 


In [50]:
# with open('../../data/vocab-data/dev', 'r') as tr_file, open(output_file_path, 'w') as out_file:
#     Lines = tr_file.readlines()
#     for line in Lines:
#         line = line.split('\t')
#         if len(line) == 3:  # if not new line 
#             _, word, index = line[2].replace('\n',''), line[1], line[0]
#             out_file.write(f'{index}\t{word}\t{optim_state}\n')
#         else:
#             out_file.write('\n')


In [51]:
! python ../eval.py -p ../../data/outputs/greedy.out -g ../../data/vocab-data/dev

total: 131768, correct: 114991, accuracy: 87.27%


In [52]:
# python eval.py -p greedy.out -g dev
# python POS-Sequence-Labeling\eval.py -p ./data/outputs/greedy.out -g ./data/vocab-data/dev
# There should be two greedy.out files, one for dev data, the other for test data. You need compute the accuracy on dev set and submit the greedy.out of test data.


## Viterbi Decoding 

In [53]:
# #works well at first but stops working 
# def viterbi_decode(observations, states, hmm_model):
#     num_obs = len(observations)
#     num_states = len(states)
#     print(f"States: {states}")
    
#     # Initialize the Viterbi table with zeros
#     viterbi_table = [[0.0 for _ in range(num_states)] for _ in range(num_obs)]
#     backpointer = [[0 for _ in range(num_states)] for _ in range(num_obs)]
    
#     default_probability = 0.00  # Small probability for missing transitions/emissions
    
#     # Initialize the first column of the table
#     for s in tqdm(range(num_states), desc="Initializing first column"):
#         state = states[s]
#         emiss_indexing = f'({state},{observations[0]})'
#         emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability)
#         viterbi_table[0][s] = (1 / num_states) * emission_prob
#         backpointer[0][s] = 0

#     # Fill the Viterbi table
#     for t in tqdm(range(1, num_obs), desc="Filling Viterbi table"):
#         for s in range(num_states):
#             state = states[s]
#             max_tr_prob = None
#             prev_st_selected = 0

#             for prev_st in range(num_states):
#                 prev_state = states[prev_st]
#                 trans_indexing = f'({prev_state},{state})'
#                 trans_prob = hmm_model['transition'].get(trans_indexing, default_probability)

#                 # if t == 36:  
#                 #     print('trans indexing: ', trans_indexing, 'prob: ', trans_prob)

#                 tr_prob = viterbi_table[t-1][prev_st] * trans_prob

#                 if max_tr_prob is None or tr_prob > max_tr_prob:
#                     max_tr_prob = tr_prob
#                     prev_st_selected = prev_st
            
#             emiss_indexing = f'({state},{observations[t]})'
#             emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability)
#             # if t == 36:  
#             #     print('emiss indexing : ', emiss_indexing, 'prob: ', emission_prob)

#             max_prob = max_tr_prob * emission_prob
#             viterbi_table[t][s] = max_prob
#             backpointer[t][s] = prev_st_selected
   
#     # The final most probable state sequence
#     best_path = []
#     max_prob = max(viterbi_table[-1])
#     last_state = viterbi_table[-1].index(max_prob)
#     best_path.append(states[last_state])

#     # Trace back
#     for t in tqdm(range(num_obs - 1, 0, -1), desc="Backtracking"):
#         last_state = backpointer[t][last_state]
#         best_path.insert(0, states[last_state])

#     return best_path


In [54]:
# with open('../../data/outputs/hmm.json') as model_file:
#     hmm_model = json.load(model_file)

# # Extract states and start probabilities if available; otherwise, initialize uniformly
# states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))
# start_probabilities = {state: 1/len(states) for state in states}  # Uniform start probabilities


# with open('../../data/vocab-data/dev', 'r') as dev_file:
#     observations = []
#     for line in dev_file:
#         if line.strip():  # If the line is not empty
#             observations.append(line.strip().split('\t')[1])
#         else:
#             break

# # states = ['NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB'] 
# # observations = ['Pierre', 'Vinken' ] 

# # Since start_probabilities are not provided, assume uniform distribution
# output_file_path = '../../data/outputs/viterbi.out'
# predicted_tags = viterbi_decode(observations, states, hmm_model)


In [55]:
# predicted_tags

# Makes first 100 predictions then stops

In [56]:
def viterbi_decode(observations, states, hmm_model, output_file_path):
    num_obs = len(observations)
    num_states = len(states)
    viterbi_table = [[0.0 for _ in range(num_states)] for _ in range(num_obs)]
    backpointer = [[0 for _ in range(num_states)] for _ in range(num_obs)]
    
    default_probability = 0.0
    
    # Initialize the first column of the Viterbi table
    for s in range(num_states):
        state = states[s]
        emiss_indexing = f'({state},{observations[0]})' if observations[0] != '\n' else None
        emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability) if emiss_indexing else default_probability
        viterbi_table[0][s] = emission_prob
        backpointer[0][s] = 0
    
    # Fill the Viterbi table
    for t in range(1, num_obs): #, desc="Filling Viterbi table"):
        for s in range(num_states):
            state = states[s]
            max_tr_prob = None
            prev_st_selected = 0
            for prev_st in range(num_states):
                prev_state = states[prev_st]
                trans_indexing = f'({prev_state},{state})'
                tr_prob = viterbi_table[t-1][prev_st] * hmm_model['transition'].get(trans_indexing, default_probability)
                if max_tr_prob is None or tr_prob > max_tr_prob:
                    max_tr_prob = tr_prob
                    prev_st_selected = prev_st
                           

            emiss_indexing = f'({state},{observations[t]})'
            emission_prob = hmm_model['emission'].get(emiss_indexing, default_probability)
            max_prob = max_tr_prob * emission_prob
            
            viterbi_table[t][s] = max_prob
            backpointer[t][s] = prev_st_selected

            assert max_tr_prob is not None, f"Max transition probability not found for t={t}, state={state}"
            # assert emission_prob > 0, f"Emission probability is zero or very low for t={t}, state={state}, observation={observations[t]}"
     
    
    # Decode the best path from back to front
    best_path = []
    max_prob = max(viterbi_table[-1])
    last_state = viterbi_table[-1].index(max_prob)
    best_path.append(states[last_state])

    for t in range(num_obs - 2, -1, -1): #, desc="Backtracking"):
        last_state = backpointer[t+1][last_state]
        best_path.insert(0, states[last_state])

    return best_path

states_list = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))

with open('../../data/outputs/hmm.json') as model_file:
    hmm_model = json.load(model_file)

def extract_sentences_from_dev_file(file_path):
    sentences = [] 
    current_sentence = []  
    
    with open(file_path, 'r') as file:
        for line in file:
            if line.strip():  
                parts = line.strip().split('\t')
                if len(parts) > 0:
                    word = parts[1]  
                    current_sentence.append(word)
            else: 
                if current_sentence:  
                    sentences.append(current_sentence)
                    current_sentence = []  
        
        if current_sentence:
            sentences.append(current_sentence)
    
    return sentences

# Usage example
file_path = '../../data/vocab-data/dev' 
sentences = extract_sentences_from_dev_file(file_path)
print('setnence: ', sentences[0])
opt_seq = []
for i, sentence in tqdm(enumerate(sentences)):
    path = viterbi_decode(observations = sentences[i] , states = states_list, hmm_model = hmm_model, output_file_path = '../../data/outputs/viterbi.out' )
    opt_seq.append(path)

setnence:  ['The', 'Arizona', 'Corporations', 'Commission', 'authorized', 'an', '11.5', '%', 'rate', 'increase', 'at', 'Tucson', 'Electric', 'Power', 'Co.', ',', 'substantially', 'lower', 'than', 'recommended', 'last', 'month', 'by', 'a', 'commission', 'hearing', 'officer', 'and', 'barely', 'half', 'the', 'rise', 'sought', 'by', 'the', 'utility', '.']


14it [00:00, 62.42it/s]

5527it [01:21, 67.47it/s]


In [57]:
def write_viterbi_output(dev_file_path, predictions, output_file_path):
    with open(dev_file_path, 'r') as dev_file, open(output_file_path, 'w') as out_file:
        prediction_index = 0  # separate predictions counter 
        for line in dev_file:
            if line.strip():
                index, word, _ = line.strip().split('\t')
                if prediction_index < len(predictions) and predictions[prediction_index]:
                    tag = predictions[prediction_index].pop(0)
                    out_file.write(f'{index}\t{word}\t{tag}\n')
            else:
                out_file.write('\n')
                prediction_index += 1  


dev_file_path = '../../data/vocab-data/dev'  # Update this to your dev file path
output_file_path='./viterbi_new.out'
write_viterbi_output(dev_file_path, opt_seq, output_file_path)

In [58]:
! python ../eval.py -p viterbi_new.out -g ../../data/vocab-data/dev


total: 131768, correct: 18131, accuracy: 13.76%


In [59]:
# with open('../../data/outputs/hmm.json') as model_file:
#     hmm_model = json.load(model_file)

# # Extract states and start probabilities if available; otherwise, initialize uniformly
# states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))
# start_probabilities = {state: 1/len(states) for state in states}  # Uniform start probabilities


# with open('../../data/vocab-data/dev', 'r') as dev_file:
#     observations = []
#     for line in dev_file:
#         if line.strip():  # If the line is not empty
#             observations.append(line.strip().split('\t')[1])
#         else:  # If the line is empty (sentence boundary)
#             observations.append('\n')  # Append a newline character to represent sentence boundaries

# # states = ['NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB'] 
# # observations = ['Pierre', 'Vinken' ] 

# # Since start_probabilities are not provided, assume uniform distribution
# output_file_path = '../../data/outputs/viterbi.out'
# predicted_tags = viterbi_decode(observations, states, hmm_model, output_file_path)


In [60]:
! python ../eval.py -p ../../data/outputs/viterbi.out -g ../../data/vocab-data/dev


Traceback (most recent call last):
  File "c:\Users\amant\OneDrive\Desktop\Projects\NLP\NLP-HW2\POS-Sequence-Labeling\eval.py", line 36, in <module>
    pline = pf_lines[pf_count]
            ~~~~~~~~^^^^^^^^^^
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\amant\OneDrive\Desktop\Projects\NLP\NLP-HW2\POS-Sequence-Labeling\eval.py", line 58, in <module>
    print(repr(gline), repr(pline), total)
                            ^^^^^
NameError: name 'pline' is not defined. Did you mean: 'gline'?


In [61]:
# import numpy as np
# import pandas as pd
# def viterbi_decode(observations, states, hmm_model, output_file_path):
#     # Initialize the Viterbi table with log probabilities
#     t = hmm_model["transition"]
#     e = hmm_model["emission"]
#     start = "<start>"
#     n_obs = len(observations)
#     n_states = len(states)
#     pi = pd.DataFrame(-np.inf, index=np.arange(n_obs), columns=states)
#     backpointer = pd.DataFrame(None, index=np.arange(n_obs), columns=states)

#     log_default_probability = np.log(1e-4)  # Slightly higher default probability

#     # Initialize the first column of the Viterbi table
#     for s in states:
#         t_s = t.get(f"({start},{s})", 1e-4)
#         e_x1_s = e.get(f"({s},{observations[0]})", 1e-4)
#         pi.at[0, s] = np.log(t_s) + np.log(e_x1_s)

#     # Fill the Viterbi table
#     for j in range(1, n_obs):
#         for s in states:
#             max_log_prob = -np.inf
#             max_prev_st = None
#             for prev_st in states:
#                 t_s_sp = t.get(f"({prev_st},{s})", 1e-4)
#                 e_x_s = e.get(f"({s},{observations[j]})", 1e-4)
#                 log_prob = pi.at[j-1, prev_st] + np.log(t_s_sp) + np.log(e_x_s)
#                 if log_prob > max_log_prob:
#                     max_log_prob = log_prob
#                     max_prev_st = prev_st
#             pi.at[j, s] = max_log_prob if max_log_prob != -np.inf else log_default_probability
#             backpointer.at[j, s] = max_prev_st

#     # Backtrack to find the best path
#     best_path = []
#     last_state = pi.idxmax(axis=1).iloc[-1]
#     for j in range(n_obs - 1, -1, -1):
#         best_path.insert(0, last_state)
#         last_state = backpointer.at[j, last_state]

#     # # Write the best path to the output file
#     with open(output_file_path, 'w') as out_file:
#         for i, (word, state) in enumerate(zip(observations, best_path)):
#             # Check if state is None, this can happen if backpointer is not updated properly
#             if state is '<new_line>':
#                 state = 'UNK'  # UNK for unknown state
#             out_file.write(f"{i}\t{word}\t{state}\n")

#     return best_path

# # Example usage:
# # states_list = list(hmm_model['transitions'].keys())
# # observations = ['The', 'quick', 'brown', 'fox', ...]  # Your list of observations
# # path = viterbi_decode(observations, states_list, hmm_model, 'viterbi.out')

# with open('../../data/outputs/hmm.json') as model_file:
#     hmm_model = json.load(model_file)

# # Extract states and start probabilities if available; otherwise, initialize uniformly
# states = list(set([k.split(',')[0].strip('(') for k in hmm_model['transition'].keys()]))
# start_probabilities = {state: 1/len(states) for state in states}  # Uniform start probabilities


# with open('../../data/vocab-data/dev', 'r') as dev_file:
#     observations = []
#     for line in dev_file:
#         if line.strip():  # If the line is not empty
#             observations.append(line.strip().split('\t')[1])

# # states = ['NNP', 'CD', 'NNS', 'JJ', 'MD', 'VB'] 
# # observations = ['Pierre', 'Vinken' ] 

# # Since start_probabilities are not provided, assume uniform distribution
# output_file_path = '../../data/outputs/viterbi.out'
# predicted_tags = viterbi_decode(observations[:100], states, hmm_model, output_file_path='')


In [62]:
# predicted_tags

In [63]:
# dev_file_path = '../../data/vocab-data/dev'  # Update this to your dev file path
# output_file_path='./viterbi_new.out'

# with open(dev_file_path, 'r') as dev_file, open(output_file_path, 'w') as out_file:
#     prediction_index = 0  # Initialize a separate counter for predictions
#     for line in dev_file:
#         if line.strip():  # If the line is not empty
#             index, word, _ = line.strip().split('\t')
#             print(prediction_index)
#             try:
#                 if predicted_tags[prediction_index] == '<new_line>':
#                     print('enter')
#                     prediction_index += 1
#             except:
#                 print('idk')
#             if prediction_index < len(predicted_tags):  # Check to avoid index out of range
#                 tag = predicted_tags[prediction_index]
#                 out_file.write(f'{index}\t{word}\t{tag}\n')
#                 prediction_index += 1  # Increment only if a prediction was written
#             else:
#                 break
#         else:
#             out_file.write('\n')  # Preserve sentence boundaries without incrementing prediction_index
          

In [64]:
! python ../eval.py -p viterbi_new.out -g ../../data/vocab-data/dev


total: 131768, correct: 18131, accuracy: 13.76%
