In [1]:
# Fetch the Brown corpus and the tag-mapping resource needed for
# tagset='universal'. Per the recorded output, both calls simply report
# "already up-to-date" when the data is present locally.
import nltk
nltk.download("brown")
nltk.download("universal_tagset")

[nltk_data] Downloading package brown to /Users/siraj/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/siraj/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

## Tags in the Brown corpus (universal tagset):
- ADJ – Adjective (e.g., big, blue, fast)
- ADP – Adposition (prepositions and postpositions, e.g., in, on, under)
- ADV – Adverb (e.g., quickly, very, silently)
- CONJ – Coordinating conjunction (e.g., and, but, or)
- DET – Determiner (e.g., the, a, some, this)
- NOUN – Noun (e.g., cat, city, book)
- NUM – Numeral (e.g., one, 100, third)
- PRON – Pronoun (e.g., he, she, it, they)
- PRT – Particle (e.g., up, off, to — the particle in phrasal verbs like "shut up", and the infinitive marker "to")
- VERB – Verb (e.g., run, is, jump, eaten)
- . – Punctuation (e.g., ., !, ?)
- X – Other (miscellaneous, foreign words, typos, etc.)

In [2]:
from nltk.corpus import brown
import numpy as np
import pandas as pd
import math
from collections import defaultdict

from utils import train_test_split, extract_vocab, get_word_tag, preprocess_test, input_preprocess

In [3]:
# Materialise the whole corpus as a list of sentences, each tagged with the
# universal tagset, so it can be split and indexed below.
tagged_sentences = list(brown.tagged_sents(tagset='universal'))

In [4]:
# Hold out 30% of the sentences for evaluation (test_size=0.3).
# NOTE(review): no seed is passed here, so whether the split (and every number
# below) is reproducible depends on utils.train_test_split — confirm.
train_l, test_l = train_test_split(tagged_sentences, test_size=0.3)

print(f"Length of train set: {len(train_l)}")
print(f"length of test set: {len(test_l)}")

Length of train set: 40138
length of test set: 17202


In [5]:
# Build the word-to-id lookup from the training sentences only.
vocab = extract_vocab(train_l)

# Preview the first 21 entries instead of dumping the whole mapping.
for token, token_id in list(vocab.items())[:21]:
    print(f"{token} : {token_id}")


 : 0
! : 1
$.03 : 2
$.07 : 3
$.50 : 4
$1 : 5
$1,000 : 6
$1,000,000 : 7
$1,200 : 8
$1,500 : 9
$1,500,000 : 10
$1.1 : 11
$1.8 : 12
$10 : 13
$10,000 : 14
$10.50 : 15
$100 : 16
$100,000 : 17
$110 : 18
$12,500 : 19
$125 : 20


In [6]:
# Judging by the previews in this notebook, preprocess_test returns the test
# words as one flat sequence (sentence breaks encoded as '--n--') together
# with a parallel list of (word, tag) pairs — see utils for the exact contract.
test_sentences, test_set = preprocess_test(test_l, vocab)

test_sentences[:20]

['Keith',
 'was',
 'an',
 'eagle',
 '.',
 '--n--',
 'Mark',
 'the',
 'specimen',
 'at',
 'the',
 'outer',
 'edges',
 'of',
 'the',
 'template',
 'with',
 'pen',
 'and',
 'indelible']

In [7]:
def create_dictionaries(train_l, vocab):
    """Count emissions, transitions and tag frequencies over the training set.

    Args:
        train_l: iterable of sentences, each an iterable of (word, tag) pairs.
        vocab: vocabulary passed through to ``get_word_tag``.

    Returns:
        Tuple ``(emission_count, transitions_count, tag_count)`` of
        ``defaultdict(int)`` objects keyed by ``(tag, word)``,
        ``(prev_tag, tag)`` and ``tag`` respectively.
    """
    emission_count, transitions_count, tag_count = (
        defaultdict(int) for _ in range(3)
    )

    def _tally(previous_tag, raw_pair):
        # Normalise the pair via the shared helper, bump all three counters,
        # and hand back the tag so the caller can chain transitions.
        token, label = get_word_tag(raw_pair, vocab)
        emission_count[(label, token)] += 1
        transitions_count[(previous_tag, label)] += 1
        tag_count[label] += 1
        return label

    # The chain of previous tags is carried across sentences; it starts at "--s--".
    running_tag = "--s--"
    for sentence in train_l:
        # Each sentence is preceded by a ("\n", "\n") pair; presumably
        # get_word_tag turns it into the boundary token/start tag — confirm in utils.
        for raw_pair in [("\n", "\n"), *sentence]:
            running_tag = _tally(running_tag, raw_pair)

    return emission_count, transitions_count, tag_count

In [8]:
# Gather the raw counts that the transition (A) and emission (B) matrices are built from.
emission_counts, transition_counts, tag_counts = create_dictionaries(train_l, vocab)

In [9]:
# The HMM states are the distinct tags seen in training, in sorted order;
# this ordering defines the row/column indices of A and the rows of B.
states = sorted(tag_counts)
print(f"Number of POS tags (number of 'states'): {len(states)}")
print("View these POS tags (states)", states, sep="\n")

Number of POS tags (number of 'states'): 13
View these POS tags (states)
['--s--', '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


In [10]:
# A few (prev_tag, tag) counts, in insertion order.
print("transition examples: ")
for pair_with_count in list(transition_counts.items())[:3]:
    print(pair_with_count)
print()

# A few (tag, word) counts from further into the dictionary.
print("emission examples: ")
for pair_with_count in list(emission_counts.items())[200:203]:
    print(pair_with_count)
print()

# One surface form can be emitted by several tags.
print("ambiguous word example: ")
for (tag, word), occurrences in emission_counts.items():
    if word == 'back':
        print((tag, word), occurrences)

transition examples: 
(('--s--', '--s--'), 1)
(('--s--', 'DET'), 8566)
(('DET', 'NOUN'), 59870)

emission examples: 
(('NOUN', 'suits'), 12)
(('VERB', 'filed'), 22)
(('ADJ', 'several'), 235)

ambiguous word example: 
('ADV', 'back') 497
('NOUN', 'back') 121
('ADJ', 'back') 20
('VERB', 'back') 17


In [11]:
def create_transition_matrix(transition_counts, alpha, tag_counts):
    """Build the additively smoothed tag-to-tag transition matrix.

    Tags are ordered with ``sorted(tag_counts.keys())``; entry ``[i, j]`` is
    ``(count(tag_i, tag_j) + alpha) / (tag_counts[tag_i] + num_tags * alpha)``.

    Args:
        transition_counts: mapping ``(prev_tag, tag)`` to an observed count.
        alpha: smoothing constant added to every count.
        tag_counts: mapping ``tag`` to the total count of that tag.

    Returns:
        ``numpy.ndarray`` of shape ``(num_tags, num_tags)``.
    """
    ordered_tags = sorted(tag_counts.keys())
    size = len(ordered_tags)
    matrix = np.zeros((size, size))

    for row, source_tag in enumerate(ordered_tags):
        # The denominator depends only on the source tag.
        denominator = tag_counts[source_tag] + size * alpha
        for col, target_tag in enumerate(ordered_tags):
            # .get avoids inserting unseen pairs when a defaultdict is passed in.
            observed = transition_counts.get((source_tag, target_tag), 0)
            matrix[row, col] = (observed + alpha) / denominator

    return matrix

In [12]:
# Smoothing constant added to every transition count before normalising.
alpha = 0.001
A = create_transition_matrix(transition_counts, alpha, tag_counts)

print(f"A at row 0, col 0: {A[0,0]:.9f}")
print(f"A at row 3, col 1: {A[3,1]:.4f}")

print("View a subset of transition matrix A")
# The 10:15 slices are clipped to the available states, so with the 13 states
# reported above only the last three tags appear in the table.
A_sub = pd.DataFrame(A[10:15,10:15], index=states[10:15], columns = states[10:15] )
print(A_sub)

A at row 0, col 0: 0.000024939
A at row 3, col 1: 0.0099
View a subset of transition matrix A
           PRT      VERB         X
PRT   0.011875  0.621546  0.000096
VERB  0.065942  0.183862  0.000189
X     0.008575  0.057878  0.505889


In [13]:
def create_emission_matrix(emission_counts, alpha, tag_counts, vocab: list):
    """Build the additively smoothed tag-to-word emission matrix.

    Rows follow ``sorted(tag_counts.keys())`` and columns follow the order of
    ``vocab``; entry ``[i, j]`` is
    ``(count(tag_i, word_j) + alpha) / (tag_counts[tag_i] + alpha * len(vocab))``.

    Args:
        emission_counts: mapping ``(tag, word)`` to an observed count.
        alpha: smoothing constant added to every count.
        tag_counts: mapping ``tag`` to the total count of that tag.
        vocab: list of words; position in the list is the column index.

    Returns:
        ``numpy.ndarray`` of shape ``(num_tags, len(vocab))``.
    """
    ordered_tags = sorted(tag_counts.keys())
    vocab_size = len(vocab)
    matrix = np.zeros((len(ordered_tags), vocab_size))

    for row, tag in enumerate(ordered_tags):
        # Same denominator for every word emitted by this tag.
        denominator = tag_counts[tag] + alpha * vocab_size
        for col, word in enumerate(vocab):
            # Unseen (tag, word) pairs contribute only the smoothing mass.
            observed = emission_counts.get((tag, word), 0)
            matrix[row, col] = (observed + alpha) / denominator

    return matrix

In [14]:
# Same smoothing constant as used for the transition matrix.
alpha = 0.001
# list(vocab) passes the words in the dict's iteration order; the column lookups
# below (cols = vocab[...]) assume that order matches the stored ids — consistent
# with the vocab preview printed earlier, but TODO confirm in utils.extract_vocab.
B = create_emission_matrix(emission_counts, alpha, tag_counts, list(vocab))

print(f"View Matrix position at row 0, column 0: {B[0,0]:.9f}")
print(f"View Matrix position at row 3, column 1: {B[3,1]:.9f}")

# Pick a handful of words to display as the columns of a small sample table
cidx  = ['$100','Available','comrade', 'relaxing', 'Dryfoos']

# Translate each sample word into its integer id (column index of B)
cols = [vocab[a] for a in cidx]

# Pick the POS tags to display as the rows of the sample table
rvals =['ADJ','DET','PRON', 'VERB','X','PRT']

# Translate each tag into its row index via its position in the 'states' list
rows = [states.index(a) for a in rvals]

# np.ix_ selects the cross product of the chosen rows and columns from B
B_sub = pd.DataFrame(B[np.ix_(rows,cols)], index=rvals, columns = cidx )
print(B_sub)

View Matrix position at row 0, column 0: 0.000000025
View Matrix position at row 3, column 1: 0.000000010
              $100     Available       comrade      relaxing       Dryfoos
ADJ   1.702329e-08  6.811019e-05  1.702329e-08  1.702329e-08  1.702329e-08
DET   1.044582e-08  1.044582e-08  1.044582e-08  1.044582e-08  1.044582e-08
PRON  2.927225e-08  2.927225e-08  2.927225e-08  2.927225e-08  2.927225e-08
VERB  7.860829e-09  7.860829e-09  7.860829e-09  2.359035e-05  7.860829e-09
X     1.043790e-06  1.043790e-06  1.043790e-06  1.043790e-06  1.043790e-06
PRT   4.782390e-08  4.782390e-08  4.782390e-08  4.782390e-08  4.782390e-08


In [15]:
def viterbi_initialize(A, B, vocab, tag_counts, states, corpus):
    """Allocate the Viterbi tables and fill in the first column.

    Column 0 of ``best_probs`` holds, for every tag, the log-probability of
    moving from the start state ``"--s--"`` to that tag and emitting the first
    word. A zero transition *or* zero emission probability is stored as
    ``-inf`` instead of being passed to ``math.log`` (which would raise).

    Args:
        A: transition matrix indexed ``[prev_tag, tag]``.
        B: emission matrix indexed ``[tag, word_id]``.
        vocab: mapping from word to its column index in ``B``.
        tag_counts: mapping whose number of keys gives the number of tags.
        states: list of tags; must contain ``"--s--"``.
        corpus: sequence of (already preprocessed) words.

    Returns:
        ``(best_probs, best_paths)``: a float array and an int array, both of
        shape ``(num_tags, len(corpus))``. Everything except column 0 of
        ``best_probs`` is zero. For an empty corpus both arrays have zero
        columns and are returned untouched.

    Raises:
        ValueError: if ``"--s--"`` is not in ``states``.
        KeyError: if the first word is not in ``vocab``.
    """
    num_tags = len(tag_counts)
    num_words = len(corpus)

    best_probs = np.zeros((num_tags, num_words))
    best_paths = np.zeros((num_tags, num_words), dtype=int)

    # Nothing to initialise: avoid indexing corpus[0] / column 0.
    if num_words == 0:
        return best_probs, best_paths

    start_row = states.index("--s--")
    first_word_col = vocab[corpus[0]]

    for tag in range(num_tags):
        transition = A[start_row, tag]
        emission = B[tag, first_word_col]

        # log(0) is undefined for math.log; an impossible start is -inf.
        if transition == 0 or emission == 0:
            best_probs[tag, 0] = float("-inf")
        else:
            best_probs[tag, 0] = math.log(transition) + math.log(emission)

    return best_probs, best_paths


In [16]:
# Allocate the Viterbi tables for the whole flattened test sequence and fill in column 0.
best_probs, best_paths = viterbi_initialize(A, B, vocab, tag_counts, states, test_sentences)

In [17]:
# Spot-check the initial tables.
print(f"best_probs[0,0]: {best_probs[0,0]:.4f}")
# NOTE(review): best_paths is allocated with dtype=int, so the .4f format below
# renders an integer index as 0.0000; a plain {best_paths[2,3]} would read better.
print(f"best_paths[2,3]: {best_paths[2,3]:.4f}")

best_probs[0,0]: -28.1075
best_paths[2,3]: 0.0000


In [18]:
def viterbi_forward(best_probs, best_paths, A, B, vocab, corpus):
    """Run the forward pass of Viterbi, filling columns 1..len(corpus)-1.

    For every position ``i`` and tag ``j`` this stores

        best_probs[j, i] = max_k best_probs[k, i-1] + log A[k, j] + log B[j, w_i]
        best_paths[j, i] = the k achieving that maximum (lowest k on ties)

    Zero probabilities are treated as log-probability ``-inf`` rather than
    raising; if every predecessor is impossible the back-pointer is 0 and the
    score is ``-inf``.

    Args:
        best_probs: float array ``(num_tags, len(corpus))`` with column 0 set.
        best_paths: int array of the same shape.
        A: transition matrix (NumPy array) indexed ``[prev_tag, tag]``.
        B: emission matrix (NumPy array) indexed ``[tag, word_id]``.
        vocab: mapping from word to its column index in ``B``.
        corpus: sequence of (already preprocessed) words.

    Returns:
        The same ``best_probs`` and ``best_paths`` objects, updated in place.

    Raises:
        KeyError: if a word of ``corpus`` is not in ``vocab``.
    """
    # log(A) does not depend on the position, so compute it once. log(0) -> -inf
    # is the intended encoding of an impossible transition; silence the warning.
    with np.errstate(divide="ignore"):
        log_A = np.log(np.asarray(A, dtype=float))

    for i in range(1, len(corpus)):
        word_col = vocab[corpus[i]]

        # The emission term depends only on the target tag, not on the predecessor.
        with np.errstate(divide="ignore"):
            log_emission = np.log(np.asarray(B[:, word_col], dtype=float))

        # scores[k, j]: best path ending in k at i-1, then k -> j, then j emits w_i.
        # Operand order matches the scalar formula (prev + transition) + emission.
        scores = best_probs[:, i - 1, np.newaxis] + log_A + log_emission[np.newaxis, :]

        # argmax returns the first maximal k, matching a strict ">" scan over k.
        best_paths[:, i] = np.argmax(scores, axis=0)
        best_probs[:, i] = np.max(scores, axis=0)

    return best_probs, best_paths

In [19]:
# Fill the remaining columns of both tables; the arrays are written to by
# viterbi_forward and handed back, so the names are simply rebound.
best_probs, best_paths = viterbi_forward(best_probs, best_paths, A, B, vocab, test_sentences)

In [20]:
# Spot-check a few cells: best_probs holds log-probabilities (hence negative),
# best_paths holds the row index of the best previous state.
print(f"best_probs[0, 1]: {best_probs[0,1]:.4f}")
print(f"best_paths[0,1]: {best_paths[0,1]}")
print("--------")
print(f"best_probs[2, 1]: {best_probs[2,1]:.4f}")
print(f"best_paths[2, 1]: {best_paths[2,1]}")
print("--------")
print(f"best_probs[9, 20]: {best_probs[9,20]:.4f}")
print(f"best_paths[9, 20]: {best_paths[9,20]}")

best_probs[0, 1]: -34.5624
best_paths[0,1]: 7
--------
best_probs[2, 1]: -33.5933
best_paths[2, 1]: 7
--------
best_probs[9, 20]: -157.5209
best_paths[9, 20]: 2


In [21]:
def viterbi_backward(best_probs, best_paths, states):
    """Recover the most likely tag sequence from the filled Viterbi tables.

    Starts at the highest-scoring state in the last column of ``best_probs``
    and follows the back-pointers in ``best_paths`` towards column 0.

    Args:
        best_probs: float array ``(num_tags, num_words)`` of path log-probabilities.
        best_paths: int array of the same shape; ``best_paths[j, i]`` is the
            row index of the best predecessor of state ``j`` at position ``i``.
        states: list mapping a row index to its tag.

    Returns:
        List of ``num_words`` tags, one per position. An empty list when the
        tables have zero columns.
    """
    num_words = best_probs.shape[1]

    # np.argmax rejects an empty column, so an empty sequence has an empty tagging.
    if num_words == 0:
        return []

    pred = [None] * num_words

    # Best final state; only the current index is needed, not a full index array.
    state_idx = int(np.argmax(best_probs[:, num_words - 1]))
    pred[num_words - 1] = states[state_idx]

    # The back-pointer stored at column i names the state chosen at column i-1.
    for i in range(num_words - 1, 0, -1):
        state_idx = int(best_paths[state_idx, i])
        pred[i - 1] = states[state_idx]

    return pred



In [22]:
# Follow the back-pointers from the best final state to get one tag per test token.
pred = viterbi_backward(best_probs, best_paths, states)

In [23]:
# Pair every test token with its predicted tag, mirroring the (word, tag) layout of test_set.
predictions = list(zip(test_sentences, pred))

In [24]:
def calculate_accuracy(test_set, pred_set):
    """Return the fraction of positions whose predicted tag matches the gold tag.

    Args:
        test_set: sequence of gold ``(word, tag)`` pairs.
        pred_set: sequence of predicted ``(word, tag)`` pairs, aligned with
            ``test_set``. Only the tag (index 1) of each pair is compared.

    Returns:
        ``matches / len(test_set)`` as a float; ``0.0`` when ``test_set`` is
        empty (instead of raising ``ZeroDivisionError``). Pairs are matched up
        with ``zip``, so gold items beyond the end of a shorter ``pred_set``
        are never counted as correct.
    """
    total = len(test_set)
    if total == 0:
        return 0.0

    matches = sum(
        1 for gold, guess in zip(test_set, pred_set) if gold[1] == guess[1]
    )
    return matches / total

In [25]:
# Predicted (word, tag) pairs for the first 30 test tokens.
predictions[:30]

[('Keith', 'NOUN'),
 ('was', 'VERB'),
 ('an', 'DET'),
 ('eagle', 'NOUN'),
 ('.', '.'),
 ('--n--', '--s--'),
 ('Mark', 'NOUN'),
 ('the', 'DET'),
 ('specimen', 'NOUN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('outer', 'ADJ'),
 ('edges', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('template', 'NOUN'),
 ('with', 'ADP'),
 ('pen', 'NOUN'),
 ('and', 'CONJ'),
 ('indelible', 'ADJ'),
 ('ink', 'NOUN'),
 (';', '.'),
 (';', '.'),
 ('--n--', '--s--'),
 ('The', 'DET'),
 ('--unk_upper--', 'NOUN'),
 ('is', 'VERB'),
 ('purely', 'ADV'),
 ('a', 'DET'),
 ('--unk_noun--', 'NOUN')]

In [26]:
# Gold (word, tag) pairs for the first 30 test tokens, for comparison with the predictions.
test_set[:30]

[('Keith', 'NOUN'),
 ('was', 'VERB'),
 ('an', 'DET'),
 ('eagle', 'NOUN'),
 ('.', '.'),
 ('--n--', '--s--'),
 ('Mark', 'VERB'),
 ('the', 'DET'),
 ('specimen', 'NOUN'),
 ('at', 'ADP'),
 ('the', 'DET'),
 ('outer', 'ADJ'),
 ('edges', 'NOUN'),
 ('of', 'ADP'),
 ('the', 'DET'),
 ('template', 'NOUN'),
 ('with', 'ADP'),
 ('pen', 'NOUN'),
 ('and', 'CONJ'),
 ('indelible', 'ADJ'),
 ('ink', 'NOUN'),
 (';', '.'),
 (';', '.'),
 ('--n--', '--s--'),
 ('The', 'DET'),
 ('Creston', 'NOUN'),
 ('is', 'VERB'),
 ('purely', 'ADV'),
 ('a', 'DET'),
 ('potboiler', 'NOUN')]

In [27]:
# Token-level accuracy over the flattened test sequence, printed as a percentage.
# NOTE(review): the variable name is a misspelling of "accuracy". Also, the
# previews of both lists include the '--n--' boundary entries, so those
# positions count towards this score — confirm that is intended.
accuaracy = calculate_accuracy(test_set, predictions)

print(f"Accuracy: {accuaracy*100}")

Accuracy: 96.29325375992515


In [28]:
def tag_POS(A, B, input: str, vocab, tag_counts, states):
    """Tag a raw text string end to end with the Viterbi decoder.

    The text is tokenised by ``input_preprocess``, decoded with
    ``viterbi_initialize`` / ``viterbi_forward`` / ``viterbi_backward``, and
    each token is paired with its predicted tag.

    Args:
        A: transition matrix.
        B: emission matrix.
        input: raw text to tag (the name shadows the builtin inside this function).
        vocab: vocabulary used for preprocessing and emission lookups.
        tag_counts: tag frequency mapping (used for the number of tags).
        states: list of tags in matrix row order.

    Returns:
        List of ``(token, tag)`` tuples in token order.
    """
    tokens = input_preprocess(input, vocab)

    probs, back_pointers = viterbi_initialize(A, B, vocab, tag_counts, states, tokens)
    probs, back_pointers = viterbi_forward(probs, back_pointers, A, B, vocab=vocab, corpus=tokens)
    tags = viterbi_backward(probs, back_pointers, states)

    return list(zip(tokens, tags))


In [29]:
def tag_words(input: str):
    """Tag ``input`` with the notebook-level model (A, B, vocab, tag_counts, states)."""
    return tag_POS(
        A=A,
        B=B,
        input=input,
        vocab=vocab,
        tag_counts=tag_counts,
        states=states,
    )

In [34]:
# Free-text sample for the tagger. It is bound to `sample_text` rather than
# `input` so the built-in input() is not shadowed for the rest of the session.
sample_text = """hi gpt, my name is sathvik.
I resigned to einstein. is it a good idea?"""

preds = tag_words(sample_text)

In [35]:
# Show the tagged sample; as the output shows, some words come back from
# preprocessing as '--unk...' placeholders (presumably out-of-vocabulary words).
preds

[('--unk_adj--', 'ADJ'),
 ('--unk--', 'NOUN'),
 (',', '.'),
 ('my', 'DET'),
 ('name', 'NOUN'),
 ('is', 'VERB'),
 ('--unk--', 'NOUN'),
 ('.', '.'),
 ('I', 'PRON'),
 ('resigned', 'VERB'),
 ('to', 'PRT'),
 ('--unk--', 'VERB'),
 ('.', '.'),
 ('is', 'VERB'),
 ('it', 'PRON'),
 ('a', 'DET'),
 ('good', 'ADJ'),
 ('idea', 'NOUN'),
 ('?', '.')]